R600/SI: Try to use v_madak_f32

[oota-llvm.git] / test / CodeGen / R600 / cvt_f32_ubyte.ll
diff --git a/test/CodeGen/R600/cvt_f32_ubyte.ll b/test/CodeGen/R600/cvt_f32_ubyte.ll

index 90d09d6a807518eb762a18712d56908e6c5d1887..4d4bf934d0d8fa454fce6b45fc935fd78f847180 100644 (file)
--- a/test/CodeGen/R600/cvt_f32_ubyte.ll
+++ b/test/CodeGen/R600/cvt_f32_ubyte.ll
@@ -1,4 +1,5 @@
  ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
  
  ; SI-LABEL: {{^}}load_i8_to_f32:
  ; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]],
@@ -22,7 +23,7 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n
  ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
  ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
  define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <2 x i8> addrspace(1)* %in, align 1
+  %load = load <2 x i8> addrspace(1)* %in, align 2
    %cvt = uitofp <2 x i8> %load to <2 x float>
    store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
    ret void
@@ -36,18 +37,14 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
  ; SI-DAG: v_cvt_f32_ubyte0_e32
  ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
  define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <3 x i8> addrspace(1)* %in, align 1
+  %load = load <3 x i8> addrspace(1)* %in, align 4
    %cvt = uitofp <3 x i8> %load to <3 x float>
    store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
    ret void
  }
  
  ; SI-LABEL: {{^}}load_v4i8_to_v4f32:
-; We can't use buffer_load_dword here, because the load is byte aligned, and
-; buffer_load_dword requires dword alignment.
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: v_or_b32_e32 [[LOADREG:v[0-9]+]]
+; SI: buffer_load_dword [[LOADREG:v[0-9]+]]
  ; SI-NOT: bfe
  ; SI-NOT: lshr
  ; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
@@ -56,6 +53,30 @@ define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8>
  ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
  ; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
  define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+  %load = load <4 x i8> addrspace(1)* %in, align 4
+  %cvt = uitofp <4 x i8> %load to <4 x float>
+  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; This should not be adding instructions to shift into the correct
+; position in the word for the component.
+
+; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
+; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
+; SI-NOT: v_lshlrev_b32
+; SI-NOT: v_or_b32
+
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
+
+; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
    %load = load <4 x i8> addrspace(1)* %in, align 1
    %cvt = uitofp <4 x i8> %load to <4 x float>
    store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -125,7 +146,7 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
  ; SI: buffer_store_dword
  ; SI: buffer_store_dword
  define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <8 x i8> addrspace(1)* %in, align 1
+  %load = load <8 x i8> addrspace(1)* %in, align 8
    %cvt = uitofp <8 x i8> %load to <8 x float>
    store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
    ret void