test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

   1 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
   2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
   3
   4 ; SI-LABEL: {{^}}load_i8_to_f32:
   5 ; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]],
   6 ; SI-NOT: bfe
   7 ; SI-NOT: lshr
   8 ; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
   9 ; SI: buffer_store_dword [[CONV]],
     ; Scalar case: one i8 is loaded and converted to float with uitofp.
     ; The checks above expect an unsigned-byte load whose result register
     ; feeds v_cvt_f32_ubyte0 directly, with no bfe or lshr between the load
     ; and the conversion, and the converted register written by a dword store.
  10 define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
  11   %load = load i8, i8 addrspace(1)* %in, align 1
  12   %cvt = uitofp i8 %load to float
  13   store float %cvt, float addrspace(1)* %out, align 4
  14   ret void
  15 }
  16
  17 ; SI-LABEL: {{^}}load_v2i8_to_v2f32:
  18 ; SI: buffer_load_ushort [[LOADREG:v[0-9]+]],
  19 ; SI-NOT: bfe
  20 ; SI-NOT: lshr
  21 ; SI-NOT: and
  22 ; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
  23 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
  24 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
     ; <2 x i8> case: the checks above expect one ushort load, both bytes
     ; converted straight from the loaded register by the ubyte0 and ubyte1
     ; forms (matched in either order), no bfe, lshr or and between the load
     ; and the conversions, and one dwordx2 store of the register pair
     ; LORESULT:HIRESULT produced by those two conversions.
  25 define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
  26   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
  27   %cvt = uitofp <2 x i8> %load to <2 x float>
  28   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
  29   ret void
  30 }
  31
  32 ; SI-LABEL: {{^}}load_v3i8_to_v3f32:
  33 ; SI-NOT: bfe
  34 ; SI-NOT: v_cvt_f32_ubyte3_e32
  35 ; SI-DAG: v_cvt_f32_ubyte2_e32
  36 ; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]],
  37 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
  38 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
     ; <3 x i8> case: three byte conversions (ubyte0, ubyte1, ubyte2, matched
     ; in any order) are expected, and neither bfe nor the ubyte3 form may
     ; appear between the label and those conversions.
     ; LORESULT and HIRESULT are captured from this function's own ubyte0 and
     ; ubyte1 conversions, so the dwordx2 store check is tied to the registers
     ; written here rather than to captures left over from an earlier test.
  39 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
  40   %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
  41   %cvt = uitofp <3 x i8> %load to <3 x float>
  42   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
  43   ret void
  44 }
  45
  46 ; SI-LABEL: {{^}}load_v4i8_to_v4f32:
  47 ; SI: buffer_load_dword [[LOADREG:v[0-9]+]]
  48 ; SI-NOT: bfe
  49 ; SI-NOT: lshr
  50 ; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
  51 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
  52 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
  53 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
  54 ; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
     ; <4 x i8> case with a 4-byte aligned load: the checks above expect a
     ; single dword load, the four ubyte0..ubyte3 conversions all reading the
     ; loaded register (matched in any order), no bfe or lshr between the load
     ; and the conversions, and one dwordx4 store whose register range runs
     ; from the ubyte0 result (LORESULT) to the ubyte3 result (HIRESULT).
  55 define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
  56   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
  57   %cvt = uitofp <4 x i8> %load to <4 x float>
  58   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  59   ret void
  60 }
  61
  62 ; This should not be adding instructions to shift into the correct
  63 ; position in the word for the component.
     ;
     ; The <4 x i8> load below is only byte aligned (align 1). The checks
     ; expect four separate ubyte loads, no v_lshlrev_b32 or v_or_b32 between
     ; the loads and the conversions, each loaded byte converted with the
     ; ubyte0 form, and a dwordx4 store spanning LORESULT..HIRESULT.
  64
  65 ; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
  66 ; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
  67 ; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
  68 ; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
  69 ; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
  70 ; SI-NOT: v_lshlrev_b32
  71 ; SI-NOT: v_or_b32
  72
  73 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
  74 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
  75 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
  76 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
  77
  78 ; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
  79 define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
  80   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
  81   %cvt = uitofp <4 x i8> %load to <4 x float>
  82   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  83   ret void
  84 }
  85
  86 ; XXX - This should really still be able to use the v_cvt_f32_ubyte0
  87 ; for each component, but computeKnownBits doesn't handle vectors very
  88 ; well.
     ;
     ; Here the loaded <4 x i8> has two uses: the uitofp stored to %out and an
     ; add of 9 per lane stored to %out2. The active checks expect four ubyte
     ; loads followed, in order, by four ubyte0 conversions and s_endpgm.
     ; The XSI lines are disabled expectations: their prefix is not the one
     ; passed to FileCheck on the command lines at the top of the file.
  89
  90 ; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
  91 ; SI: buffer_load_ubyte
  92 ; SI: buffer_load_ubyte
  93 ; SI: buffer_load_ubyte
  94 ; SI: buffer_load_ubyte
  95 ; SI: v_cvt_f32_ubyte0_e32
  96 ; SI: v_cvt_f32_ubyte0_e32
  97 ; SI: v_cvt_f32_ubyte0_e32
  98 ; SI: v_cvt_f32_ubyte0_e32
  99
 100 ; XXX - replace with this when v4i8 loads aren't scalarized anymore.
 101 ; XSI: buffer_load_dword
 102 ; XSI: v_cvt_f32_u32_e32
 103 ; XSI: v_cvt_f32_u32_e32
 104 ; XSI: v_cvt_f32_u32_e32
 105 ; XSI: v_cvt_f32_u32_e32
 106 ; SI: s_endpgm
 107 define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
 108   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
 109   %cvt = uitofp <4 x i8> %load to <4 x float>
 110   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
 111   %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
 112   store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
 113   ret void
 114 }
 115
 116 ; Make sure this doesn't crash.
     ; Only the function label and a following s_endpgm are checked; no
     ; particular load, conversion or store instructions are expected for the
     ; byte-aligned <7 x i8> to <7 x float> conversion below.
 117 ; SI-LABEL: {{^}}load_v7i8_to_v7f32:
 118 ; SI: s_endpgm
 119 define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
 120   %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
 121   %cvt = uitofp <7 x i8> %load to <7 x float>
 122   store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
 123   ret void
 124 }
 125
 126 ; SI-LABEL: {{^}}load_v8i8_to_v8f32:
 127 ; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
 128 ; SI-NOT: bfe
 129 ; SI-NOT: lshr
 130 ; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
 131 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
 132 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
 133 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
 134 ; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
 135 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
 136 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
 137 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
 138 ; SI-NOT: bfe
 139 ; SI-NOT: lshr
 140 ; SI: buffer_store_dword
 141 ; SI: buffer_store_dword
 142 ; SI: buffer_store_dword
 143 ; SI: buffer_store_dword
 144 ; SI: buffer_store_dword
 145 ; SI: buffer_store_dword
 146 ; SI: buffer_store_dword
 147 ; SI: buffer_store_dword
     ; <8 x i8> case: the checks above expect one dwordx2 load, then the four
     ; ubyte0..ubyte3 conversions applied to each of the two loaded registers
     ; (eight conversions, matched in any order), with no bfe or lshr either
     ; before or after the conversions, followed by eight successive matches
     ; of the buffer_store_dword pattern.
 148 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
 149   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
 150   %cvt = uitofp <8 x i8> %load to <8 x float>
 151   store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
 152   ret void
 153 }
 154
 155 ; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
 156 ; SI: buffer_load_dword [[LOADREG:v[0-9]+]],
 157 ; SI: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
 158 ; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
 159 ; SI: buffer_store_dword [[CONV]],
     ; An i32 is loaded, incremented by 2, masked to its low byte and then
     ; converted with uitofp. The checks above expect the add to be followed
     ; on the very next line by a ubyte0 conversion of the add result, i.e.
     ; without a separate masking instruction in between, and the converted
     ; register written by a dword store.
 160 define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
 161   %load = load i32, i32 addrspace(1)* %in, align 4
 162   %add = add i32 %load, 2
 163   %inreg = and i32 %add, 255 ; keep only bits 0-7 of the sum
 164   %cvt = uitofp i32 %inreg to float
 165   store float %cvt, float addrspace(1)* %out, align 4
 166   ret void
 167 }
 168
 169 ; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
     ; Converts the second byte (bits 8-15) of a loaded i32 to float: the
     ; value is masked with 65280 (0xFF00), shifted right by 8 and passed to
     ; uitofp. Only the function label is checked here.
     ; NOTE(review): no instruction checks follow the label; presumably a
     ; byte-1 conversion is the intended output - confirm before adding one.
 170 define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
 171   %load = load i32, i32 addrspace(1)* %in, align 4
 172   %inreg = and i32 %load, 65280 ; 0xFF00: keep bits 8-15
 173   %shr = lshr i32 %inreg, 8 ; move that byte down to bits 0-7
 174   %cvt = uitofp i32 %shr to float
 175   store float %cvt, float addrspace(1)* %out, align 4
 176   ret void
 177 }
 178
 179
 180 ; We don't get these ones because of the zext, but instcombine removes
 181 ; them so it shouldn't really matter.
     ;
     ; Both functions below widen the loaded i8 data to i32 with an explicit
     ; zext before the uitofp. No particular conversion instruction is
     ; expected; each function is anchored with its own label and is only
     ; checked to compile through to the end of the program, so that checks
     ; from neighbouring tests cannot match across these functions.
     ; SI-LABEL: {{^}}i8_zext_i32_to_f32:
     ; SI: s_endpgm
 182 define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
 183   %load = load i8, i8 addrspace(1)* %in, align 1
 184   %ext = zext i8 %load to i32
 185   %cvt = uitofp i32 %ext to float
 186   store float %cvt, float addrspace(1)* %out, align 4
 187   ret void
 188 }
 189
     ; SI-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
     ; SI: s_endpgm
 190 define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
 191   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
 192   %ext = zext <4 x i8> %load to <4 x i32>
 193   %cvt = uitofp <4 x i32> %ext to <4 x float>
 194   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
 195   ret void
 196 }