1 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
4 declare i32 @llvm.r600.read.tidig.x() #0
5 declare double @llvm.fabs.f64(double) #0
6 declare double @llvm.fma.f64(double, double, double) #0
7 declare float @llvm.fma.f32(float, float, float) #0
9 ; (fadd (fmul x, y), z) -> (fma x, y, z)
10 ; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
11 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
12 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
13 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
14 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
15 ; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: reads three consecutive doubles starting at %in[%tid] and stores one
; result to %out[%tid]; %tid is the value returned by llvm.r600.read.tidig.x.
16 define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
17 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
18 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
19 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
20 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
21 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
23 %a = load double, double addrspace(1)* %gep.0
24 %b = load double, double addrspace(1)* %gep.1
25 %c = load double, double addrspace(1)* %gep.2
; a * b + c written as a separate fmul and fadd, with the product as the left
; fadd operand; the checks above expect a single v_fma_f64 in its place.
27 %mul = fmul double %a, %b
28 %fma = fadd double %mul, %c
29 store double %fma, double addrspace(1)* %gep.out
33 ; (fadd (fmul x, y), z) -> (fma x, y, z)
34 ; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
35 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
36 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
37 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
38 ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
39 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
40 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
41 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
42 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; Kernel: reads four consecutive doubles starting at %in[%tid] and stores two
; results to %out[%tid] and %out[%tid + 1].
44 define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
45 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
46 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
47 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
48 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
49 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
50 %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
51 %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
53 %a = load double, double addrspace(1)* %gep.0
54 %b = load double, double addrspace(1)* %gep.1
55 %c = load double, double addrspace(1)* %gep.2
56 %d = load double, double addrspace(1)* %gep.3
; The single product %mul has two fadd users (a * b + c and a * b + d); the
; checks above expect one v_fma_f64 per user, both taking A and B directly.
58 %mul = fmul double %a, %b
59 %fma0 = fadd double %mul, %c
60 %fma1 = fadd double %mul, %d
61 store double %fma0, double addrspace(1)* %gep.out.0
62 store double %fma1, double addrspace(1)* %gep.out.1
66 ; (fadd x, (fmul y, z)) -> (fma y, z, x)
67 ; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
68 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
69 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
70 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
71 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
72 ; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: reads three consecutive doubles starting at %in[%tid] and stores one
; result to %out[%tid].
73 define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
74 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
75 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
76 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
77 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
78 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
80 %a = load double, double addrspace(1)* %gep.0
81 %b = load double, double addrspace(1)* %gep.1
82 %c = load double, double addrspace(1)* %gep.2
; c + a * b: here the product is the right-hand fadd operand; the checks above
; expect the same v_fma_f64 A, B, C form as when it is on the left.
84 %mul = fmul double %a, %b
85 %fma = fadd double %c, %mul
86 store double %fma, double addrspace(1)* %gep.out
90 ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
91 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
92 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
93 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
94 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
95 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
96 ; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: reads three consecutive doubles starting at %in[%tid] and stores one
; result to %out[%tid].
97 define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
98 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
99 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
100 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
101 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
102 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
104 %a = load double, double addrspace(1)* %gep.0
105 %b = load double, double addrspace(1)* %gep.1
106 %c = load double, double addrspace(1)* %gep.2
; a * b - c: the product is the minuend, so the checks above expect the
; subtraction to appear as a negated third operand (-C) of v_fma_f64.
108 %mul = fmul double %a, %b
109 %fma = fsub double %mul, %c
110 store double %fma, double addrspace(1)* %gep.out
114 ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
115 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
116 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
117 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
118 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
119 ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
120 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
121 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
122 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
123 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; Kernel: reads four consecutive doubles starting at %in[%tid] and stores two
; results to %out[%tid] and %out[%tid + 1].
125 define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
126 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
127 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
128 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
129 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
130 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
131 %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
132 %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
134 %a = load double, double addrspace(1)* %gep.0
135 %b = load double, double addrspace(1)* %gep.1
136 %c = load double, double addrspace(1)* %gep.2
137 %d = load double, double addrspace(1)* %gep.3
; The single product %mul is the minuend of two fsubs (a * b - c, a * b - d);
; the checks above expect one v_fma_f64 per fsub with a negated third operand.
139 %mul = fmul double %a, %b
140 %fma0 = fsub double %mul, %c
141 %fma1 = fsub double %mul, %d
142 store double %fma0, double addrspace(1)* %gep.out.0
143 store double %fma1, double addrspace(1)* %gep.out.1
147 ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
148 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
149 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
150 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
151 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
152 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
153 ; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: reads three consecutive doubles starting at %in[%tid] and stores one
; result to %out[%tid].
154 define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
155 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
156 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
157 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
158 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
159 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
161 %a = load double, double addrspace(1)* %gep.0
162 %b = load double, double addrspace(1)* %gep.1
163 %c = load double, double addrspace(1)* %gep.2
; c - a * b: the product is the subtrahend, so the checks above expect the
; negation on the first multiplicand (-A) rather than on the addend.
165 %mul = fmul double %a, %b
166 %fma = fsub double %c, %mul
167 store double %fma, double addrspace(1)* %gep.out
171 ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
172 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
173 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
174 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
175 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
176 ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
177 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
178 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
179 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
180 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; Kernel: reads four consecutive doubles starting at %in[%tid] and stores two
; results to %out[%tid] and %out[%tid + 1].
182 define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
183 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
184 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
185 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
186 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
187 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
188 %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
189 %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
191 %a = load double, double addrspace(1)* %gep.0
192 %b = load double, double addrspace(1)* %gep.1
193 %c = load double, double addrspace(1)* %gep.2
194 %d = load double, double addrspace(1)* %gep.3
; The single product %mul is the subtrahend of two fsubs (c - a * b and
; d - a * b); the checks above expect one v_fma_f64 per fsub, each using -A.
196 %mul = fmul double %a, %b
197 %fma0 = fsub double %c, %mul
198 %fma1 = fsub double %d, %mul
199 store double %fma0, double addrspace(1)* %gep.out.0
200 store double %fma1, double addrspace(1)* %gep.out.1
204 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
205 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
206 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
207 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
208 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
209 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
210 ; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: reads three consecutive doubles starting at %in[%tid] and stores one
; result to %out[%tid].
211 define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
212 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
213 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
214 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
215 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
216 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
218 %a = load double, double addrspace(1)* %gep.0
219 %b = load double, double addrspace(1)* %gep.1
220 %c = load double, double addrspace(1)* %gep.2
; -(a * b) - c: the negation of the product is spelled as (fsub -0.0, %mul).
; The checks above expect both negations to end up as v_fma_f64 operand
; modifiers (-A and -C).
222 %mul = fmul double %a, %b
223 %mul.neg = fsub double -0.0, %mul
224 %fma = fsub double %mul.neg, %c
226 store double %fma, double addrspace(1)* %gep.out
230 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; Variant where the negated product has two fsub users (z = c and z = d).
231 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
232 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
233 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
234 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; D has to be bound by this test's own load of %gep.3: FileCheck variables are
; not reset at a label unless --enable-var-scope is used, so an unbound D would
; silently reuse whatever register pair an earlier function matched.
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
235 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
236 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
237 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
238 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; Kernel: reads four consecutive doubles starting at %in[%tid] and stores two
; results to %out[%tid] and %out[%tid + 1].
240 define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
241 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
242 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
243 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
244 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
245 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
246 %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
247 %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
249 %a = load double, double addrspace(1)* %gep.0
250 %b = load double, double addrspace(1)* %gep.1
251 %c = load double, double addrspace(1)* %gep.2
252 %d = load double, double addrspace(1)* %gep.3
; -(a * b) - c and -(a * b) - d: the negated product %mul.neg (spelled as
; fsub -0.0, %mul) is the value with two users here; %mul itself has one.
254 %mul = fmul double %a, %b
255 %mul.neg = fsub double -0.0, %mul
256 %fma0 = fsub double %mul.neg, %c
257 %fma1 = fsub double %mul.neg, %d
259 store double %fma0, double addrspace(1)* %gep.out.0
260 store double %fma1, double addrspace(1)* %gep.out.1
264 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; Variant where the product itself has a second user: it also feeds a plain
; (fsub (fmul x, y), z), which is expected to become (fma x, y, (fneg z)).
265 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
266 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
267 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
268 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; D has to be bound by this test's own load of %gep.3: FileCheck variables are
; not reset at a label unless --enable-var-scope is used, so an unbound D would
; silently reuse whatever register pair an earlier function matched.
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
269 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
270 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
271 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
272 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; Kernel: reads four consecutive doubles starting at %in[%tid] and stores two
; results to %out[%tid] and %out[%tid + 1].
274 define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
275 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
276 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
277 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
278 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
279 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
280 %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
281 %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
283 %a = load double, double addrspace(1)* %gep.0
284 %b = load double, double addrspace(1)* %gep.1
285 %c = load double, double addrspace(1)* %gep.2
286 %d = load double, double addrspace(1)* %gep.3
; -(a * b) - c and a * b - d: %mul is used both by the negation (spelled as
; fsub -0.0, %mul) and directly as the minuend of the second fsub.
288 %mul = fmul double %a, %b
289 %mul.neg = fsub double -0.0, %mul
290 %fma0 = fsub double %mul.neg, %c
291 %fma1 = fsub double %mul, %d
293 store double %fma0, double addrspace(1)* %gep.out.0
294 store double %fma1, double addrspace(1)* %gep.out.1
298 ; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
300 ; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
301 ; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
302 ; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
303 ; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
304 ; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
305 ; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
306 ; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
307 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
308 ; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: reads five consecutive doubles (x, y, z, u, v) starting at
; %in[%tid] and stores one result to %out[%tid].
309 define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
310 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
311 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
312 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
313 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
314 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
315 %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
316 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
318 %x = load double, double addrspace(1)* %gep.0
319 %y = load double, double addrspace(1)* %gep.1
320 %z = load double, double addrspace(1)* %gep.2
321 %u = load double, double addrspace(1)* %gep.3
322 %v = load double, double addrspace(1)* %gep.4
; fma(x, y, u * v) - z: the inner product is an explicit fmul feeding an
; llvm.fma.f64 call. The checks above expect the subtraction of z to be folded
; into an inner v_fma_f64 on U and V, whose result is the outer addend.
324 %tmp0 = fmul double %u, %v
325 %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
326 %tmp2 = fsub double %tmp1, %z
328 store double %tmp2, double addrspace(1)* %gep.out
332 ; fold (fsub x, (fma y, z, (fmul u, v)))
333 ; -> (fma (fneg y), z, (fma (fneg u), v, x))
335 ; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
336 ; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
337 ; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
338 ; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
339 ; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
340 ; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
341 ; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
342 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
343 ; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: reads five consecutive doubles (x, y, z, u, v) starting at
; %in[%tid] and stores one result to %out[%tid].
344 define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
345 %tid = tail call i32 @llvm.r600.read.tidig.x() #0
346 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
347 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
348 %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
349 %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
350 %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
351 %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
353 %x = load double, double addrspace(1)* %gep.0
354 %y = load double, double addrspace(1)* %gep.1
355 %z = load double, double addrspace(1)* %gep.2
356 %u = load double, double addrspace(1)* %gep.3
357 %v = load double, double addrspace(1)* %gep.4
; x - fma(y, z, u * v): the llvm.fma.f64 result is the subtrahend. The checks
; above expect an inner v_fma_f64 on -U, V, X whose result is the addend of an
; outer v_fma_f64 on -Y, Z.
359 %tmp0 = fmul double %u, %v
360 %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
361 %tmp2 = fsub double %x, %tmp1
363 store double %tmp2, double addrspace(1)* %gep.out
367 attributes #0 = { nounwind readnone }
368 attributes #1 = { nounwind }