src/compiler/nir/nir_lower_amul.c
/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "nir.h"
#include "nir_vla.h"

/* Lowering for amul instructions, for drivers that support imul24.
 * This pass will analyze indirect derefs, and convert the
 * corresponding amul instructions to either imul or imul24,
 * depending on the required range:
 *
 * 1) Analyze the uniform variables and build a table of UBOs and SSBOs
 *    that are either too large, or might be too large (unknown size),
 *    for imul24
 *
 * 2) Loop through all the intrinsics, finding dereferences of large
 *    variables, and recursively replacing the amul instructions
 *    involved with imul
 *
 * 3) Finally loop again through all instructions, replacing any
 *    remaining amul with imul24.  At this point any remaining amul
 *    instructions are not involved in calculating an offset into a
 *    large variable, thanks to the 2nd step, so they can safely be
 *    replaced with imul24.
 *
 * Using two passes over all the instructions lets us handle the case
 * where, due to CSE, a single amul is used to calculate an offset
 * into both a large and a small variable.
 */

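/* As a rough illustration (the SSA names here are hypothetical), an
 * indirect UBO load whose offset is computed with amul:
 *
 *    vec1 32 ssa_3 = amul ssa_1, ssa_2
 *    vec1 32 ssa_4 = intrinsic load_ubo (ssa_0, ssa_3) (...)
 *
 * has its amul lowered to imul if ssa_0 indexes a UBO that is (or
 * might be) too large for imul24; otherwise the final pass lowers
 * it to imul24.
 */
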
typedef struct {
   int (*type_size)(const struct glsl_type *, bool);

   /* Tables of UBOs and SSBOs mapping driver_location/base to
    * whether they are too large to use imul24:
    */
   bool *large_ubos;
   bool *large_ssbos;

   /* For cases where we cannot determine the UBO/SSBO index, track
    * whether *any* UBO/SSBO is too large for imul24:
    */
   bool has_large_ubo;
   bool has_large_ssbo;
} lower_state;

/* Lower 'amul's in the offset src of large variables to 'imul': */
static bool
lower_large_src(nir_src *src, void *s)
{
   lower_state *state = s;

   assert(src->is_ssa);

   nir_instr *parent = src->ssa->parent_instr;

   /* No need to visit instructions we've already visited; this also
    * avoids infinite recursion when phis are involved:
    */
   if (parent->pass_flags)
      return false;

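   /* Recurse up the use-def chain first, so that every instruction
    * feeding into this source is also visited:
    */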
   bool progress = nir_foreach_src(parent, lower_large_src, state);

   if (parent->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(parent);
      if (alu->op == nir_op_amul) {
         alu->op = nir_op_imul;
         progress = true;
      }
   }

   parent->pass_flags = 1;

   return progress;
}

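/* Return whether a UBO/SSBO index refers to a large buffer.  If the
 * index is not a compile-time constant, be conservative and report
 * whether *any* buffer of that type is large:
 */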
static bool
large_ubo(lower_state *state, nir_src src)
{
   if (!nir_src_is_const(src))
      return state->has_large_ubo;
   return state->large_ubos[nir_src_as_uint(src)];
}

static bool
large_ssbo(lower_state *state, nir_src src)
{
   if (!nir_src_is_const(src))
      return state->has_large_ssbo;
   return state->large_ssbos[nir_src_as_uint(src)];
}

static bool
lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_load_ubo:
      /* src[] = { buffer_index, offset }. */
      if (large_ubo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_load_ssbo:
      /* src[] = { buffer_index, offset }. */
      if (large_ssbo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_store_ssbo:
      /* src[] = { value, block_index, offset } */
      if (large_ssbo(state, intr->src[1]))
         return lower_large_src(&intr->src[2], state);
      return false;

   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_ssbo_atomic_fmin:
   case nir_intrinsic_ssbo_atomic_fmax:
   case nir_intrinsic_ssbo_atomic_fcomp_swap:
      /* 0: SSBO index
       * 1: offset
       */
      if (large_ssbo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_global_atomic_add:
   case nir_intrinsic_global_atomic_imin:
   case nir_intrinsic_global_atomic_umin:
   case nir_intrinsic_global_atomic_imax:
   case nir_intrinsic_global_atomic_umax:
   case nir_intrinsic_global_atomic_and:
   case nir_intrinsic_global_atomic_or:
   case nir_intrinsic_global_atomic_xor:
   case nir_intrinsic_global_atomic_exchange:
   case nir_intrinsic_global_atomic_comp_swap:
   case nir_intrinsic_global_atomic_fadd:
   case nir_intrinsic_global_atomic_fmin:
   case nir_intrinsic_global_atomic_fmax:
   case nir_intrinsic_global_atomic_fcomp_swap:
      /* Just assume that 24b is not sufficient: */
      return lower_large_src(&intr->src[0], state);

   /* These should all be small enough to unconditionally use imul24: */
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
   case nir_intrinsic_shared_atomic_fadd:
   case nir_intrinsic_shared_atomic_fmin:
   case nir_intrinsic_shared_atomic_fmax:
   case nir_intrinsic_shared_atomic_fcomp_swap:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_output:
   case nir_intrinsic_store_output:
   default:
      return false;
   }
}

static bool
lower_instr(lower_state *state, nir_instr *instr)
{
   bool progress = false;

   if (instr->type == nir_instr_type_intrinsic) {
      progress |= lower_intrinsic(state, nir_instr_as_intrinsic(instr));
   }

   return progress;
}

static bool
is_large(lower_state *state, nir_variable *var)
{
   unsigned size = state->type_size(var->type, false);

   /* If size is not known (i.e. a VLA) then assume the worst: */
   if (!size)
      return true;

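   /* imul24 multiplies signed 24b values, so the largest offset that
    * can safely be computed with it is 2^23 - 1; treat any variable
    * at least that large as "large":
    */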
   return size >= (1 << 23);
}

bool
nir_lower_amul(nir_shader *shader,
               int (*type_size)(const struct glsl_type *, bool))
{
   assert(shader->options->has_imul24);
   assert(type_size);

   /* The uniforms list actually includes UBOs and SSBOs: */
   int num_uniforms = exec_list_length(&shader->uniforms);

   NIR_VLA_FILL(bool, large_ubos, num_uniforms, 0);
   NIR_VLA_FILL(bool, large_ssbos, num_uniforms, 0);

   lower_state state = {
      .type_size = type_size,
      .large_ubos = large_ubos,
      .large_ssbos = large_ssbos,
   };

   /* Figure out which UBOs or SSBOs are large enough to be
    * disqualified from imul24:
    */
   nir_foreach_variable(var, &shader->uniforms) {
      if (var->data.mode == nir_var_mem_ubo) {
         assert(var->data.driver_location < num_uniforms);
         if (is_large(&state, var)) {
            state.has_large_ubo = true;
            state.large_ubos[var->data.driver_location] = true;
         }
      } else if (var->data.mode == nir_var_mem_ssbo) {
         assert(var->data.driver_location < num_uniforms);
         if (is_large(&state, var)) {
            state.has_large_ssbo = true;
            state.large_ssbos[var->data.driver_location] = true;
         }
      }
   }

   /* Clear pass flags, which lower_large_src() uses to mark
    * already-visited instructions:
    */
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;
      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            instr->pass_flags = 0;
         }
      }
   }

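   /* Step 2 (see the comment at the top of the file): find accesses
    * to large variables and lower the 'amul's feeding their offsets
    * to 'imul':
    */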
   bool progress = false;
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;

      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            progress |= lower_instr(&state, instr);
         }
      }
   }

   /* At this point, all 'amul's used in calculating an offset into
    * a large variable have been replaced with 'imul'.  So remaining
    * 'amul's can be replaced with 'imul24':
    */
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;

      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_alu)
               continue;

            nir_alu_instr *alu = nir_instr_as_alu(instr);
            if (alu->op != nir_op_amul)
               continue;

            alu->op = nir_op_imul24;
            progress = true;
         }
      }

      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   }

   return progress;
}