radeonsi: update copyrights
[mesa.git] / src / gallium / drivers / radeonsi / si_state_binning.c
1 /*
2 * Copyright 2017 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /* This file handles register programming of primitive binning. */
26
27 #include "si_pipe.h"
28 #include "sid.h"
29 #include "gfx9d.h"
30 #include "radeon/r600_cs.h"
31
/* A pair of unsigned values; in this file it holds a bin size as
 * (width, height). Field order matters: the code below initializes
 * instances positionally.
 */
struct uvec2 {
	unsigned x;
	unsigned y;
};
35
/* One row of a bin-size lookup table. A row applies while the looked-up
 * sum is >= its own "start" and < the "start" of the following row.
 */
struct si_bin_size_map {
	unsigned start;      /* first sum value covered by this row */
	unsigned bin_size_x; /* bin width selected by this row */
	unsigned bin_size_y; /* bin height selected by this row */
};

/* A group of three row lists with room for nine rows each.
 * si_find_bin_size indexes the outer array by log2(RBs per SE) and this
 * type's first dimension by log2(number of SEs).
 */
typedef struct si_bin_size_map si_bin_size_subtable[3][9];
43
44 /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */
45 static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
46 const si_bin_size_subtable table[],
47 unsigned sum)
48 {
49 unsigned log_num_rb_per_se =
50 util_logbase2_ceil(sscreen->info.num_render_backends /
51 sscreen->info.max_se);
52 unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se);
53 unsigned i;
54
55 /* Get the chip-specific subtable. */
56 const struct si_bin_size_map *subtable =
57 &table[log_num_rb_per_se][log_num_se][0];
58
59 for (i = 0; subtable[i].start != UINT_MAX; i++) {
60 if (sum >= subtable[i].start && sum < subtable[i + 1].start)
61 break;
62 }
63
64 struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y};
65 return size;
66 }
67
68 static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
69 unsigned cb_target_enabled_4bit)
70 {
71 unsigned nr_samples = sctx->framebuffer.nr_samples;
72 unsigned sum = 0;
73
74 /* Compute the sum of all Bpp. */
75 for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
76 if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
77 continue;
78
79 struct r600_texture *rtex =
80 (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
81 sum += rtex->surface.bpe;
82 }
83
84 /* Multiply the sum by some function of the number of samples. */
85 if (nr_samples >= 2) {
86 if (si_get_ps_iter_samples(sctx) >= 2)
87 sum *= nr_samples;
88 else
89 sum *= 2;
90 }
91
92 static const si_bin_size_subtable table[] = {
93 {
94 /* One RB / SE */
95 {
96 /* One shader engine */
97 { 0, 128, 128 },
98 { 1, 64, 128 },
99 { 2, 32, 128 },
100 { 3, 16, 128 },
101 { 17, 0, 0 },
102 { UINT_MAX, 0, 0 },
103 },
104 {
105 /* Two shader engines */
106 { 0, 128, 128 },
107 { 2, 64, 128 },
108 { 3, 32, 128 },
109 { 5, 16, 128 },
110 { 17, 0, 0 },
111 { UINT_MAX, 0, 0 },
112 },
113 {
114 /* Four shader engines */
115 { 0, 128, 128 },
116 { 3, 64, 128 },
117 { 5, 16, 128 },
118 { 17, 0, 0 },
119 { UINT_MAX, 0, 0 },
120 },
121 },
122 {
123 /* Two RB / SE */
124 {
125 /* One shader engine */
126 { 0, 128, 128 },
127 { 2, 64, 128 },
128 { 3, 32, 128 },
129 { 5, 16, 128 },
130 { 33, 0, 0 },
131 { UINT_MAX, 0, 0 },
132 },
133 {
134 /* Two shader engines */
135 { 0, 128, 128 },
136 { 3, 64, 128 },
137 { 5, 32, 128 },
138 { 9, 16, 128 },
139 { 33, 0, 0 },
140 { UINT_MAX, 0, 0 },
141 },
142 {
143 /* Four shader engines */
144 { 0, 256, 256 },
145 { 2, 128, 256 },
146 { 3, 128, 128 },
147 { 5, 64, 128 },
148 { 9, 16, 128 },
149 { 33, 0, 0 },
150 { UINT_MAX, 0, 0 },
151 },
152 },
153 {
154 /* Four RB / SE */
155 {
156 /* One shader engine */
157 { 0, 128, 256 },
158 { 2, 128, 128 },
159 { 3, 64, 128 },
160 { 5, 32, 128 },
161 { 9, 16, 128 },
162 { 33, 0, 0 },
163 { UINT_MAX, 0, 0 },
164 },
165 {
166 /* Two shader engines */
167 { 0, 256, 256 },
168 { 2, 128, 256 },
169 { 3, 128, 128 },
170 { 5, 64, 128 },
171 { 9, 32, 128 },
172 { 17, 16, 128 },
173 { 33, 0, 0 },
174 { UINT_MAX, 0, 0 },
175 },
176 {
177 /* Four shader engines */
178 { 0, 256, 512 },
179 { 2, 256, 256 },
180 { 3, 128, 256 },
181 { 5, 128, 128 },
182 { 9, 64, 128 },
183 { 17, 16, 128 },
184 { 33, 0, 0 },
185 { UINT_MAX, 0, 0 },
186 },
187 },
188 };
189
190 return si_find_bin_size(sctx->screen, table, sum);
191 }
192
193 static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
194 {
195 struct si_state_dsa *dsa = sctx->queued.named.dsa;
196
197 if (!sctx->framebuffer.state.zsbuf ||
198 (!dsa->depth_enabled && !dsa->stencil_enabled)) {
199 /* Return the max size. */
200 struct uvec2 size = {512, 512};
201 return size;
202 }
203
204 struct r600_texture *rtex =
205 (struct r600_texture*)sctx->framebuffer.state.zsbuf->texture;
206 unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
207 unsigned stencil_coeff = rtex->surface.has_stencil &&
208 dsa->stencil_enabled ? 1 : 0;
209 unsigned sum = 4 * (depth_coeff + stencil_coeff) *
210 sctx->framebuffer.nr_samples;
211
212 static const si_bin_size_subtable table[] = {
213 {
214 // One RB / SE
215 {
216 // One shader engine
217 { 0, 128, 256 },
218 { 2, 128, 128 },
219 { 4, 64, 128 },
220 { 7, 32, 128 },
221 { 13, 16, 128 },
222 { 49, 0, 0 },
223 { UINT_MAX, 0, 0 },
224 },
225 {
226 // Two shader engines
227 { 0, 256, 256 },
228 { 2, 128, 256 },
229 { 4, 128, 128 },
230 { 7, 64, 128 },
231 { 13, 32, 128 },
232 { 25, 16, 128 },
233 { 49, 0, 0 },
234 { UINT_MAX, 0, 0 },
235 },
236 {
237 // Four shader engines
238 { 0, 256, 512 },
239 { 2, 256, 256 },
240 { 4, 128, 256 },
241 { 7, 128, 128 },
242 { 13, 64, 128 },
243 { 25, 16, 128 },
244 { 49, 0, 0 },
245 { UINT_MAX, 0, 0 },
246 },
247 },
248 {
249 // Two RB / SE
250 {
251 // One shader engine
252 { 0, 256, 256 },
253 { 2, 128, 256 },
254 { 4, 128, 128 },
255 { 7, 64, 128 },
256 { 13, 32, 128 },
257 { 25, 16, 128 },
258 { 97, 0, 0 },
259 { UINT_MAX, 0, 0 },
260 },
261 {
262 // Two shader engines
263 { 0, 256, 512 },
264 { 2, 256, 256 },
265 { 4, 128, 256 },
266 { 7, 128, 128 },
267 { 13, 64, 128 },
268 { 25, 32, 128 },
269 { 49, 16, 128 },
270 { 97, 0, 0 },
271 { UINT_MAX, 0, 0 },
272 },
273 {
274 // Four shader engines
275 { 0, 512, 512 },
276 { 2, 256, 512 },
277 { 4, 256, 256 },
278 { 7, 128, 256 },
279 { 13, 128, 128 },
280 { 25, 64, 128 },
281 { 49, 16, 128 },
282 { 97, 0, 0 },
283 { UINT_MAX, 0, 0 },
284 },
285 },
286 {
287 // Four RB / SE
288 {
289 // One shader engine
290 { 0, 256, 512 },
291 { 2, 256, 256 },
292 { 4, 128, 256 },
293 { 7, 128, 128 },
294 { 13, 64, 128 },
295 { 25, 32, 128 },
296 { 49, 16, 128 },
297 { UINT_MAX, 0, 0 },
298 },
299 {
300 // Two shader engines
301 { 0, 512, 512 },
302 { 2, 256, 512 },
303 { 4, 256, 256 },
304 { 7, 128, 256 },
305 { 13, 128, 128 },
306 { 25, 64, 128 },
307 { 49, 32, 128 },
308 { 97, 16, 128 },
309 { UINT_MAX, 0, 0 },
310 },
311 {
312 // Four shader engines
313 { 0, 512, 512 },
314 { 4, 256, 512 },
315 { 7, 256, 256 },
316 { 13, 128, 256 },
317 { 25, 128, 128 },
318 { 49, 64, 128 },
319 { 97, 16, 128 },
320 { UINT_MAX, 0, 0 },
321 },
322 },
323 };
324
325 return si_find_bin_size(sctx->screen, table, sum);
326 }
327
328 static void si_emit_dpbb_disable(struct si_context *sctx)
329 {
330 struct radeon_winsys_cs *cs = sctx->b.gfx_cs;
331
332 radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
333 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
334 S_028C44_DISABLE_START_OF_PRIM(1));
335 radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
336 S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF));
337 }
338
339 void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
340 {
341 struct si_screen *sscreen = sctx->screen;
342 struct si_state_blend *blend = sctx->queued.named.blend;
343 struct si_state_dsa *dsa = sctx->queued.named.dsa;
344 unsigned db_shader_control = sctx->ps_db_shader_control;
345
346 assert(sctx->b.chip_class >= GFX9);
347
348 if (!sscreen->dpbb_allowed || !blend || !dsa) {
349 si_emit_dpbb_disable(sctx);
350 return;
351 }
352
353 bool ps_can_kill = G_02880C_KILL_ENABLE(db_shader_control) ||
354 G_02880C_MASK_EXPORT_ENABLE(db_shader_control) ||
355 G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) ||
356 blend->alpha_to_coverage;
357
358 /* This is ported from Vulkan, but it doesn't make much sense to me.
359 * Maybe it's for RE-Z? But Vulkan doesn't use RE-Z. TODO: Clarify this.
360 */
361 bool ps_can_reject_z_trivially =
362 !G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
363 G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control);
364
365 /* Disable binning if PS can kill trivially with DB writes.
366 * Ported from Vulkan. (heuristic?)
367 */
368 if (ps_can_kill &&
369 ps_can_reject_z_trivially &&
370 sctx->framebuffer.state.zsbuf &&
371 dsa->db_can_write) {
372 si_emit_dpbb_disable(sctx);
373 return;
374 }
375
376 /* Compute the bin size. */
377 /* TODO: We could also look at enabled pixel shader outputs. */
378 unsigned cb_target_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit &
379 blend->cb_target_enabled_4bit;
380 struct uvec2 color_bin_size =
381 si_get_color_bin_size(sctx, cb_target_enabled_4bit);
382 struct uvec2 depth_bin_size = si_get_depth_bin_size(sctx);
383
384 unsigned color_area = color_bin_size.x * color_bin_size.y;
385 unsigned depth_area = depth_bin_size.x * depth_bin_size.y;
386
387 struct uvec2 bin_size = color_area < depth_area ? color_bin_size
388 : depth_bin_size;
389
390 if (!bin_size.x || !bin_size.y) {
391 si_emit_dpbb_disable(sctx);
392 return;
393 }
394
395 /* Enable DFSM if it's preferred. */
396 unsigned punchout_mode = V_028060_FORCE_OFF;
397 bool disable_start_of_prim = true;
398
399 if (sscreen->dfsm_allowed &&
400 cb_target_enabled_4bit &&
401 !G_02880C_KILL_ENABLE(db_shader_control) &&
402 /* These two also imply that DFSM is disabled when PS writes to memory. */
403 !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) &&
404 !G_02880C_EXEC_ON_NOOP(db_shader_control) &&
405 G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) {
406 punchout_mode = V_028060_AUTO;
407 disable_start_of_prim = (cb_target_enabled_4bit &
408 blend->blend_enable_4bit) != 0;
409 }
410
411 /* Tunable parameters. Also test with DFSM enabled/disabled. */
412 unsigned context_states_per_bin; /* allowed range: [0, 5] */
413 unsigned persistent_states_per_bin; /* allowed range: [0, 31] */
414 unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */
415
416 switch (sctx->b.family) {
417 case CHIP_VEGA10:
418 case CHIP_VEGA12:
419 case CHIP_RAVEN:
420 /* Tuned for Raven. Vega might need different values. */
421 context_states_per_bin = 5;
422 persistent_states_per_bin = 31;
423 fpovs_per_batch = 63;
424 break;
425 default:
426 assert(0);
427 }
428
429 /* Emit registers. */
430 struct uvec2 bin_size_extend = {};
431 if (bin_size.x >= 32)
432 bin_size_extend.x = util_logbase2(bin_size.x) - 5;
433 if (bin_size.y >= 32)
434 bin_size_extend.y = util_logbase2(bin_size.y) - 5;
435
436 struct radeon_winsys_cs *cs = sctx->b.gfx_cs;
437 radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
438 S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
439 S_028C44_BIN_SIZE_X(bin_size.x == 16) |
440 S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
441 S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
442 S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
443 S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
444 S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
445 S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
446 S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
447 S_028C44_OPTIMAL_BIN_SELECTION(1));
448 radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
449 S_028060_PUNCHOUT_MODE(punchout_mode));
450 }