2 * Copyright 2017 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 /* This file handles register programming of primitive binning. */
27 #include "si_build_pm4.h"
/* One entry of a bin-size lookup table. The code in this file reads the
 * members `start`, `bin_size_x` and `bin_size_y` from it.
 * NOTE(review): the member declarations and the closing brace are not
 * present in this listing - confirm against the full file. */
34 struct si_bin_size_map
{
/* A subtable is a 3 x 10 array of entries. si_find_bin_size() receives an
 * array of subtables and indexes it as
 * table[log_num_rb_per_se][log_num_se][entry]. */
40 typedef struct si_bin_size_map si_bin_size_subtable
[3][10];
42 /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */
/*
 * sscreen: screen whose info.num_render_backends and info.max_se select
 *          the subtable.
 * table:   array of subtables, indexed as
 *          table[log_num_rb_per_se][log_num_se][entry].
 *
 * The entries of the selected subtable are scanned until one with
 * bin_size_x == 0 is reached; a uvec2 is then built from entry i's
 * bin_size_x and bin_size_y.
 *
 * NOTE(review): several original lines are missing from this listing (the
 * `sum` parameter, the declaration of `i`, the body of the `if`, and the
 * return statement) - presumably the loop breaks on a match and `size` is
 * returned; confirm against the full file.
 */
43 static struct uvec2
si_find_bin_size(struct si_screen
*sscreen
,
44 const si_bin_size_subtable table
[],
/* ceil(log2(render backends per shader engine)): first table index. */
47 unsigned log_num_rb_per_se
=
48 util_logbase2_ceil(sscreen
->info
.num_render_backends
/
49 sscreen
->info
.max_se
);
/* ceil(log2(shader engines)): second table index. */
50 unsigned log_num_se
= util_logbase2_ceil(sscreen
->info
.max_se
);
53 /* Get the chip-specific subtable. */
54 const struct si_bin_size_map
*subtable
=
55 &table
[log_num_rb_per_se
][log_num_se
][0];
/* An entry with bin_size_x == 0 terminates the scan. */
57 for (i
= 0; subtable
[i
].bin_size_x
!= 0; i
++) {
58 if (sum
>= subtable
[i
].start
&& sum
< subtable
[i
+ 1].start
)
/* Entry i holds the result (either the match or the terminating entry). */
62 struct uvec2 size
= {subtable
[i
].bin_size_x
, subtable
[i
].bin_size_y
};
/*
 * Compute the bin size for the color buffers.
 *
 * sctx:                   context whose framebuffer state is inspected.
 * cb_target_enabled_4bit: mask with one 4-bit group per color buffer; a
 *                         buffer is skipped when its group is all zero.
 *
 * The bytes-per-element (surface.bpe) of every enabled color buffer are
 * summed, the sum is scaled when nr_color_samples >= 2, and the result is
 * looked up with si_find_bin_size().
 *
 * NOTE(review): this listing omits the declaration of `sum`, the body of
 * the skip test, the sample multipliers and every table row - confirm
 * against the full file.
 */
66 static struct uvec2
si_get_color_bin_size(struct si_context
*sctx
,
67 unsigned cb_target_enabled_4bit
)
69 unsigned num_fragments
= sctx
->framebuffer
.nr_color_samples
;
72 /* Compute the sum of all Bpp. */
73 for (unsigned i
= 0; i
< sctx
->framebuffer
.state
.nr_cbufs
; i
++) {
/* The mask constant is unsigned so that shifting it into bit 31 (i == 7)
 * is well defined; a signed 0xf << 28 is not representable in int. */
74 if (!(cb_target_enabled_4bit
& (0xfu << (i
* 4))))
77 struct si_texture
*tex
=
78 (struct si_texture
*)sctx
->framebuffer
.state
.cbufs
[i
]->texture
;
79 sum
+= tex
->surface
.bpe
;
82 /* Multiply the sum by some function of the number of samples. */
83 if (num_fragments
>= 2) {
84 if (si_get_ps_iter_samples(sctx
) >= 2)
/* Lookup table; only the section comments survive in this listing. */
90 static const si_bin_size_subtable table
[] = {
94 /* One shader engine */
102 /* Two shader engines */
110 /* Four shader engines */
120 /* One shader engine */
128 /* Two shader engines */
136 /* Four shader engines */
148 /* One shader engine */
157 /* Two shader engines */
167 /* Four shader engines */
179 return si_find_bin_size(sctx
->screen
, table
, sum
);
/*
 * Compute the bin size for the depth/stencil buffer.
 *
 * When there is no zsbuf, or the queued DSA state has neither depth nor
 * stencil enabled, a 512x512 size is built ("the max size").
 * Otherwise sum = 4 * (depth_coeff + stencil_coeff) * max(nr_samples, 1)
 * is looked up with si_find_bin_size().
 *
 * NOTE(review): the early return and every table row are missing from
 * this listing - confirm against the full file.
 */
182 static struct uvec2
si_get_depth_bin_size(struct si_context
*sctx
)
184 struct si_state_dsa
*dsa
= sctx
->queued
.named
.dsa
;
186 if (!sctx
->framebuffer
.state
.zsbuf
||
187 (!dsa
->depth_enabled
&& !dsa
->stencil_enabled
)) {
188 /* Return the max size. */
189 struct uvec2 size
= {512, 512};
193 struct si_texture
*tex
=
194 (struct si_texture
*)sctx
->framebuffer
.state
.zsbuf
->texture
;
/* Depth contributes a weight of 5 when enabled, otherwise 0. */
195 unsigned depth_coeff
= dsa
->depth_enabled
? 5 : 0;
/* Stencil contributes 1 only if the surface has stencil and it is enabled. */
196 unsigned stencil_coeff
= tex
->surface
.has_stencil
&&
197 dsa
->stencil_enabled
? 1 : 0;
198 unsigned sum
= 4 * (depth_coeff
+ stencil_coeff
) *
199 MAX2(tex
->buffer
.b
.b
.nr_samples
, 1);
/* Lookup table; only the section comments survive in this listing. */
201 static const si_bin_size_subtable table
[] = {
214 // Two shader engines
224 // Four shader engines
247 // Two shader engines
258 // Four shader engines
283 // Two shader engines
295 // Four shader engines
308 return si_find_bin_size(sctx
->screen
, table
, sum
);
/*
 * Program PA_SC_BINNER_CNTL_0 and DB_DFSM_CONTROL with binning disabled.
 *
 * Two register values are visible: one using
 * V_028C44_DISABLE_BINNING_USE_NEW_SC with an explicit bin size (inside the
 * chip_class >= GFX10 test) and one using
 * V_028C44_DISABLE_BINNING_USE_LEGACY_SC. DB_DFSM_CONTROL is written with
 * PUNCHOUT_MODE forced off. If the command-stream dword count changed,
 * context_roll is set; finally last_binning_enabled is cleared.
 *
 * NOTE(review): the assignment of bin_size.x and the line separating the
 * two PA_SC_BINNER_CNTL_0 writes (presumably `} else {`) are missing from
 * this listing - confirm against the full file.
 */
311 static void si_emit_dpbb_disable(struct si_context
*sctx
)
/* Remember the current dword count to detect whether anything is emitted. */
313 unsigned initial_cdw
= sctx
->gfx_cs
->current
.cdw
;
315 if (sctx
->chip_class
>= GFX10
) {
316 struct uvec2 bin_size
= {};
317 struct uvec2 bin_size_extend
= {};
320 bin_size
.y
= sctx
->framebuffer
.min_bytes_per_pixel
<= 4 ? 128 : 64;
/* For sizes >= 32 the EXTEND field is log2(size) - 5. */
322 if (bin_size
.x
>= 32)
323 bin_size_extend
.x
= util_logbase2(bin_size
.x
) - 5;
324 if (bin_size
.y
>= 32)
325 bin_size_extend
.y
= util_logbase2(bin_size
.y
) - 5;
327 radeon_opt_set_context_reg(sctx
, R_028C44_PA_SC_BINNER_CNTL_0
,
328 SI_TRACKED_PA_SC_BINNER_CNTL_0
,
329 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC
) |
330 S_028C44_BIN_SIZE_X(bin_size
.x
== 16) |
331 S_028C44_BIN_SIZE_Y(bin_size
.y
== 16) |
332 S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend
.x
) |
333 S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend
.y
) |
334 S_028C44_DISABLE_START_OF_PRIM(1) |
335 S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx
->last_binning_enabled
!= 0));
/* Legacy-SC variant: the flush bit is only requested on the listed
 * families, and only when binning was previously enabled. */
337 radeon_opt_set_context_reg(sctx
, R_028C44_PA_SC_BINNER_CNTL_0
,
338 SI_TRACKED_PA_SC_BINNER_CNTL_0
,
339 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC
) |
340 S_028C44_DISABLE_START_OF_PRIM(1) |
341 S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx
->family
== CHIP_VEGA12
||
342 sctx
->family
== CHIP_VEGA20
||
343 sctx
->family
>= CHIP_RAVEN2
) &&
344 sctx
->last_binning_enabled
!= 0));
/* The DB_DFSM_CONTROL register offset differs for chip_class >= GFX10. */
347 unsigned db_dfsm_control
= sctx
->chip_class
>= GFX10
? R_028038_DB_DFSM_CONTROL
348 : R_028060_DB_DFSM_CONTROL
;
349 radeon_opt_set_context_reg(sctx
, db_dfsm_control
,
350 SI_TRACKED_DB_DFSM_CONTROL
,
351 S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF
) |
352 S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
/* A changed dword count means a register was actually written. */
353 if (initial_cdw
!= sctx
->gfx_cs
->current
.cdw
)
354 sctx
->context_roll
= true;
356 sctx
->last_binning_enabled
= false;
/*
 * Emit the primitive-binning (DPBB) and DFSM register state for the
 * currently queued blend/DSA state and framebuffer.
 *
 * Binning is disabled through si_emit_dpbb_disable() when it is not
 * allowed, when blend or DSA state is missing, when dpbb_force_off is set,
 * when it is believed to be inefficient, or when the chosen bin size has a
 * zero dimension. Otherwise the smaller-area of the color and depth bin
 * sizes is programmed into PA_SC_BINNER_CNTL_0, DB_DFSM_CONTROL is
 * written, context_roll is set if anything was emitted, and
 * last_binning_enabled is set.
 *
 * NOTE(review): this listing omits several original lines (the `return`
 * after each disable call, the first and last terms of the "inefficient"
 * condition, the second arm of the bin_size ternary, one term of the DFSM
 * condition and the `else` of the tuning branch) - confirm against the
 * full file.
 */
359 void si_emit_dpbb_state(struct si_context
*sctx
)
361 struct si_screen
*sscreen
= sctx
->screen
;
362 struct si_state_blend
*blend
= sctx
->queued
.named
.blend
;
363 struct si_state_dsa
*dsa
= sctx
->queued
.named
.dsa
;
364 unsigned db_shader_control
= sctx
->ps_db_shader_control
;
366 assert(sctx
->chip_class
>= GFX9
);
368 if (!sscreen
->dpbb_allowed
|| !blend
|| !dsa
|| sctx
->dpbb_force_off
) {
369 si_emit_dpbb_disable(sctx
);
/* True when any of the kill / mask export / coverage-to-mask bits of
 * db_shader_control is set, or alpha-to-coverage is on. */
373 bool ps_can_kill
= G_02880C_KILL_ENABLE(db_shader_control
) ||
374 G_02880C_MASK_EXPORT_ENABLE(db_shader_control
) ||
375 G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control
) ||
376 blend
->alpha_to_coverage
;
378 bool db_can_reject_z_trivially
=
379 !G_02880C_Z_EXPORT_ENABLE(db_shader_control
) ||
380 G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control
) ||
381 G_02880C_DEPTH_BEFORE_SHADER(db_shader_control
);
383 /* Disable DPBB when it's believed to be inefficient. */
385 db_can_reject_z_trivially
&&
386 sctx
->framebuffer
.state
.zsbuf
&&
388 si_emit_dpbb_disable(sctx
);
392 /* Compute the bin size. */
393 /* TODO: We could also look at enabled pixel shader outputs. */
394 unsigned cb_target_enabled_4bit
= sctx
->framebuffer
.colorbuf_enabled_4bit
&
395 blend
->cb_target_enabled_4bit
;
396 struct uvec2 color_bin_size
=
397 si_get_color_bin_size(sctx
, cb_target_enabled_4bit
);
398 struct uvec2 depth_bin_size
= si_get_depth_bin_size(sctx
);
/* The areas decide which of the two candidate sizes is used. */
400 unsigned color_area
= color_bin_size
.x
* color_bin_size
.y
;
401 unsigned depth_area
= depth_bin_size
.x
* depth_bin_size
.y
;
403 struct uvec2 bin_size
= color_area
< depth_area
? color_bin_size
/* A zero dimension means no usable bin size: fall back to disabled. */
406 if (!bin_size
.x
|| !bin_size
.y
) {
407 si_emit_dpbb_disable(sctx
);
411 /* Enable DFSM if it's preferred. */
412 unsigned punchout_mode
= V_028060_FORCE_OFF
;
413 bool disable_start_of_prim
= true;
/* Set on GFX9 when the framebuffer sample count differs from the
 * Z/S texture's sample count (treated as at least 1). */
414 bool zs_eqaa_dfsm_bug
= sctx
->chip_class
== GFX9
&&
415 sctx
->framebuffer
.state
.zsbuf
&&
416 sctx
->framebuffer
.nr_samples
!=
417 MAX2(1, sctx
->framebuffer
.state
.zsbuf
->texture
->nr_samples
);
419 if (sscreen
->dfsm_allowed
&&
421 cb_target_enabled_4bit
&&
422 !G_02880C_KILL_ENABLE(db_shader_control
) &&
423 /* These two also imply that DFSM is disabled when PS writes to memory. */
424 !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control
) &&
425 !G_02880C_EXEC_ON_NOOP(db_shader_control
) &&
426 G_02880C_Z_ORDER(db_shader_control
) == V_02880C_EARLY_Z_THEN_LATE_Z
) {
427 punchout_mode
= V_028060_AUTO
;
/* Start-of-prim stays disabled only if an enabled target blends. */
428 disable_start_of_prim
= (cb_target_enabled_4bit
&
429 blend
->blend_enable_4bit
) != 0;
432 /* Tunable parameters. Also test with DFSM enabled/disabled. */
433 unsigned context_states_per_bin
; /* allowed range: [1, 6] */
434 unsigned persistent_states_per_bin
; /* allowed range: [1, 32] */
435 unsigned fpovs_per_batch
; /* allowed range: [0, 255], 0 = unlimited */
437 /* Tuned for Raven. Vega might need different values. */
438 if (sscreen
->info
.has_dedicated_vram
) {
439 context_states_per_bin
= 1;
440 persistent_states_per_bin
= 1;
442 context_states_per_bin
= 6;
443 persistent_states_per_bin
= 32;
445 fpovs_per_batch
= 63;
447 /* Emit registers. */
448 struct uvec2 bin_size_extend
= {};
/* For sizes >= 32 the EXTEND field is log2(size) - 5. */
449 if (bin_size
.x
>= 32)
450 bin_size_extend
.x
= util_logbase2(bin_size
.x
) - 5;
451 if (bin_size
.y
>= 32)
452 bin_size_extend
.y
= util_logbase2(bin_size
.y
) - 5;
/* Remember the current dword count to detect whether anything is emitted. */
454 unsigned initial_cdw
= sctx
->gfx_cs
->current
.cdw
;
455 radeon_opt_set_context_reg(
456 sctx
, R_028C44_PA_SC_BINNER_CNTL_0
,
457 SI_TRACKED_PA_SC_BINNER_CNTL_0
,
458 S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED
) |
459 S_028C44_BIN_SIZE_X(bin_size
.x
== 16) |
460 S_028C44_BIN_SIZE_Y(bin_size
.y
== 16) |
461 S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend
.x
) |
462 S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend
.y
) |
463 S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin
- 1) |
464 S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin
- 1) |
465 S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim
) |
466 S_028C44_FPOVS_PER_BATCH(fpovs_per_batch
) |
467 S_028C44_OPTIMAL_BIN_SELECTION(1) |
/* Use the S_ (field-building) macro here, as si_emit_dpbb_disable() does
 * for this field; the G_ form reads a field out of a register value and
 * would not place this boolean into the register. */
468 S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx
->last_binning_enabled
!= 1));
/* The DB_DFSM_CONTROL register offset differs for chip_class >= GFX10. */
470 unsigned db_dfsm_control
= sctx
->chip_class
>= GFX10
? R_028038_DB_DFSM_CONTROL
471 : R_028060_DB_DFSM_CONTROL
;
472 radeon_opt_set_context_reg(sctx
, db_dfsm_control
,
473 SI_TRACKED_DB_DFSM_CONTROL
,
474 S_028060_PUNCHOUT_MODE(punchout_mode
) |
475 S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
/* A changed dword count means a register was actually written. */
476 if (initial_cdw
!= sctx
->gfx_cs
->current
.cdw
)
477 sctx
->context_roll
= true;
479 sctx
->last_binning_enabled
= true;