2 * Copyright 2017 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 /* This file handles register programming of primitive binning. */
27 #include "si_build_pm4.h"
/* One entry of a bin-size lookup table. Code later in this file reads the
 * members .start, .bin_size_x and .bin_size_y of this struct.
 * NOTE(review): the member declarations and the closing brace are not present
 * in this listing. */
34 struct si_bin_size_map
{
/* A subtable is a 3 x 10 array of struct si_bin_size_map entries. */
40 typedef struct si_bin_size_map si_bin_size_subtable
[3][10];
42 /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */
/* Selects a row of entries with
 * table[log_num_rb_per_se][log_num_se] and scans it for the entry whose
 * [start, next start) range contains sum, stopping at an entry whose
 * bin_size_x is 0.
 * NOTE(review): this listing omits lines of the function (the rest of the
 * parameter list, the braces, the declarations of `i` and `sum`, the statement
 * guarded by the range test and the return statement); confirm against the
 * complete file. */
43 static struct uvec2
si_find_bin_size(struct si_screen
*sscreen
, const si_bin_size_subtable table
[],
/* First table index: util_logbase2_ceil of num_render_backends / max_se. */
46 unsigned log_num_rb_per_se
=
47 util_logbase2_ceil(sscreen
->info
.num_render_backends
/ sscreen
->info
.max_se
);
/* Second table index: util_logbase2_ceil of max_se. */
48 unsigned log_num_se
= util_logbase2_ceil(sscreen
->info
.max_se
);
51 /* Get the chip-specific subtable. */
52 const struct si_bin_size_map
*subtable
= &table
[log_num_rb_per_se
][log_num_se
][0];
/* An entry with bin_size_x == 0 terminates the scan. */
54 for (i
= 0; subtable
[i
].bin_size_x
!= 0; i
++) {
55 if (sum
>= subtable
[i
].start
&& sum
< subtable
[i
+ 1].start
)
/* Copy the x/y bin size of entry i into a uvec2. */
59 struct uvec2 size
= {subtable
[i
].bin_size_x
, subtable
[i
].bin_size_y
};
/* Color bin size from a static lookup table.
 * Accumulates tex->surface.bpe over the color buffers in
 * sctx->framebuffer.state.cbufs, testing each buffer's 4-bit group in
 * cb_target_enabled_4bit, and passes the resulting sum together with the
 * static table to si_find_bin_size().
 * NOTE(review): the declaration of `sum`, the statement guarded by the
 * enabled-mask test, the statements that scale the sum by sample count and
 * all table rows are missing from this listing. */
63 static struct uvec2
si_get_color_bin_size(struct si_context
*sctx
, unsigned cb_target_enabled_4bit
)
65 unsigned num_fragments
= sctx
->framebuffer
.nr_color_samples
;
68 /* Compute the sum of all Bpp. */
69 for (unsigned i
= 0; i
< sctx
->framebuffer
.state
.nr_cbufs
; i
++) {
/* Each color buffer owns 4 bits of the mask; this tests whether all four
 * bits for buffer i are clear. */
70 if (!(cb_target_enabled_4bit
& (0xf << (i
* 4))))
73 struct si_texture
*tex
= (struct si_texture
*)sctx
->framebuffer
.state
.cbufs
[i
]->texture
;
74 sum
+= tex
->surface
.bpe
;
77 /* Multiply the sum by some function of the number of samples. */
78 if (num_fragments
>= 2) {
79 if (si_get_ps_iter_samples(sctx
) >= 2)
/* Lookup table consumed by si_find_bin_size(); only the section comments
 * survive in this listing. */
85 static const si_bin_size_subtable table
[] = {
89 /* One shader engine */
97 /* Two shader engines */
105 /* Four shader engines */
115 /* One shader engine */
123 /* Two shader engines */
131 /* Four shader engines */
143 /* One shader engine */
152 /* Two shader engines */
162 /* Four shader engines */
174 return si_find_bin_size(sctx
->screen
, table
, sum
);
/* Depth/stencil bin size from a static lookup table.
 * When no zsbuf is bound, or neither depth nor stencil is enabled in the
 * queued DSA state, a 512 x 512 size is built (the existing comment calls it
 * the max size). Otherwise
 *   sum = 4 * (depth_coeff + stencil_coeff) * MAX2(nr_samples, 1)
 * is passed with the static table to si_find_bin_size().
 * NOTE(review): the return of the 512 x 512 size, the braces and all table
 * rows are missing from this listing. */
177 static struct uvec2
si_get_depth_bin_size(struct si_context
*sctx
)
179 struct si_state_dsa
*dsa
= sctx
->queued
.named
.dsa
;
181 if (!sctx
->framebuffer
.state
.zsbuf
|| (!dsa
->depth_enabled
&& !dsa
->stencil_enabled
)) {
182 /* Return the max size. */
183 struct uvec2 size
= {512, 512};
187 struct si_texture
*tex
= (struct si_texture
*)sctx
->framebuffer
.state
.zsbuf
->texture
;
/* Depth contributes a weight of 5 when enabled. */
188 unsigned depth_coeff
= dsa
->depth_enabled
? 5 : 0;
/* Stencil contributes a weight of 1, only when the surface has stencil and
 * stencil is enabled. */
189 unsigned stencil_coeff
= tex
->surface
.has_stencil
&& dsa
->stencil_enabled
? 1 : 0;
190 unsigned sum
= 4 * (depth_coeff
+ stencil_coeff
) * MAX2(tex
->buffer
.b
.b
.nr_samples
, 1);
/* Lookup table consumed by si_find_bin_size(); only the section comments
 * survive in this listing. */
192 static const si_bin_size_subtable table
[] = {
205 // Two shader engines
215 // Four shader engines
238 // Two shader engines
249 // Four shader engines
274 // Two shader engines
286 // Four shader engines
299 return si_find_bin_size(sctx
->screen
, table
, sum
);
/* Compute the color and depth bin sizes on the GFX10 path and store them
 * through color_bin_size and depth_bin_size.
 *
 * Color: cColor accumulates tex->surface.bpe * mmrt for every non-NULL color
 * buffer; when num_samples >= 2 an FMASK cost taken from cFmaskMrt is
 * accumulated in cFmask. Each cost divides its tag budget
 * (colorBinSizeTagPart / fmaskBinSizeTagPart), util_logbase2 of the quotient
 * is split into a width exponent (rounded up) and a height exponent (rounded
 * down), and the FMASK-derived size replaces the color-derived one when its
 * log2 pixel count is smaller.
 *
 * Depth: 512 x 512 when no zsbuf is bound; otherwise derived the same way
 * from depthBinSizeTagPart and cDepth.
 *
 * Stored results are at least minBinSizeX x minBinSizeY (128 x 64) wherever
 * MAX2 is applied below.
 *
 * NOTE(review): this listing omits lines of the function (braces, the
 * declarations of cColor and cFmask, the statement guarded by the NULL-cbuf
 * test, any use of has_fmask and the branch keyword between the two depth
 * cases). cb_target_enabled_4bit is not referenced in the visible lines. */
302 static void gfx10_get_bin_sizes(struct si_context
*sctx
, unsigned cb_target_enabled_4bit
,
303 struct uvec2
*color_bin_size
, struct uvec2
*depth_bin_size
)
/* Fixed tag sizes and tag counts used to build the budgets below. */
305 const unsigned ZsTagSize
= 64;
306 const unsigned ZsNumTags
= 312;
307 const unsigned CcTagSize
= 1024;
308 const unsigned CcReadTags
= 31;
309 const unsigned FcTagSize
= 256;
310 const unsigned FcReadTags
= 44;
312 const unsigned num_rbs
= sctx
->screen
->info
.num_render_backends
;
/* num_pipes is never smaller than num_rbs. */
313 const unsigned num_pipes
= MAX2(num_rbs
, sctx
->screen
->info
.num_sdp_interfaces
);
/* Budgets: (tags * num_rbs / num_pipes) * (tag size * num_pipes), using
 * unsigned integer division. */
315 const unsigned depthBinSizeTagPart
=
316 ((ZsNumTags
* num_rbs
/ num_pipes
) * (ZsTagSize
* num_pipes
));
317 const unsigned colorBinSizeTagPart
=
318 ((CcReadTags
* num_rbs
/ num_pipes
) * (CcTagSize
* num_pipes
));
319 const unsigned fmaskBinSizeTagPart
=
320 ((FcReadTags
* num_rbs
/ num_pipes
) * (FcTagSize
* num_pipes
));
322 const unsigned minBinSizeX
= 128;
323 const unsigned minBinSizeY
= 64;
325 const unsigned num_fragments
= sctx
->framebuffer
.nr_color_samples
;
326 const unsigned num_samples
= sctx
->framebuffer
.nr_samples
;
327 const bool ps_iter_sample
= si_get_ps_iter_samples(sctx
) >= 2;
329 /* Calculate cColor and cFmask(if applicable) */
332 bool has_fmask
= false;
334 for (unsigned i
= 0; i
< sctx
->framebuffer
.state
.nr_cbufs
; i
++) {
335 if (!sctx
->framebuffer
.state
.cbufs
[i
])
338 struct si_texture
*tex
= (struct si_texture
*)sctx
->framebuffer
.state
.cbufs
[i
]->texture
;
/* Per-buffer multiplier: 1 for a single fragment, num_fragments when
 * ps_iter_sample is set, otherwise 2. */
339 const unsigned mmrt
= num_fragments
== 1 ? 1 : (ps_iter_sample
? num_fragments
: 2);
341 cColor
+= tex
->surface
.bpe
* mmrt
;
342 if (num_samples
>= 2 /* if FMASK is bound */) {
343 const unsigned fragmentsLog2
= util_logbase2(num_fragments
);
344 const unsigned samplesLog2
= util_logbase2(num_samples
);
/* FMASK cost indexed by [log2(fragments)][log2(samples)]; the table is
 * dimensioned 4 x 5. */
346 static const unsigned cFmaskMrt
[4 /* fragments */][5 /* samples */] = {
347 {0, 1, 1, 1, 2}, /* fragments = 1 */
348 {0, 1, 1, 2, 4}, /* fragments = 2 */
349 {0, 1, 1, 4, 8}, /* fragments = 4 */
350 {0, 1, 2, 4, 8} /* fragments = 8 */
352 cFmask
+= cFmaskMrt
[fragmentsLog2
][samplesLog2
];
/* Clamp to 1 so the division below never divides by zero. */
356 cColor
= MAX2(cColor
, 1u);
358 const unsigned colorLog2Pixels
= util_logbase2(colorBinSizeTagPart
/ cColor
);
359 const unsigned colorBinSizeX
= 1 << ((colorLog2Pixels
+ 1) / 2); /* round up width */
360 const unsigned colorBinSizeY
= 1 << (colorLog2Pixels
/ 2); /* round down height */
362 unsigned binSizeX
= colorBinSizeX
;
363 unsigned binSizeY
= colorBinSizeY
;
/* Clamp to 1 so the division below never divides by zero. */
366 cFmask
= MAX2(cFmask
, 1u);
368 const unsigned fmaskLog2Pixels
= util_logbase2(fmaskBinSizeTagPart
/ cFmask
);
369 const unsigned fmaskBinSizeX
= 1 << ((fmaskLog2Pixels
+ 1) / 2); /* round up width */
370 const unsigned fmaskBinSizeY
= 1 << (fmaskLog2Pixels
/ 2); /* round down height */
372 /* use the smaller of the Color vs. Fmask bin sizes */
373 if (fmaskLog2Pixels
< colorLog2Pixels
) {
374 binSizeX
= fmaskBinSizeX
;
375 binSizeY
= fmaskBinSizeY
;
379 /* Return size adjusted for minimum bin size */
380 color_bin_size
->x
= MAX2(binSizeX
, minBinSizeX
);
381 color_bin_size
->y
= MAX2(binSizeY
, minBinSizeY
);
383 if (!sctx
->framebuffer
.state
.zsbuf
) {
384 /* Set to max sizes when no depth buffer is bound. */
385 depth_bin_size
->x
= 512;
386 depth_bin_size
->y
= 512;
388 struct si_texture
*zstex
= (struct si_texture
*)sctx
->framebuffer
.state
.zsbuf
->texture
;
389 struct si_state_dsa
*dsa
= sctx
->queued
.named
.dsa
;
/* Per-sample weights: 5 for depth and 1 for stencil, each only when enabled
 * in the queued DSA state. */
391 const unsigned cPerDepthSample
= dsa
->depth_enabled
? 5 : 0;
392 const unsigned cPerStencilSample
= dsa
->stencil_enabled
? 1 : 0;
393 const unsigned cDepth
=
394 (cPerDepthSample
+ cPerStencilSample
) * MAX2(zstex
->buffer
.b
.b
.nr_samples
, 1);
/* cDepth is clamped to 1 before dividing. */
396 const unsigned depthLog2Pixels
= util_logbase2(depthBinSizeTagPart
/ MAX2(cDepth
, 1u));
397 unsigned depthBinSizeX
= 1 << ((depthLog2Pixels
+ 1) / 2);
398 unsigned depthBinSizeY
= 1 << (depthLog2Pixels
/ 2);
400 depth_bin_size
->x
= MAX2(depthBinSizeX
, minBinSizeX
);
401 depth_bin_size
->y
= MAX2(depthBinSizeY
, minBinSizeY
);
/* Program the binner registers with binning disabled.
 *
 * Under the chip_class >= GFX10 test, PA_SC_BINNER_CNTL_0 is written with
 * BINNING_MODE = DISABLE_BINNING_USE_NEW_SC and an encoded bin size. The
 * second PA_SC_BINNER_CNTL_0 write uses DISABLE_BINNING_USE_LEGACY_SC.
 * DB_DFSM_CONTROL is then written with PUNCHOUT_MODE = FORCE_OFF.
 * sctx->context_roll is set when the command-stream dword count (cdw) changed
 * during these writes, and sctx->last_binning_enabled is cleared.
 *
 * NOTE(review): this listing omits lines of the function, including the
 * assignment of bin_size.x and the branch keyword separating the two
 * PA_SC_BINNER_CNTL_0 writes (presumably an else for pre-GFX10 chips);
 * confirm against the complete file. */
405 static void si_emit_dpbb_disable(struct si_context
*sctx
)
/* Remember the current cdw so a change can be detected afterwards. */
407 unsigned initial_cdw
= sctx
->gfx_cs
->current
.cdw
;
409 if (sctx
->chip_class
>= GFX10
) {
410 struct uvec2 bin_size
= {};
411 struct uvec2 bin_size_extend
= {};
414 bin_size
.y
= sctx
->framebuffer
.min_bytes_per_pixel
<= 4 ? 128 : 64;
/* Sizes of 32 and above are encoded in the *_EXTEND fields as log2(size) - 5;
 * a size of exactly 16 is flagged through BIN_SIZE_X / BIN_SIZE_Y below. */
416 if (bin_size
.x
>= 32)
417 bin_size_extend
.x
= util_logbase2(bin_size
.x
) - 5;
418 if (bin_size
.y
>= 32)
419 bin_size_extend
.y
= util_logbase2(bin_size
.y
) - 5;
421 radeon_opt_set_context_reg(
422 sctx
, R_028C44_PA_SC_BINNER_CNTL_0
, SI_TRACKED_PA_SC_BINNER_CNTL_0
,
423 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC
) |
424 S_028C44_BIN_SIZE_X(bin_size
.x
== 16) | S_028C44_BIN_SIZE_Y(bin_size
.y
== 16) |
425 S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend
.x
) |
426 S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend
.y
) | S_028C44_DISABLE_START_OF_PRIM(1) |
427 S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx
->last_binning_enabled
!= 0));
/* Legacy-SC variant: FLUSH_ON_BINNING_TRANSITION is only requested for
 * VEGA12, VEGA20 and families >= RAVEN2, and only when binning was enabled
 * previously. */
429 radeon_opt_set_context_reg(
430 sctx
, R_028C44_PA_SC_BINNER_CNTL_0
, SI_TRACKED_PA_SC_BINNER_CNTL_0
,
431 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC
) |
432 S_028C44_DISABLE_START_OF_PRIM(1) |
433 S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx
->family
== CHIP_VEGA12
||
434 sctx
->family
== CHIP_VEGA20
||
435 sctx
->family
>= CHIP_RAVEN2
) &&
436 sctx
->last_binning_enabled
!= 0));
/* The DB_DFSM_CONTROL register offset differs between GFX10+ and older
 * chips. */
439 unsigned db_dfsm_control
=
440 sctx
->chip_class
>= GFX10
? R_028038_DB_DFSM_CONTROL
: R_028060_DB_DFSM_CONTROL
;
441 radeon_opt_set_context_reg(
442 sctx
, db_dfsm_control
, SI_TRACKED_DB_DFSM_CONTROL
,
443 S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF
) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
/* A changed cdw means register writes were actually emitted. */
444 if (initial_cdw
!= sctx
->gfx_cs
->current
.cdw
)
445 sctx
->context_roll
= true;
447 sctx
->last_binning_enabled
= false;
/* Emit the primitive-binning (DPBB) register state for the current context.
 *
 * Visible steps:
 *  - assert that chip_class >= GFX9;
 *  - call si_emit_dpbb_disable() when binning is not allowed by the screen or
 *    is forced off on the context, when the inefficiency heuristic matches,
 *    or when the chosen bin size has a zero dimension;
 *  - obtain color and depth bin sizes (gfx10_get_bin_sizes() under the GFX10
 *    test, si_get_color_bin_size() / si_get_depth_bin_size() otherwise) and
 *    keep the one with the smaller area;
 *  - choose the DFSM punchout mode and the per-bin tunables;
 *  - write PA_SC_BINNER_CNTL_0 and DB_DFSM_CONTROL, set sctx->context_roll if
 *    the command-stream dword count changed, and set
 *    sctx->last_binning_enabled.
 *
 * NOTE(review): this listing omits lines of the function (braces, the
 * declaration of ps_can_kill, the else keywords and presumably a return after
 * each si_emit_dpbb_disable() call); confirm against the complete file. */
450 void si_emit_dpbb_state(struct si_context
*sctx
)
452 struct si_screen
*sscreen
= sctx
->screen
;
453 struct si_state_blend
*blend
= sctx
->queued
.named
.blend
;
454 struct si_state_dsa
*dsa
= sctx
->queued
.named
.dsa
;
455 unsigned db_shader_control
= sctx
->ps_db_shader_control
;
457 assert(sctx
->chip_class
>= GFX9
);
459 if (!sscreen
->dpbb_allowed
|| sctx
->dpbb_force_off
) {
460 si_emit_dpbb_disable(sctx
);
/* Continuation of an initializer whose first line is not in this listing
 * (presumably ps_can_kill, which is used in the heuristic below). */
465 G_02880C_KILL_ENABLE(db_shader_control
) || G_02880C_MASK_EXPORT_ENABLE(db_shader_control
) ||
466 G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control
) || blend
->alpha_to_coverage
;
468 bool db_can_reject_z_trivially
= !G_02880C_Z_EXPORT_ENABLE(db_shader_control
) ||
469 G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control
) ||
470 G_02880C_DEPTH_BEFORE_SHADER(db_shader_control
);
472 /* Disable DPBB when it's believed to be inefficient. */
473 if (sscreen
->info
.num_render_backends
> 4 && ps_can_kill
&& db_can_reject_z_trivially
&&
474 sctx
->framebuffer
.state
.zsbuf
&& dsa
->db_can_write
) {
475 si_emit_dpbb_disable(sctx
);
479 /* Compute the bin size. */
480 /* TODO: We could also look at enabled pixel shader outputs. */
/* Color targets enabled in both the framebuffer and the blend state. */
481 unsigned cb_target_enabled_4bit
=
482 sctx
->framebuffer
.colorbuf_enabled_4bit
& blend
->cb_target_enabled_4bit
;
483 struct uvec2 color_bin_size
, depth_bin_size
;
485 if (sctx
->chip_class
>= GFX10
) {
486 gfx10_get_bin_sizes(sctx
, cb_target_enabled_4bit
, &color_bin_size
, &depth_bin_size
);
488 color_bin_size
= si_get_color_bin_size(sctx
, cb_target_enabled_4bit
);
489 depth_bin_size
= si_get_depth_bin_size(sctx
);
492 unsigned color_area
= color_bin_size
.x
* color_bin_size
.y
;
493 unsigned depth_area
= depth_bin_size
.x
* depth_bin_size
.y
;
/* Keep the size with the smaller area; on a tie the depth size is used. */
495 struct uvec2 bin_size
= color_area
< depth_area
? color_bin_size
: depth_bin_size
;
497 if (!bin_size
.x
|| !bin_size
.y
) {
498 si_emit_dpbb_disable(sctx
);
502 /* Enable DFSM if it's preferred. */
503 unsigned punchout_mode
= V_028060_FORCE_OFF
;
504 bool disable_start_of_prim
= true;
/* True on GFX9 when a zsbuf is bound and the framebuffer sample count differs
 * from MAX2(1, zsbuf texture nr_samples). */
505 bool zs_eqaa_dfsm_bug
=
506 sctx
->chip_class
== GFX9
&& sctx
->framebuffer
.state
.zsbuf
&&
507 sctx
->framebuffer
.nr_samples
!= MAX2(1, sctx
->framebuffer
.state
.zsbuf
->texture
->nr_samples
);
509 if (sscreen
->dfsm_allowed
&& !zs_eqaa_dfsm_bug
&& cb_target_enabled_4bit
&&
510 !G_02880C_KILL_ENABLE(db_shader_control
) &&
511 /* These two also imply that DFSM is disabled when PS writes to memory. */
512 !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control
) &&
513 !G_02880C_EXEC_ON_NOOP(db_shader_control
) &&
514 G_02880C_Z_ORDER(db_shader_control
) == V_02880C_EARLY_Z_THEN_LATE_Z
) {
515 punchout_mode
= V_028060_AUTO
;
/* Stays true only when an enabled color target also has blending enabled. */
516 disable_start_of_prim
= (cb_target_enabled_4bit
& blend
->blend_enable_4bit
) != 0;
519 /* Tunable parameters. Also test with DFSM enabled/disabled. */
520 unsigned context_states_per_bin
; /* allowed range: [1, 6] */
521 unsigned persistent_states_per_bin
; /* allowed range: [1, 32] */
522 unsigned fpovs_per_batch
; /* allowed range: [0, 255], 0 = unlimited */
524 /* Tuned for Raven. Vega might need different values. */
525 if (sscreen
->info
.has_dedicated_vram
) {
526 if (sscreen
->info
.num_render_backends
> 4) {
527 context_states_per_bin
= 1;
528 persistent_states_per_bin
= 1;
530 context_states_per_bin
= 3;
531 persistent_states_per_bin
= 8;
534 /* This is a workaround for:
535 * https://bugs.freedesktop.org/show_bug.cgi?id=110214
536 * (an alternative is to insert manual BATCH_BREAK event when
537 * a context_roll is detected). */
538 context_states_per_bin
= sctx
->screen
->info
.has_gfx9_scissor_bug
? 1 : 6;
539 /* Using 32 here can cause GPU hangs on RAVEN1 */
540 persistent_states_per_bin
= 16;
542 fpovs_per_batch
= 63;
544 /* Emit registers. */
/* Sizes of 32 and above are encoded in the *_EXTEND fields as log2(size) - 5;
 * a size of exactly 16 is flagged through BIN_SIZE_X / BIN_SIZE_Y below. */
545 struct uvec2 bin_size_extend
= {};
546 if (bin_size
.x
>= 32)
547 bin_size_extend
.x
= util_logbase2(bin_size
.x
) - 5;
548 if (bin_size
.y
>= 32)
549 bin_size_extend
.y
= util_logbase2(bin_size
.y
) - 5;
/* Remember the current cdw so a change can be detected afterwards. */
551 unsigned initial_cdw
= sctx
->gfx_cs
->current
.cdw
;
/* The per-bin state counts are written minus one. FLUSH_ON_BINNING_TRANSITION
 * is only requested for VEGA12, VEGA20 and families >= RAVEN2, and only when
 * last_binning_enabled is not 1. */
552 radeon_opt_set_context_reg(
553 sctx
, R_028C44_PA_SC_BINNER_CNTL_0
, SI_TRACKED_PA_SC_BINNER_CNTL_0
,
554 S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED
) | S_028C44_BIN_SIZE_X(bin_size
.x
== 16) |
555 S_028C44_BIN_SIZE_Y(bin_size
.y
== 16) | S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend
.x
) |
556 S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend
.y
) |
557 S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin
- 1) |
558 S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin
- 1) |
559 S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim
) |
560 S_028C44_FPOVS_PER_BATCH(fpovs_per_batch
) | S_028C44_OPTIMAL_BIN_SELECTION(1) |
561 S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx
->family
== CHIP_VEGA12
||
562 sctx
->family
== CHIP_VEGA20
||
563 sctx
->family
>= CHIP_RAVEN2
) &&
564 sctx
->last_binning_enabled
!= 1));
/* The DB_DFSM_CONTROL register offset differs between GFX10+ and older
 * chips. */
566 unsigned db_dfsm_control
=
567 sctx
->chip_class
>= GFX10
? R_028038_DB_DFSM_CONTROL
: R_028060_DB_DFSM_CONTROL
;
568 radeon_opt_set_context_reg(
569 sctx
, db_dfsm_control
, SI_TRACKED_DB_DFSM_CONTROL
,
570 S_028060_PUNCHOUT_MODE(punchout_mode
) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
/* A changed cdw means register writes were actually emitted. */
571 if (initial_cdw
!= sctx
->gfx_cs
->current
.cdw
)
572 sctx
->context_roll
= true;
574 sctx
->last_binning_enabled
= true;