radv: Implement binning on GFX9.
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Sat, 30 Dec 2017 16:31:44 +0000 (17:31 +0100)
committer Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Sun, 31 Dec 2017 14:07:07 +0000 (15:07 +0100)
Overall it does not really help or hurt. The deferred demo gets 1%
improvement and some games a 3% decrease, so I don't think this
should be enabled by default.

But with the code upstream it is easier to experiment with it.

v2: Remove initializing the registers from si_emit_config.

Reviewed-by: Dave Airlie <airlied@redhat.com>
src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_pipeline.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/si_cmd_buffer.c

index c735d2018023a61c9ae85d505b7cb8ad93f4439d..261344e939b33b0eb5717c98ac9ef57914953c7b 100644 (file)
@@ -1042,6 +1042,21 @@ radv_emit_vgt_vertex_reuse(struct radv_cmd_buffer *cmd_buffer,
                               pipeline->graphics.vtx_reuse_depth);
 }
 
+/* Write the pipeline's precomputed PA_SC_BINNER_CNTL_0 and DB_DFSM_CONTROL
+ * values into the command stream. Nothing is emitted when the chip class is
+ * below GFX9. */
+static void
+radv_emit_binning_state(struct radv_cmd_buffer *cmd_buffer,
+                          struct radv_pipeline *pipeline)
+{
+       const bool has_binner =
+               cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9;
+
+       if (has_binner) {
+               radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
+                                      pipeline->graphics.bin.pa_sc_binner_cntl_0);
+               radeon_set_context_reg(cmd_buffer->cs, R_028060_DB_DFSM_CONTROL,
+                                      pipeline->graphics.bin.db_dfsm_control);
+       }
+}
+
 static void
 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
 {
@@ -1059,6 +1074,7 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
        radv_emit_geometry_shader(cmd_buffer, pipeline);
        radv_emit_fragment_shader(cmd_buffer, pipeline);
        radv_emit_vgt_vertex_reuse(cmd_buffer, pipeline);
+       radv_emit_binning_state(cmd_buffer, pipeline);
 
        cmd_buffer->scratch_size_needed =
                                  MAX2(cmd_buffer->scratch_size_needed,
index 14ada20d525c4bf24ed7d04fccee5c411142743c..9b5728ee9e74856e5a5a8ca88a05c57d4a638e7c 100644 (file)
@@ -2002,6 +2002,329 @@ radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline,
        }
 }
 
+/* One row of a bin-size lookup table: a bytes-per-pixel threshold paired with
+ * the bin extent associated with it. The tables in radv_compute_bin_size()
+ * initialise these positionally, so the field order matters. */
+struct radv_bin_size_entry {
+       /* Threshold compared against the computed bytes-per-pixel cost. */
+       unsigned bpp;
+       /* Bin width/height for this row; {0, 0} rows mean "no usable bin". */
+       VkExtent2D extent;
+};
+
+/**
+ * Pick the bin extent for a graphics pipeline.
+ *
+ * The color and depth/stencil bytes-per-pixel costs of the pipeline's subpass
+ * are looked up in two tables indexed by log2(render backends per shader
+ * engine) and log2(shader engine count). Each table row is sorted by
+ * ascending bpp threshold and terminated by a UINT_MAX sentinel; the entry
+ * used is the last one whose threshold does not exceed the cost.
+ *
+ * \param pipeline     pipeline whose device info, MSAA state and blend target
+ *                     mask are read.
+ * \param pCreateInfo  create info providing the render pass and subpass index.
+ * \return the selected extent (component-wise minimum of the color and
+ *         depth/stencil results). An extent with a zero dimension is treated
+ *         by radv_compute_binning_state() as "leave binning disabled".
+ */
+static VkExtent2D
+radv_compute_bin_size(struct radv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo)
+{
+       static const struct radv_bin_size_entry color_size_table[][3][9] = {
+               {
+                       /* One RB / SE */
+                       {
+                               /* One shader engine */
+                               {        0, {128,  128}},
+                               {        1, { 64,  128}},
+                               {        2, { 32,  128}},
+                               {        3, { 16,  128}},
+                               {       17, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               /* Two shader engines */
+                               {        0, {128,  128}},
+                               {        2, { 64,  128}},
+                               {        3, { 32,  128}},
+                               {        5, { 16,  128}},
+                               {       17, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               /* Four shader engines */
+                               {        0, {128,  128}},
+                               {        3, { 64,  128}},
+                               {        5, { 16,  128}},
+                               {       17, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+               },
+               {
+                       /* Two RB / SE */
+                       {
+                               /* One shader engine */
+                               {        0, {128,  128}},
+                               {        2, { 64,  128}},
+                               {        3, { 32,  128}},
+                               {        5, { 16,  128}},
+                               {       33, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               /* Two shader engines */
+                               {        0, {128,  128}},
+                               {        3, { 64,  128}},
+                               {        5, { 32,  128}},
+                               {        9, { 16,  128}},
+                               {       33, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               /* Four shader engines */
+                               {        0, {256,  256}},
+                               {        2, {128,  256}},
+                               {        3, {128,  128}},
+                               {        5, { 64,  128}},
+                               {        9, { 16,  128}},
+                               {       33, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+               },
+               {
+                       /* Four RB / SE */
+                       {
+                               /* One shader engine */
+                               {        0, {128,  256}},
+                               {        2, {128,  128}},
+                               {        3, { 64,  128}},
+                               {        5, { 32,  128}},
+                               {        9, { 16,  128}},
+                               {       33, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               /* Two shader engines */
+                               {        0, {256,  256}},
+                               {        2, {128,  256}},
+                               {        3, {128,  128}},
+                               {        5, { 64,  128}},
+                               {        9, { 32,  128}},
+                               {       17, { 16,  128}},
+                               {       33, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               /* Four shader engines */
+                               {        0, {256,  512}},
+                               {        2, {256,  256}},
+                               {        3, {128,  256}},
+                               {        5, {128,  128}},
+                               {        9, { 64,  128}},
+                               {       17, { 16,  128}},
+                               {       33, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+               },
+       };
+       static const struct radv_bin_size_entry ds_size_table[][3][9] = {
+               {
+                       // One RB / SE
+                       {
+                               // One shader engine
+                               {        0, {128,  256}},
+                               {        2, {128,  128}},
+                               {        4, { 64,  128}},
+                               {        7, { 32,  128}},
+                               {       13, { 16,  128}},
+                               {       49, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               // Two shader engines
+                               {        0, {256,  256}},
+                               {        2, {128,  256}},
+                               {        4, {128,  128}},
+                               {        7, { 64,  128}},
+                               {       13, { 32,  128}},
+                               {       25, { 16,  128}},
+                               {       49, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               // Four shader engines
+                               {        0, {256,  512}},
+                               {        2, {256,  256}},
+                               {        4, {128,  256}},
+                               {        7, {128,  128}},
+                               {       13, { 64,  128}},
+                               {       25, { 16,  128}},
+                               {       49, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+               },
+               {
+                       // Two RB / SE
+                       {
+                               // One shader engine
+                               {        0, {256,  256}},
+                               {        2, {128,  256}},
+                               {        4, {128,  128}},
+                               {        7, { 64,  128}},
+                               {       13, { 32,  128}},
+                               {       25, { 16,  128}},
+                               {       97, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               // Two shader engines
+                               {        0, {256,  512}},
+                               {        2, {256,  256}},
+                               {        4, {128,  256}},
+                               {        7, {128,  128}},
+                               {       13, { 64,  128}},
+                               {       25, { 32,  128}},
+                               {       49, { 16,  128}},
+                               {       97, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               // Four shader engines
+                               {        0, {512,  512}},
+                               {        2, {256,  512}},
+                               {        4, {256,  256}},
+                               {        7, {128,  256}},
+                               {       13, {128,  128}},
+                               {       25, { 64,  128}},
+                               {       49, { 16,  128}},
+                               {       97, {  0,    0}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+               },
+               {
+                       // Four RB / SE
+                       {
+                               // One shader engine
+                               {        0, {256,  512}},
+                               {        2, {256,  256}},
+                               {        4, {128,  256}},
+                               {        7, {128,  128}},
+                               {       13, { 64,  128}},
+                               {       25, { 32,  128}},
+                               {       49, { 16,  128}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               // Two shader engines
+                               {        0, {512,  512}},
+                               {        2, {256,  512}},
+                               {        4, {256,  256}},
+                               {        7, {128,  256}},
+                               {       13, {128,  128}},
+                               {       25, { 64,  128}},
+                               {       49, { 32,  128}},
+                               {       97, { 16,  128}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+                       {
+                               // Four shader engines
+                               {        0, {512,  512}},
+                               {        4, {256,  512}},
+                               {        7, {256,  256}},
+                               {       13, {128,  256}},
+                               {       25, {128,  128}},
+                               {       49, { 64,  128}},
+                               {       97, { 16,  128}},
+                               { UINT_MAX, {  0,    0}},
+                       },
+               },
+       };
+
+       RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
+       struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
+       VkExtent2D extent = {512, 512};
+
+       /* Both values index the first two dimensions of the tables above. */
+       unsigned log_num_rb_per_se =
+           util_logbase2_ceil(pipeline->device->physical_device->rad_info.num_render_backends /
+                              pipeline->device->physical_device->rad_info.max_se);
+       unsigned log_num_se = util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_se);
+
+       unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_mode_cntl_1);
+       unsigned ps_iter_samples = 1u << G_028804_PS_ITER_SAMPLES(pipeline->graphics.ms.db_eqaa);
+       unsigned effective_samples = total_samples;
+       unsigned cb_target_mask = pipeline->graphics.blend.cb_target_mask;
+       unsigned color_bytes_per_pixel = 0;
+
+       /* Sum the block size of every color attachment the pipeline can write. */
+       for (unsigned i = 0; i < subpass->color_count; i++) {
+               /* Unsigned literal: a signed 0xf shifted by 28 would overflow int. */
+               if (!(cb_target_mask & (0xfu << (i * 4))))
+                       continue;
+
+               if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
+                       continue;
+
+               VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
+               color_bytes_per_pixel += vk_format_get_blocksize(format);
+       }
+
+       /* MSAA images typically don't use all samples all the time. */
+       if (effective_samples >= 2 && ps_iter_samples <= 1)
+               effective_samples = 2;
+       color_bytes_per_pixel *= effective_samples;
+
+       /* Stop on the last entry whose threshold is <= the cost. Looking at the
+        * next entry's threshold keeps a cost of 0 on the first row and prevents
+        * walking onto the UINT_MAX sentinel. */
+       const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
+       while(color_entry[1].bpp <= color_bytes_per_pixel)
+               ++color_entry;
+
+       extent = color_entry->extent;
+
+       if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+               struct radv_render_pass_attachment *attachment = pass->attachments + subpass->depth_stencil_attachment.attachment;
+
+               /* Coefficients taken from AMDVLK */
+               unsigned depth_coeff = vk_format_is_depth(attachment->format) ? 5 : 0;
+               unsigned stencil_coeff = vk_format_is_stencil(attachment->format) ? 1 : 0;
+               unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;
+
+               /* Same lookup rule as for the color table. */
+               const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
+               while(ds_entry[1].bpp <= ds_bytes_per_pixel)
+                       ++ds_entry;
+
+               /* The final bin must satisfy both the color and the DS limits. */
+               extent.width = MIN2(extent.width, ds_entry->extent.width);
+               extent.height = MIN2(extent.height, ds_entry->extent.height);
+       }
+
+       return extent;
+}
+
+/**
+ * Fill pipeline->graphics.bin with the PA_SC_BINNER_CNTL_0 and
+ * DB_DFSM_CONTROL values for this pipeline.
+ *
+ * Binning stays disabled (legacy SC mode) unless the device has pbb_allowed
+ * set and radv_compute_bin_size() returns an extent with non-zero width and
+ * height.
+ */
+static void
+radv_compute_binning_state(struct radv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo)
+{
+       /* Defaults: DFSM punchout forced off, binner in legacy mode. */
+       pipeline->graphics.bin.db_dfsm_control = S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF);
+       pipeline->graphics.bin.pa_sc_binner_cntl_0 =
+                       S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
+                       S_028C44_DISABLE_START_OF_PRIM(1);
+
+       if (!pipeline->device->pbb_allowed)
+               return;
+
+       const VkExtent2D bin = radv_compute_bin_size(pipeline, pCreateInfo);
+       if (bin.width == 0 || bin.height == 0)
+               return;
+
+       unsigned ctx_states; /* allowed range: [1, 6] */
+       unsigned persistent_states; /* allowed range: [1, 32] */
+       unsigned fpovs; /* allowed range: [0, 255], 0 = unlimited */
+
+       /* Per-family batch limits. */
+       switch (pipeline->device->physical_device->rad_info.family) {
+       case CHIP_VEGA10:
+               ctx_states = 1;
+               persistent_states = 1;
+               fpovs = 63;
+               break;
+       case CHIP_RAVEN:
+               ctx_states = 6;
+               persistent_states = 32;
+               fpovs = 63;
+               break;
+       default:
+               unreachable("unhandled family while determining binning state.");
+       }
+
+       /* A 16-pixel dimension is flagged through BIN_SIZE_X/Y; larger sizes are
+        * expressed as log2(size) - 5 in the *_EXTEND fields. */
+       const unsigned size_bits =
+                       S_028C44_BIN_SIZE_X(bin.width == 16) |
+                       S_028C44_BIN_SIZE_Y(bin.height == 16) |
+                       S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin.width, 32)) - 5) |
+                       S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin.height, 32)) - 5);
+
+       /* The per-bin state counts are written as (count - 1). */
+       const unsigned batch_bits =
+                       S_028C44_CONTEXT_STATES_PER_BIN(ctx_states - 1) |
+                       S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states - 1) |
+                       S_028C44_FPOVS_PER_BATCH(fpovs);
+
+       pipeline->graphics.bin.pa_sc_binner_cntl_0 =
+                       S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
+                       size_bits |
+                       batch_bits |
+                       S_028C44_DISABLE_START_OF_PRIM(1) |
+                       S_028C44_OPTIMAL_BIN_SELECTION(1);
+
+       /* DFSM is not implemented yet */
+       assert(!pipeline->device->dfsm_allowed);
+}
 
 static VkResult
 radv_pipeline_init(struct radv_pipeline *pipeline,
@@ -2290,6 +2613,8 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
                radv_dump_pipeline_stats(device, pipeline);
        }
 
+       radv_compute_binning_state(pipeline, pCreateInfo);
+
        result = radv_pipeline_scratch_init(device, pipeline);
        return result;
 }
index eb5a64d2536ee895ffd2192662ee275b386700c8..bae353c0e5f080ae4300cbeb6ad4c8f6f64f65df 100644 (file)
@@ -583,6 +583,7 @@ struct radv_device {
 
        bool llvm_supports_spill;
        bool has_distributed_tess;
+       bool pbb_allowed;
        bool dfsm_allowed;
        uint32_t tess_offchip_block_dw_size;
        uint32_t scratch_waves;
@@ -1165,6 +1166,11 @@ struct radv_vs_state {
        uint32_t vgt_reuse_off;
 };
 
+/* Per-pipeline binning register values. They are computed once in
+ * radv_compute_binning_state() and written to the command stream by
+ * radv_emit_binning_state(). */
+struct radv_binning_state {
+       /* Value written to R_028C44_PA_SC_BINNER_CNTL_0. */
+       uint32_t pa_sc_binner_cntl_0;
+       /* Value written to R_028060_DB_DFSM_CONTROL. */
+       uint32_t db_dfsm_control;
+};
+
 #define SI_GS_PER_ES 128
 
 struct radv_pipeline {
@@ -1193,6 +1199,7 @@ struct radv_pipeline {
                        struct radv_tessellation_state tess;
                        struct radv_gs_state gs;
                        struct radv_vs_state vs;
+                       struct radv_binning_state bin;
                        uint32_t db_shader_control;
                        uint32_t shader_z_format;
                        unsigned prim;
index a6981c136e70c5ce4ed2b832cb52804378f0f659..68913ec2ad3caf45318b590372d82f7103877437 100644 (file)
@@ -518,12 +518,6 @@ si_emit_config(struct radv_physical_device *physical_device,
                        assert(0);
                }
 
-               radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
-                                      S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF));
-               /* TODO: Enable the binner: */
-               radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
-                                      S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
-                                      S_028C44_DISABLE_START_OF_PRIM(1));
                radeon_set_context_reg(cs, R_028C48_PA_SC_BINNER_CNTL_1,
                                       S_028C48_MAX_ALLOC_COUNT(MIN2(128, pc_lines / (4 * num_se))) |
                                       S_028C48_MAX_PRIM_PER_BATCH(1023));