radv/gfx10: compute the LDS size for exporting PrimID for VS
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index c0676706cd669503f76fe3a036d910dd09a5c56f..3e448db1fd532351deafd25ddcd38651cb31799e 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -865,7 +865,7 @@ radv_pipeline_init_blend_state(struct radv_pipeline *pipeline,
                blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
        }
 
-       if (pipeline->device->physical_device->has_rbplus) {
+       if (pipeline->device->physical_device->rad_info.has_rbplus) {
                /* Disable RB+ blend optimizations for dual source blending. */
                if (blend.mrt0_is_dual_src) {
                        for (i = 0; i < 8; i++) {
@@ -1751,6 +1751,15 @@ calculate_ngg_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
                if (es_info->info.so.num_outputs)
                        esvert_lds_size = 4 * es_info->info.so.num_outputs + 1;
                */
+
+               /* LDS size for passing data from GS to ES.
+                * GS stores Primitive IDs (one DWORD) into LDS at the address
+                * corresponding to the ES thread of the provoking vertex. All
+                * ES threads load and export PrimitiveID for their thread.
+                */
+               if (!radv_pipeline_has_tess(pipeline) &&
+                   pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.export_prim_id)
+                       esvert_lds_size = MAX2(esvert_lds_size, 1);
        }
 
        unsigned max_gsprims = max_gsprims_base;
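
For context, a minimal sketch of how a per-ES-vertex dword count like this typically feeds the total NGG LDS allocation. The limits and the lack of final alignment here are assumptions for illustration, not the exact calculate_ngg_info() math:

    /* Illustrative only: one dword of LDS per ES vertex is enough to hold the
     * PrimitiveID written by the GS threads, hence MAX2(esvert_lds_size, 1). */
    unsigned esvert_lds_size = 1;   /* dwords per ES vertex (PrimID slot)   */
    unsigned gsprim_lds_size = 0;   /* dwords per GS primitive (none here)  */
    unsigned max_esverts = 252;     /* assumed per-subgroup vertex limit    */
    unsigned max_gsprims = 126;     /* assumed per-subgroup primitive limit */
    unsigned lds_dwords = esvert_lds_size * max_esverts +
                          gsprim_lds_size * max_gsprims;
    unsigned lds_bytes  = lds_dwords * 4;  /* 252 dwords -> 1008 bytes here */
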
@@ -2010,7 +2019,7 @@ calculate_tess_state(struct radv_pipeline *pipeline,
        else
                topology = V_028B6C_OUTPUT_TRIANGLE_CW;
 
-       if (pipeline->device->has_distributed_tess) {
+       if (pipeline->device->physical_device->rad_info.has_distributed_tess) {
                if (pipeline->device->physical_device->rad_info.family == CHIP_FIJI ||
                    pipeline->device->physical_device->rad_info.family >= CHIP_POLARIS10)
                        distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS;
@@ -2320,6 +2329,7 @@ radv_fill_shader_keys(struct radv_device *device,
        }
 
        if (device->physical_device->rad_info.chip_class >= GFX10 &&
+           device->physical_device->rad_info.family != CHIP_NAVI14 &&
            !(device->instance->debug_flags & RADV_DEBUG_NO_NGG)) {
                if (nir[MESA_SHADER_TESS_CTRL]) {
                        keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = true;
@@ -2339,6 +2349,26 @@ radv_fill_shader_keys(struct radv_device *device,
                        keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false;
                }
 
+               /*
+                * Disable NGG with geometry shaders. There are a bunch of
+                * issues still:
+                *   * GS primitives in pipeline statistic queries do not get
+                *     updates. See dEQP-VK.query_pool.statistics_query.geometry_shader_primitives
+                *   * dEQP-VK.clipping.user_defined.clip_cull_distance_dynamic_index.*geom* failures
+                *   * Interactions with tessellation failing:
+                *     dEQP-VK.tessellation.geometry_interaction.passthrough.tessellate_isolines_passthrough_geometry_no_change
+                *   * General issues with the last primitive missing/corrupt:
+                *     https://bugs.freedesktop.org/show_bug.cgi?id=111248
+                *
+                * Furthermore, XGL/AMDVLK also disables this as of 9b632ef.
+                */
+               if (nir[MESA_SHADER_GEOMETRY]) {
+                       if (nir[MESA_SHADER_TESS_CTRL])
+                               keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false;
+                       else
+                               keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg = false;
+               }
+
                /* TODO: Implement streamout support for NGG. */
                gl_shader_stage last_xfb_stage = MESA_SHADER_VERTEX;
 
@@ -3191,7 +3221,7 @@ radv_pipeline_generate_binning_state(struct radeon_cmdbuf *ctx_cs,
                        fpovs_per_batch = 63;
                } else {
                        /* The context states are affected by the scissor bug. */
-                       context_states_per_bin = pipeline->device->physical_device->has_scissor_bug ? 1 : 6;
+                       context_states_per_bin = pipeline->device->physical_device->rad_info.has_gfx9_scissor_bug ? 1 : 6;
                        /* 32 causes hangs for RAVEN. */
                        persistent_states_per_bin = 16;
                        fpovs_per_batch = 63;
@@ -3308,7 +3338,7 @@ radv_pipeline_generate_blend_state(struct radeon_cmdbuf *ctx_cs,
        radeon_set_context_reg(ctx_cs, R_028808_CB_COLOR_CONTROL, blend->cb_color_control);
        radeon_set_context_reg(ctx_cs, R_028B70_DB_ALPHA_TO_MASK, blend->db_alpha_to_mask);
 
-       if (pipeline->device->physical_device->has_rbplus) {
+       if (pipeline->device->physical_device->rad_info.has_rbplus) {
 
                radeon_set_context_reg_seq(ctx_cs, R_028760_SX_MRT0_BLEND_OPT, 8);
                radeon_emit_array(ctx_cs, blend->sx_mrt_blend_opt, 8);
@@ -3812,6 +3842,14 @@ radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *ctx_cs,
        else
                radeon_set_context_reg(ctx_cs, R_028B58_VGT_LS_HS_CONFIG,
                                       tess->ls_hs_config);
+
+       if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 &&
+           !radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) {
+               radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
+                                      S_028A44_ES_VERTS_PER_SUBGRP(250) |
+                                      S_028A44_GS_PRIMS_PER_SUBGRP(126) |
+                                      S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
+       }
 }
 
 static void
@@ -4025,8 +4063,8 @@ radv_compute_db_shader_control(const struct radv_device *device,
        else
                z_order = V_02880C_LATE_Z;
 
-       bool disable_rbplus = device->physical_device->has_rbplus &&
-                             !device->physical_device->rbplus_allowed;
+       bool disable_rbplus = device->physical_device->rad_info.has_rbplus &&
+                             !device->physical_device->rad_info.rbplus_allowed;
 
        /* It shouldn't be needed to export gl_SampleMask when MSAA is disabled
         * but this appears to break Project Cars (DXVK). See
@@ -4349,7 +4387,7 @@ radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline,
                    radv_pipeline_has_gs(pipeline))
                        ia_multi_vgt_param.partial_vs_wave = true;
                /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
-               if (device->has_distributed_tess) {
+               if (device->physical_device->rad_info.has_distributed_tess) {
                        if (radv_pipeline_has_gs(pipeline)) {
                                if (device->physical_device->rad_info.chip_class <= GFX8)
                                        ia_multi_vgt_param.partial_es_wave = true;
@@ -4795,6 +4833,32 @@ static uint32_t radv_get_executable_count(const struct radv_pipeline *pipeline)
        return ret;
 }
 
+static struct radv_shader_variant *
+radv_get_shader_from_executable_index(const struct radv_pipeline *pipeline, int index, gl_shader_stage *stage)
+{
+       for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
+               if (!pipeline->shaders[i])
+                       continue;
+               if (!index) {
+                       *stage = i;
+                       return pipeline->shaders[i];
+               }
+
+               --index;
+
+               if (i == MESA_SHADER_GEOMETRY) {
+                       if (!index) {
+                               *stage = i;
+                               return pipeline->gs_copy_shader;
+                       }
+                       --index;
+               }
+       }
+
+       *stage = -1;
+       return NULL;
+}
+
 /* Basically strlcpy (which does not exist on linux) specialized for
  * descriptions. */
 static void desc_copy(char *desc, const char *src) {
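
To make the index mapping above concrete: with the standard gl_shader_stage ordering (VERTEX < TESS_CTRL < TESS_EVAL < GEOMETRY < FRAGMENT), a legacy (non-NGG) pipeline containing VS, GS and FS exposes four executables, and the GS copy shader slots in right after the GS. An illustrative walk-through, not part of the patch:

    gl_shader_stage stage;
    struct radv_shader_variant *v;

    v = radv_get_shader_from_executable_index(pipeline, 0, &stage); /* VS             */
    v = radv_get_shader_from_executable_index(pipeline, 1, &stage); /* GS             */
    v = radv_get_shader_from_executable_index(pipeline, 2, &stage); /* GS copy shader */
    v = radv_get_shader_from_executable_index(pipeline, 3, &stage); /* FS             */
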
@@ -4821,7 +4885,7 @@ VkResult radv_GetPipelineExecutablePropertiesKHR(
        const uint32_t count = MIN2(total_count, *pExecutableCount);
        for (unsigned i = 0, executable_idx = 0;
             i < MESA_SHADER_STAGES && executable_idx < count; ++i) {
-               if (pipeline->shaders[i])
+               if (!pipeline->shaders[i])
                        continue;
                pProperties[executable_idx].stages = mesa_to_vk_shader_stage(i);
                const char *name = NULL;
@@ -4879,10 +4943,11 @@ VkResult radv_GetPipelineExecutablePropertiesKHR(
                                break;
 
                        pProperties[executable_idx].stages = VK_SHADER_STAGE_GEOMETRY_BIT;
-                       snprintf(pProperties[executable_idx].name, VK_MAX_DESCRIPTION_SIZE,
-                                "GS Copy Shader");
-                       snprintf(pProperties[executable_idx].description, VK_MAX_DESCRIPTION_SIZE,
-                                "Extra shader stage that loads the GS output ringbuffer into the rasterizer");
+                       desc_copy(pProperties[executable_idx].name, "GS Copy Shader");
+                       desc_copy(pProperties[executable_idx].description,
+                                 "Extra shader stage that loads the GS output ringbuffer into the rasterizer");
+
+                       ++executable_idx;
                }
        }
 
@@ -4893,3 +4958,178 @@ VkResult radv_GetPipelineExecutablePropertiesKHR(
        *pExecutableCount = count;
        return result;
 }
+
+VkResult radv_GetPipelineExecutableStatisticsKHR(
+    VkDevice                                    _device,
+    const VkPipelineExecutableInfoKHR*          pExecutableInfo,
+    uint32_t*                                   pStatisticCount,
+    VkPipelineExecutableStatisticKHR*           pStatistics)
+{
+       RADV_FROM_HANDLE(radv_device, device, _device);
+       RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
+       gl_shader_stage stage;
+       struct radv_shader_variant *shader = radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
+
+       enum chip_class chip_class = device->physical_device->rad_info.chip_class;
+       unsigned lds_increment = chip_class >= GFX7 ? 512 : 256;
+       unsigned max_waves = radv_get_max_waves(device, shader, stage);
+
+       VkPipelineExecutableStatisticKHR *s = pStatistics;
+       VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0);
+       VkResult result = VK_SUCCESS;
+
+       if (s < end) {
+               desc_copy(s->name, "SGPRs");
+               desc_copy(s->description, "Number of SGPR registers allocated per subgroup");
+               s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+               s->value.u64 = shader->config.num_sgprs;
+       }
+       ++s;
+
+       if (s < end) {
+               desc_copy(s->name, "VGPRs");
+               desc_copy(s->description, "Number of VGPR registers allocated per subgroup");
+               s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+               s->value.u64 = shader->config.num_vgprs;
+       }
+       ++s;
+
+       if (s < end) {
+               desc_copy(s->name, "Spilled SGPRs");
+               desc_copy(s->description, "Number of SGPR registers spilled per subgroup");
+               s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+               s->value.u64 = shader->config.spilled_sgprs;
+       }
+       ++s;
+
+       if (s < end) {
+               desc_copy(s->name, "Spilled VGPRs");
+               desc_copy(s->description, "Number of VGPR registers spilled per subgroup");
+               s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+               s->value.u64 = shader->config.spilled_vgprs;
+       }
+       ++s;
+
+       if (s < end) {
+               desc_copy(s->name, "PrivMem VGPRs");
+               desc_copy(s->description, "Number of VGPRs stored in private memory per subgroup");
+               s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+               s->value.u64 = shader->info.private_mem_vgprs;
+       }
+       ++s;
+
+       if (s < end) {
+               desc_copy(s->name, "Code size");
+               desc_copy(s->description, "Code size in bytes");
+               s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+               s->value.u64 = shader->code_size;
+       }
+       ++s;
+
+       if (s < end) {
+               desc_copy(s->name, "LDS size");
+               desc_copy(s->description, "LDS size in bytes per workgroup");
+               s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+               s->value.u64 = shader->config.lds_size * lds_increment;
+       }
+       ++s;
+
+       if (s < end) {
+               desc_copy(s->name, "Scratch size");
+               desc_copy(s->description, "Private memory in bytes per subgroup");
+               s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+               s->value.u64 = shader->config.scratch_bytes_per_wave;
+       }
+       ++s;
+
+       if (s < end) {
+               desc_copy(s->name, "Subgroups per SIMD");
+               desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit");
+               s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+               s->value.u64 = max_waves;
+       }
+       ++s;
+
+       if (!pStatistics)
+               *pStatisticCount = s - pStatistics;
+       else if (s > end) {
+               *pStatisticCount = end - pStatistics;
+               result = VK_INCOMPLETE;
+       } else {
+               *pStatisticCount = s - pStatistics;
+       }
+
+       return result;
+}
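
On the application side this entry point is consumed with the usual Vulkan two-call idiom. A hedged caller-side sketch (requires <vulkan/vulkan.h> and <stdlib.h>, assumes the VK_KHR_pipeline_executable_properties entry point was resolved via vkGetDeviceProcAddr, and uses illustrative variable names):

    VkPipelineExecutableInfoKHR exec_info = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
        .pipeline = pipeline,               /* a VkPipeline created earlier */
        .executableIndex = 0,               /* e.g. the vertex shader       */
    };
    uint32_t stat_count = 0;
    /* First call: query how many statistics the executable exposes. */
    vkGetPipelineExecutableStatisticsKHR(device, &exec_info, &stat_count, NULL);
    VkPipelineExecutableStatisticKHR *stats = calloc(stat_count, sizeof(*stats));
    for (uint32_t i = 0; i < stat_count; i++)
        stats[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR;
    /* Second call: fill in SGPRs, VGPRs, code size, LDS, scratch, waves. */
    vkGetPipelineExecutableStatisticsKHR(device, &exec_info, &stat_count, stats);
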
+
+static VkResult radv_copy_representation(void *data, size_t *data_size, const char *src)
+{
+       size_t total_size = strlen(src) + 1;
+
+       if (!data) {
+               *data_size = total_size;
+               return VK_SUCCESS;
+       }
+
+       size_t size = MIN2(total_size, *data_size);
+
+       memcpy(data, src, size);
+       if (size)
+               *((char*)data + size - 1) = 0;
+       return size < total_size ? VK_INCOMPLETE : VK_SUCCESS;
+}
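
A worked example of the truncation behaviour, with purely illustrative values:

    char buf[4];
    size_t size = sizeof(buf);
    /* "abcdef" needs 7 bytes including the NUL; only "abc" plus the
     * terminating NUL fit, so the copy is cut short and VK_INCOMPLETE is
     * returned.  *data_size is written back only when data is NULL, i.e.
     * on the size-query call. */
    VkResult r = radv_copy_representation(buf, &size, "abcdef");
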
+
+VkResult radv_GetPipelineExecutableInternalRepresentationsKHR(
+    VkDevice                                    device,
+    const VkPipelineExecutableInfoKHR*          pExecutableInfo,
+    uint32_t*                                   pInternalRepresentationCount,
+    VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
+{
+       RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
+       gl_shader_stage stage;
+       struct radv_shader_variant *shader = radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
+
+       VkPipelineExecutableInternalRepresentationKHR *p = pInternalRepresentations;
+       VkPipelineExecutableInternalRepresentationKHR *end = p + (pInternalRepresentations ? *pInternalRepresentationCount : 0);
+       VkResult result = VK_SUCCESS;
+       /* optimized NIR */
+       if (p < end) {
+               p->isText = true;
+               desc_copy(p->name, "NIR Shader(s)");
+               desc_copy(p->description, "The optimized NIR shader(s)");
+               if (radv_copy_representation(p->pData, &p->dataSize, shader->nir_string) != VK_SUCCESS)
+                       result = VK_INCOMPLETE;
+       }
+       ++p;
+
+       /* LLVM IR */
+       if (p < end) {
+               p->isText = true;
+               desc_copy(p->name, "LLVM IR");
+               desc_copy(p->description, "The LLVM IR after some optimizations");
+               if (radv_copy_representation(p->pData, &p->dataSize, shader->llvm_ir_string) != VK_SUCCESS)
+                       result = VK_INCOMPLETE;
+       }
+       ++p;
+
+       /* Disassembler */
+       if (p < end) {
+               p->isText = true;
+               desc_copy(p->name, "Assembly");
+               desc_copy(p->description, "Final Assembly");
+               if (radv_copy_representation(p->pData, &p->dataSize, shader->disasm_string) != VK_SUCCESS)
+                       result = VK_INCOMPLETE;
+       }
+       ++p;
+
+       if (!pInternalRepresentations)
+               *pInternalRepresentationCount = p - pInternalRepresentations;
+       else if (p > end) {
+               result = VK_INCOMPLETE;
+               *pInternalRepresentationCount = end - pInternalRepresentations;
+       } else {
+               *pInternalRepresentationCount = p - pInternalRepresentations;
+       }
+
+       return result;
+}
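
Note the per-representation sizing that falls out of radv_copy_representation(): on the first filling call the application leaves each pData NULL so only dataSize is written, then allocates the blobs and calls again. A hedged caller-side sketch, reusing exec_info from the statistics example above and again assuming the entry point was resolved via vkGetDeviceProcAddr:

    uint32_t ir_count = 0;
    vkGetPipelineExecutableInternalRepresentationsKHR(device, &exec_info, &ir_count, NULL);
    VkPipelineExecutableInternalRepresentationKHR *irs = calloc(ir_count, sizeof(*irs));
    for (uint32_t i = 0; i < ir_count; i++)
        irs[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INTERNAL_REPRESENTATION_KHR;
    /* First pass: pData is NULL, so only each dataSize is filled in. */
    vkGetPipelineExecutableInternalRepresentationsKHR(device, &exec_info, &ir_count, irs);
    for (uint32_t i = 0; i < ir_count; i++)
        irs[i].pData = malloc(irs[i].dataSize);
    /* Second pass: the NIR, LLVM IR and disassembly strings are copied out. */
    vkGetPipelineExecutableInternalRepresentationsKHR(device, &exec_info, &ir_count, irs);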