Added a few more stubs so that control reaches DestroyDevice().
src/amd/vulkan/radv_shader.c (mesa.git)
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28 #include "util/mesa-sha1.h"
29 #include "util/u_atomic.h"
30 #include "radv_debug.h"
31 #include "radv_private.h"
32 #include "radv_shader.h"
33 #include "radv_shader_helper.h"
34 #include "radv_shader_args.h"
35 #include "nir/nir.h"
36 #include "nir/nir_builder.h"
37 #include "spirv/nir_spirv.h"
38
39 #include "sid.h"
40 #include "ac_binary.h"
41 #include "ac_llvm_util.h"
42 #include "ac_nir_to_llvm.h"
43 #include "ac_rtld.h"
44 #include "vk_format.h"
45 #include "util/debug.h"
46 #include "ac_exp_param.h"
47
48 #include "aco_interface.h"
49
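/* Per-backend NIR compiler options. RADV can compile through either the
 * LLVM backend or ACO, and each backend wants a slightly different set of
 * NIR lowerings, so two option tables are kept below and selected in
 * radv_shader_compile_to_nir() via radv_use_llvm_for_stage(). */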
50 static const struct nir_shader_compiler_options nir_options_llvm = {
51 .vertex_id_zero_based = true,
52 .lower_scmp = true,
53 .lower_flrp16 = true,
54 .lower_flrp32 = true,
55 .lower_flrp64 = true,
56 .lower_device_index_to_zero = true,
57 .lower_fsat = true,
58 .lower_fdiv = true,
59 .lower_fmod = true,
60 .lower_bitfield_insert_to_bitfield_select = true,
61 .lower_bitfield_extract = true,
62 .lower_sub = true,
63 .lower_pack_snorm_2x16 = true,
64 .lower_pack_snorm_4x8 = true,
65 .lower_pack_unorm_2x16 = true,
66 .lower_pack_unorm_4x8 = true,
67 .lower_unpack_snorm_2x16 = true,
68 .lower_unpack_snorm_4x8 = true,
69 .lower_unpack_unorm_2x16 = true,
70 .lower_unpack_unorm_4x8 = true,
71 .lower_extract_byte = true,
72 .lower_extract_word = true,
73 .lower_ffma = true,
74 .lower_fpow = true,
75 .lower_mul_2x32_64 = true,
76 .lower_rotate = true,
77 .use_scoped_barrier = true,
78 .max_unroll_iterations = 32,
79 .use_interpolated_input_intrinsics = true,
80 /* nir_lower_int64() isn't actually called for the LLVM backend, but
81 * this helps the loop unrolling heuristics. */
82 .lower_int64_options = nir_lower_imul64 |
83 nir_lower_imul_high64 |
84 nir_lower_imul_2x32_64 |
85 nir_lower_divmod64 |
86 nir_lower_minmax64 |
87 nir_lower_iabs64,
88 .lower_doubles_options = nir_lower_drcp |
89 nir_lower_dsqrt |
90 nir_lower_drsq |
91 nir_lower_ddiv,
92 };
93
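/* ACO-specific options. Compared to the LLVM table above, fsat and
 * subtraction lowering are left to ACO (lower_fsat/lower_sub are not set),
 * while unpack_half_2x16 is lowered in NIR instead. */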
94 static const struct nir_shader_compiler_options nir_options_aco = {
95 .vertex_id_zero_based = true,
96 .lower_scmp = true,
97 .lower_flrp16 = true,
98 .lower_flrp32 = true,
99 .lower_flrp64 = true,
100 .lower_device_index_to_zero = true,
101 .lower_fdiv = true,
102 .lower_fmod = true,
103 .lower_bitfield_insert_to_bitfield_select = true,
104 .lower_bitfield_extract = true,
105 .lower_pack_snorm_2x16 = true,
106 .lower_pack_snorm_4x8 = true,
107 .lower_pack_unorm_2x16 = true,
108 .lower_pack_unorm_4x8 = true,
109 .lower_unpack_snorm_2x16 = true,
110 .lower_unpack_snorm_4x8 = true,
111 .lower_unpack_unorm_2x16 = true,
112 .lower_unpack_unorm_4x8 = true,
113 .lower_unpack_half_2x16 = true,
114 .lower_extract_byte = true,
115 .lower_extract_word = true,
116 .lower_ffma = true,
117 .lower_fpow = true,
118 .lower_mul_2x32_64 = true,
119 .lower_rotate = true,
120 .use_scoped_barrier = true,
121 .max_unroll_iterations = 32,
122 .use_interpolated_input_intrinsics = true,
123 .lower_int64_options = nir_lower_imul64 |
124 nir_lower_imul_high64 |
125 nir_lower_imul_2x32_64 |
126 nir_lower_divmod64 |
127 nir_lower_minmax64 |
128 nir_lower_iabs64,
129 .lower_doubles_options = nir_lower_drcp |
130 nir_lower_dsqrt |
131 nir_lower_drsq |
132 nir_lower_ddiv,
133 };
134
135 bool
136 radv_can_dump_shader(struct radv_device *device,
137 struct radv_shader_module *module,
138 bool is_gs_copy_shader)
139 {
140 if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS))
141 return false;
142 if (module)
143 return !module->nir ||
144 (device->instance->debug_flags & RADV_DEBUG_DUMP_META_SHADERS);
145
146 return is_gs_copy_shader;
147 }
148
149 bool
150 radv_can_dump_shader_stats(struct radv_device *device,
151 struct radv_shader_module *module)
152 {
153 /* Only dump non-meta shader stats. */
154 return device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS &&
155 module && !module->nir;
156 }
157
158 VkResult radv_CreateShaderModule(
159 VkDevice _device,
160 const VkShaderModuleCreateInfo* pCreateInfo,
161 const VkAllocationCallbacks* pAllocator,
162 VkShaderModule* pShaderModule)
163 {
164 RADV_FROM_HANDLE(radv_device, device, _device);
165 struct radv_shader_module *module;
166
167 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO);
168 assert(pCreateInfo->flags == 0);
169
170 module = vk_alloc2(&device->vk.alloc, pAllocator,
171 sizeof(*module) + pCreateInfo->codeSize, 8,
172 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
173 if (module == NULL)
174 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
175
176 vk_object_base_init(&device->vk, &module->base,
177 VK_OBJECT_TYPE_SHADER_MODULE);
178
179 module->nir = NULL;
180 module->size = pCreateInfo->codeSize;
181 memcpy(module->data, pCreateInfo->pCode, module->size);
182
183 _mesa_sha1_compute(module->data, module->size, module->sha1);
184
185 *pShaderModule = radv_shader_module_to_handle(module);
186
187 return VK_SUCCESS;
188 }
189
190 void radv_DestroyShaderModule(
191 VkDevice _device,
192 VkShaderModule _module,
193 const VkAllocationCallbacks* pAllocator)
194 {
195 RADV_FROM_HANDLE(radv_device, device, _device);
196 RADV_FROM_HANDLE(radv_shader_module, module, _module);
197
198 if (!module)
199 return;
200
201 vk_object_base_finish(&module->base);
202 vk_free2(&device->vk.alloc, pAllocator, module);
203 }
204
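/* Common NIR optimization loop shared by both backends. The passes below
 * are repeated until none of them reports progress (or run only once when
 * optimize_conservatively is set). allow_copies must only be true for the
 * first invocation; see the comment on nir_opt_find_array_copies below. */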
205 void
206 radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively,
207 bool allow_copies)
208 {
209 bool progress;
210 unsigned lower_flrp =
211 (shader->options->lower_flrp16 ? 16 : 0) |
212 (shader->options->lower_flrp32 ? 32 : 0) |
213 (shader->options->lower_flrp64 ? 64 : 0);
214
215 do {
216 progress = false;
217
218 NIR_PASS(progress, shader, nir_split_array_vars, nir_var_function_temp);
219 NIR_PASS(progress, shader, nir_shrink_vec_array_vars, nir_var_function_temp);
220
221 NIR_PASS_V(shader, nir_lower_vars_to_ssa);
222 NIR_PASS_V(shader, nir_lower_pack);
223
224 if (allow_copies) {
225 /* Only run this pass in the first call to
226 * radv_optimize_nir. Later calls assume that we've
227 * lowered away any copy_deref instructions and we
228 * don't want to introduce any more.
229 */
230 NIR_PASS(progress, shader, nir_opt_find_array_copies);
231 }
232
233 NIR_PASS(progress, shader, nir_opt_copy_prop_vars);
234 NIR_PASS(progress, shader, nir_opt_dead_write_vars);
235 NIR_PASS(progress, shader, nir_remove_dead_variables,
236 nir_var_function_temp | nir_var_shader_in | nir_var_shader_out,
237 NULL);
238
239 NIR_PASS_V(shader, nir_lower_alu_to_scalar, NULL, NULL);
240 NIR_PASS_V(shader, nir_lower_phis_to_scalar);
241
242 NIR_PASS(progress, shader, nir_copy_prop);
243 NIR_PASS(progress, shader, nir_opt_remove_phis);
244 NIR_PASS(progress, shader, nir_opt_dce);
245 if (nir_opt_trivial_continues(shader)) {
246 progress = true;
247 NIR_PASS(progress, shader, nir_copy_prop);
248 NIR_PASS(progress, shader, nir_opt_remove_phis);
249 NIR_PASS(progress, shader, nir_opt_dce);
250 }
251 NIR_PASS(progress, shader, nir_opt_if, true);
252 NIR_PASS(progress, shader, nir_opt_dead_cf);
253 NIR_PASS(progress, shader, nir_opt_cse);
254 NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true);
255 NIR_PASS(progress, shader, nir_opt_constant_folding);
256 NIR_PASS(progress, shader, nir_opt_algebraic);
257
258 if (lower_flrp != 0) {
259 bool lower_flrp_progress = false;
260 NIR_PASS(lower_flrp_progress,
261 shader,
262 nir_lower_flrp,
263 lower_flrp,
264 false /* always_precise */);
265 if (lower_flrp_progress) {
266 NIR_PASS(progress, shader,
267 nir_opt_constant_folding);
268 progress = true;
269 }
270
271 /* Nothing should rematerialize any flrps, so we only
272 * need to do this lowering once.
273 */
274 lower_flrp = 0;
275 }
276
277 NIR_PASS(progress, shader, nir_opt_undef);
278 NIR_PASS(progress, shader, nir_opt_shrink_vectors);
279 if (shader->options->max_unroll_iterations) {
280 NIR_PASS(progress, shader, nir_opt_loop_unroll, 0);
281 }
282 } while (progress && !optimize_conservatively);
283
284 NIR_PASS(progress, shader, nir_opt_conditional_discard);
285 NIR_PASS(progress, shader, nir_opt_move, nir_move_load_ubo);
286 }
287
288 static void
289 shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
290 {
291 assert(glsl_type_is_vector_or_scalar(type));
292
293 uint32_t comp_size = glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
294 unsigned length = glsl_get_vector_elements(type);
295 *size = comp_size * length;
296 *align = comp_size;
297 }
298
299 struct radv_shader_debug_data {
300 struct radv_device *device;
301 const struct radv_shader_module *module;
302 };
303
304 static void radv_spirv_nir_debug(void *private_data,
305 enum nir_spirv_debug_level level,
306 size_t spirv_offset,
307 const char *message)
308 {
309 struct radv_shader_debug_data *debug_data = private_data;
310 struct radv_instance *instance = debug_data->device->instance;
311
312 static const VkDebugReportFlagsEXT vk_flags[] = {
313 [NIR_SPIRV_DEBUG_LEVEL_INFO] = VK_DEBUG_REPORT_INFORMATION_BIT_EXT,
314 [NIR_SPIRV_DEBUG_LEVEL_WARNING] = VK_DEBUG_REPORT_WARNING_BIT_EXT,
315 [NIR_SPIRV_DEBUG_LEVEL_ERROR] = VK_DEBUG_REPORT_ERROR_BIT_EXT,
316 };
317 char buffer[256];
318
319 snprintf(buffer, sizeof(buffer), "SPIR-V offset %lu: %s",
320 (unsigned long)spirv_offset, message);
321
322 vk_debug_report(&instance->debug_report_callbacks,
323 vk_flags[level],
324 VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT,
325 (uint64_t)(uintptr_t)debug_data->module,
326 0, 0, "radv", buffer);
327 }
328
329 static void radv_compiler_debug(void *private_data,
330 enum radv_compiler_debug_level level,
331 const char *message)
332 {
333 struct radv_shader_debug_data *debug_data = private_data;
334 struct radv_instance *instance = debug_data->device->instance;
335
336 static const VkDebugReportFlagsEXT vk_flags[] = {
337 [RADV_COMPILER_DEBUG_LEVEL_PERFWARN] = VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT,
338 [RADV_COMPILER_DEBUG_LEVEL_ERROR] = VK_DEBUG_REPORT_ERROR_BIT_EXT,
339 };
340
341 /* VK_DEBUG_REPORT_DEBUG_BIT_EXT specifies diagnostic information
342 * from the implementation and layers.
343 */
344 vk_debug_report(&instance->debug_report_callbacks,
345 vk_flags[level] | VK_DEBUG_REPORT_DEBUG_BIT_EXT,
346 VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT,
347 (uint64_t)(uintptr_t)debug_data->module,
348 0, 0, "radv", message);
349 }
350
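/* Builds the NIR for one pipeline stage. Internal (meta) shaders hand us a
 * ready-made NIR shader in module->nir; everything else goes through
 * spirv_to_nir() with the specialization constants applied, followed by the
 * lowering and optimization passes that both backends rely on. */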
351 nir_shader *
352 radv_shader_compile_to_nir(struct radv_device *device,
353 struct radv_shader_module *module,
354 const char *entrypoint_name,
355 gl_shader_stage stage,
356 const VkSpecializationInfo *spec_info,
357 const VkPipelineCreateFlags flags,
358 const struct radv_pipeline_layout *layout,
359 unsigned subgroup_size, unsigned ballot_bit_size)
360 {
361 nir_shader *nir;
362 const nir_shader_compiler_options *nir_options =
363 radv_use_llvm_for_stage(device, stage) ? &nir_options_llvm : &nir_options_aco;
364
365 if (module->nir) {
366 /* Some things, such as our meta clear/blit code, will give us a NIR
367 * shader directly. In that case, we ignore the SPIR-V entirely and
368 * just use the NIR shader. */
369 nir = module->nir;
370 nir->options = nir_options;
371 nir_validate_shader(nir, "in internal shader");
372
373 assert(exec_list_length(&nir->functions) == 1);
374 } else {
375 uint32_t *spirv = (uint32_t *) module->data;
376 assert(module->size % 4 == 0);
377
378 if (device->instance->debug_flags & RADV_DEBUG_DUMP_SPIRV)
379 radv_print_spirv(module->data, module->size, stderr);
380
381 uint32_t num_spec_entries = 0;
382 struct nir_spirv_specialization *spec_entries = NULL;
383 if (spec_info && spec_info->mapEntryCount > 0) {
384 num_spec_entries = spec_info->mapEntryCount;
385 spec_entries = calloc(num_spec_entries, sizeof(*spec_entries));
386 for (uint32_t i = 0; i < num_spec_entries; i++) {
387 VkSpecializationMapEntry entry = spec_info->pMapEntries[i];
388 const void *data = spec_info->pData + entry.offset;
389 assert(data + entry.size <= spec_info->pData + spec_info->dataSize);
390
391 spec_entries[i].id = spec_info->pMapEntries[i].constantID;
392 switch (entry.size) {
393 case 8:
394 spec_entries[i].value.u64 = *(const uint64_t *)data;
395 break;
396 case 4:
397 spec_entries[i].value.u32 = *(const uint32_t *)data;
398 break;
399 case 2:
400 spec_entries[i].value.u16 = *(const uint16_t *)data;
401 break;
402 case 1:
403 spec_entries[i].value.u8 = *(const uint8_t *)data;
404 break;
405 default:
406 assert(!"Invalid spec constant size");
407 break;
408 }
409 }
410 }
411
412 struct radv_shader_debug_data spirv_debug_data = {
413 .device = device,
414 .module = module,
415 };
416 const struct spirv_to_nir_options spirv_options = {
417 .lower_ubo_ssbo_access_to_offsets = true,
418 .caps = {
419 .amd_fragment_mask = true,
420 .amd_gcn_shader = true,
421 .amd_image_gather_bias_lod = true,
422 .amd_image_read_write_lod = true,
423 .amd_shader_ballot = true,
424 .amd_shader_explicit_vertex_parameter = true,
425 .amd_trinary_minmax = true,
426 .demote_to_helper_invocation = true,
427 .derivative_group = true,
428 .descriptor_array_dynamic_indexing = true,
429 .descriptor_array_non_uniform_indexing = true,
430 .descriptor_indexing = true,
431 .device_group = true,
432 .draw_parameters = true,
433 .float_controls = true,
434 .float16 = device->physical_device->rad_info.has_packed_math_16bit,
435 .float32_atomic_add = true,
436 .float64 = true,
437 .geometry_streams = true,
438 .image_ms_array = true,
439 .image_read_without_format = true,
440 .image_write_without_format = true,
441 .int8 = true,
442 .int16 = true,
443 .int64 = true,
444 .int64_atomics = true,
445 .min_lod = true,
446 .multiview = true,
447 .physical_storage_buffer_address = true,
448 .post_depth_coverage = true,
449 .runtime_descriptor_array = true,
450 .shader_clock = true,
451 .shader_viewport_index_layer = true,
452 .stencil_export = true,
453 .storage_8bit = true,
454 .storage_16bit = true,
455 .storage_image_ms = true,
456 .subgroup_arithmetic = true,
457 .subgroup_ballot = true,
458 .subgroup_basic = true,
459 .subgroup_quad = true,
460 .subgroup_shuffle = true,
461 .subgroup_vote = true,
462 .tessellation = true,
463 .transform_feedback = true,
464 .variable_pointers = true,
465 .vk_memory_model = true,
466 .vk_memory_model_device_scope = true,
467 },
468 .ubo_addr_format = nir_address_format_32bit_index_offset,
469 .ssbo_addr_format = nir_address_format_32bit_index_offset,
470 .phys_ssbo_addr_format = nir_address_format_64bit_global,
471 .push_const_addr_format = nir_address_format_logical,
472 .shared_addr_format = nir_address_format_32bit_offset,
473 .frag_coord_is_sysval = true,
474 .debug = {
475 .func = radv_spirv_nir_debug,
476 .private_data = &spirv_debug_data,
477 },
478 };
479 nir = spirv_to_nir(spirv, module->size / 4,
480 spec_entries, num_spec_entries,
481 stage, entrypoint_name,
482 &spirv_options, nir_options);
483 assert(nir->info.stage == stage);
484 nir_validate_shader(nir, "after spirv_to_nir");
485
486 free(spec_entries);
487
488 /* We have to lower away local constant initializers right before we
489 * inline functions. That way they get properly initialized at the top
490 * of the function and not at the top of its caller.
491 */
492 NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
493 NIR_PASS_V(nir, nir_lower_returns);
494 NIR_PASS_V(nir, nir_inline_functions);
495 NIR_PASS_V(nir, nir_copy_prop);
496 NIR_PASS_V(nir, nir_opt_deref);
497
498 /* Pick off the single entrypoint that we want */
499 foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
500 if (func->is_entrypoint)
501 func->name = ralloc_strdup(func, "main");
502 else
503 exec_node_remove(&func->node);
504 }
505 assert(exec_list_length(&nir->functions) == 1);
506
507 /* Make sure we lower constant initializers on output variables so that
508 * nir_remove_dead_variables below sees the corresponding stores
509 */
510 NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_shader_out);
511
512 /* Now that we've deleted all but the main function, we can go ahead and
513 * lower the rest of the constant initializers.
514 */
515 NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
516
517 /* Split member structs. We do this before lower_io_to_temporaries so that
518 * it doesn't lower system values to temporaries by accident.
519 */
520 NIR_PASS_V(nir, nir_split_var_copies);
521 NIR_PASS_V(nir, nir_split_per_member_structs);
522
523 if (nir->info.stage == MESA_SHADER_FRAGMENT &&
524 !radv_use_llvm_for_stage(device, nir->info.stage))
525 NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out);
526 if (nir->info.stage == MESA_SHADER_FRAGMENT)
527 NIR_PASS_V(nir, nir_lower_input_attachments,
528 &(nir_input_attachment_options) {
529 .use_fragcoord_sysval = true,
530 .use_layer_id_sysval = false,
531 });
532
533 NIR_PASS_V(nir, nir_remove_dead_variables,
534 nir_var_shader_in | nir_var_shader_out | nir_var_system_value | nir_var_mem_shared,
535 NULL);
536
537 NIR_PASS_V(nir, nir_propagate_invariant);
538
539 NIR_PASS_V(nir, nir_lower_system_values);
540 NIR_PASS_V(nir, nir_lower_compute_system_values, NULL);
541
542 NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);
543
544 if (device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE)
545 NIR_PASS_V(nir, nir_lower_discard_to_demote);
546
547 nir_lower_doubles_options lower_doubles =
548 nir->options->lower_doubles_options;
549
550 if (device->physical_device->rad_info.chip_class == GFX6) {
551 /* GFX6 doesn't support v_floor_f64, and the precision of
552 * v_fract_f64, which is used to implement the 64-bit floor,
553 * is less than what Vulkan requires.
554 */
555 lower_doubles |= nir_lower_dfloor;
556 }
557
558 NIR_PASS_V(nir, nir_lower_doubles, NULL, lower_doubles);
559 }
560
561 /* Vulkan uses the separate-shader linking model */
562 nir->info.separate_shader = true;
563
564 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
565
566 if (nir->info.stage == MESA_SHADER_GEOMETRY)
567 nir_lower_gs_intrinsics(nir, true);
568
569 static const nir_lower_tex_options tex_options = {
570 .lower_txp = ~0,
571 .lower_tg4_offsets = true,
572 };
573
574 nir_lower_tex(nir, &tex_options);
575
576 nir_lower_vars_to_ssa(nir);
577
578 if (nir->info.stage == MESA_SHADER_VERTEX ||
579 nir->info.stage == MESA_SHADER_GEOMETRY ||
580 nir->info.stage == MESA_SHADER_FRAGMENT) {
581 NIR_PASS_V(nir, nir_lower_io_to_temporaries,
582 nir_shader_get_entrypoint(nir), true, true);
583 } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
584 NIR_PASS_V(nir, nir_lower_io_to_temporaries,
585 nir_shader_get_entrypoint(nir), true, false);
586 }
587
588 nir_split_var_copies(nir);
589
590 nir_lower_global_vars_to_local(nir);
591 nir_remove_dead_variables(nir, nir_var_function_temp, NULL);
592 bool gfx7minus = device->physical_device->rad_info.chip_class <= GFX7;
593 nir_lower_subgroups(nir, &(struct nir_lower_subgroups_options) {
594 .subgroup_size = subgroup_size,
595 .ballot_bit_size = ballot_bit_size,
596 .lower_to_scalar = 1,
597 .lower_subgroup_masks = 1,
598 .lower_shuffle = 1,
599 .lower_shuffle_to_32bit = 1,
600 .lower_vote_eq_to_ballot = 1,
601 .lower_quad_broadcast_dynamic = 1,
602 .lower_quad_broadcast_dynamic_to_const = gfx7minus,
603 .lower_shuffle_to_swizzle_amd = 1,
604 });
605
606 nir_lower_load_const_to_scalar(nir);
607
608 if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT))
609 radv_optimize_nir(nir, false, true);
610
611 /* Call radv_nir_lower_ycbcr_textures() late, as there might still be tex
612 * instructions with an undefined texture/sampler before the first optimization. */
613 NIR_PASS_V(nir, radv_nir_lower_ycbcr_textures, layout);
614
615 /* We call nir_lower_var_copies() after the first radv_optimize_nir()
616 * to remove any copies introduced by nir_opt_find_array_copies().
617 */
618 nir_lower_var_copies(nir);
619
620 /* Lower deref operations for compute shared memory. */
621 if (nir->info.stage == MESA_SHADER_COMPUTE) {
622 NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
623 nir_var_mem_shared, shared_var_info);
624 NIR_PASS_V(nir, nir_lower_explicit_io,
625 nir_var_mem_shared, nir_address_format_32bit_offset);
626 }
627
628 /* Lower large variables that are always constant with load_constant
629 * intrinsics, which get turned into PC-relative loads from a data
630 * section next to the shader.
631 */
632 NIR_PASS_V(nir, nir_opt_large_constants,
633 glsl_get_natural_size_align_bytes, 16);
634
635 /* Indirect lowering must be called after the radv_optimize_nir() loop
636 * has been called at least once. Otherwise indirect lowering can
637 * bloat the instruction count of the loop and cause it to be
638 * considered too large for unrolling.
639 */
640 ac_lower_indirect_derefs(nir, device->physical_device->rad_info.chip_class);
641 radv_optimize_nir(nir, flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT, false);
642
643 return nir;
644 }
645
646 static int
647 type_size_vec4(const struct glsl_type *type, bool bindless)
648 {
649 return glsl_count_attribute_slots(type, false);
650 }
651
652 static nir_variable *
653 find_layer_in_var(nir_shader *nir)
654 {
655 nir_variable *var =
656 nir_find_variable_with_location(nir, nir_var_shader_in, VARYING_SLOT_LAYER);
657 if (var != NULL)
658 return var;
659
660 var = nir_variable_create(nir, nir_var_shader_in, glsl_int_type(), "layer id");
661 var->data.location = VARYING_SLOT_LAYER;
662 var->data.interpolation = INTERP_MODE_FLAT;
663 return var;
664 }
665
666 /* We use layered rendering to implement multiview, which means we need to map
667 * view_index to gl_Layer. The code generates a load from the layer_id sysval,
668 * but since we don't have a way to get at this information from the fragment
669 * shader, we also need to lower this to the gl_Layer varying. This pass
670 * lowers both to a varying load from the LAYER slot, before lowering io, so
671 * that nir_assign_var_locations() will give the LAYER varying the correct
672 * driver_location.
673 */
674
675 static bool
676 lower_view_index(nir_shader *nir)
677 {
678 bool progress = false;
679 nir_function_impl *entry = nir_shader_get_entrypoint(nir);
680 nir_builder b;
681 nir_builder_init(&b, entry);
682
683 nir_variable *layer = NULL;
684 nir_foreach_block(block, entry) {
685 nir_foreach_instr_safe(instr, block) {
686 if (instr->type != nir_instr_type_intrinsic)
687 continue;
688
689 nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
690 if (load->intrinsic != nir_intrinsic_load_view_index)
691 continue;
692
693 if (!layer)
694 layer = find_layer_in_var(nir);
695
696 b.cursor = nir_before_instr(instr);
697 nir_ssa_def *def = nir_load_var(&b, layer);
698 nir_ssa_def_rewrite_uses(&load->dest.ssa,
699 nir_src_for_ssa(def));
700
701 nir_instr_remove(instr);
702 progress = true;
703 }
704 }
705
706 return progress;
707 }
708
709 void
710 radv_lower_fs_io(nir_shader *nir)
711 {
712 NIR_PASS_V(nir, lower_view_index);
713 nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
714 MESA_SHADER_FRAGMENT);
715
716 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0);
717
718 /* This pass needs actual constants */
719 nir_opt_constant_folding(nir);
720
721 NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in);
722 }
723
724
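/* Shader code is sub-allocated from slabs of GPU memory rather than giving
 * each variant its own buffer. Under shader_slab_mutex we do a first-fit
 * scan of the existing slabs (offsets aligned to 256 bytes); if nothing
 * fits, a new slab of at least 256 KiB is created and mapped. Returns the
 * CPU pointer where the code should be copied. */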
725 static void *
726 radv_alloc_shader_memory(struct radv_device *device,
727 struct radv_shader_variant *shader)
728 {
729 mtx_lock(&device->shader_slab_mutex);
730 list_for_each_entry(struct radv_shader_slab, slab, &device->shader_slabs, slabs) {
731 uint64_t offset = 0;
732 list_for_each_entry(struct radv_shader_variant, s, &slab->shaders, slab_list) {
733 if (s->bo_offset - offset >= shader->code_size) {
734 shader->bo = slab->bo;
735 shader->bo_offset = offset;
736 list_addtail(&shader->slab_list, &s->slab_list);
737 mtx_unlock(&device->shader_slab_mutex);
738 return slab->ptr + offset;
739 }
740 offset = align_u64(s->bo_offset + s->code_size, 256);
741 }
742 if (offset <= slab->size && slab->size - offset >= shader->code_size) {
743 shader->bo = slab->bo;
744 shader->bo_offset = offset;
745 list_addtail(&shader->slab_list, &slab->shaders);
746 mtx_unlock(&device->shader_slab_mutex);
747 return slab->ptr + offset;
748 }
749 }
750
751 mtx_unlock(&device->shader_slab_mutex);
752 struct radv_shader_slab *slab = calloc(1, sizeof(struct radv_shader_slab));
753
754 slab->size = MAX2(256 * 1024, shader->code_size);
755 slab->bo = device->ws->buffer_create(device->ws, slab->size, 256,
756 RADEON_DOMAIN_VRAM,
757 RADEON_FLAG_NO_INTERPROCESS_SHARING |
758 (device->physical_device->rad_info.cpdma_prefetch_writes_memory ?
759 0 : RADEON_FLAG_READ_ONLY),
760 RADV_BO_PRIORITY_SHADER);
761 if (!slab->bo) {
762 free(slab);
763 return NULL;
764 }
765
766 slab->ptr = (char*)device->ws->buffer_map(slab->bo);
767 if (!slab->ptr) {
768 device->ws->buffer_destroy(slab->bo);
769 free(slab);
770 return NULL;
771 }
772
773 list_inithead(&slab->shaders);
774
775 mtx_lock(&device->shader_slab_mutex);
776 list_add(&slab->slabs, &device->shader_slabs);
777
778 shader->bo = slab->bo;
779 shader->bo_offset = 0;
780 list_add(&shader->slab_list, &slab->shaders);
781 mtx_unlock(&device->shader_slab_mutex);
782 return slab->ptr;
783 }
784
785 void
786 radv_destroy_shader_slabs(struct radv_device *device)
787 {
788 list_for_each_entry_safe(struct radv_shader_slab, slab, &device->shader_slabs, slabs) {
789 device->ws->buffer_destroy(slab->bo);
790 free(slab);
791 }
792 mtx_destroy(&device->shader_slab_mutex);
793 }
794
795 /* For the UMR disassembler. */
796 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
797 #define DEBUGGER_NUM_MARKERS 5
798
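/* The binary size is padded with DEBUGGER_NUM_MARKERS dwords so that the
 * end-of-code markers written in radv_shader_variant_create() fit after
 * the actual code (code_size + 5 * 4 bytes with the current marker count). */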
799 static unsigned
800 radv_get_shader_binary_size(size_t code_size)
801 {
802 return code_size + DEBUGGER_NUM_MARKERS * 4;
803 }
804
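/* Turns the compiler-produced ac_shader_config into the final values of the
 * SPI_SHADER_PGM_RSRC* registers: VGPR/SGPR counts, user SGPRs, scratch and
 * trap-handler enables, and the per-stage VGPR_COMP_CNT / LDS fields, which
 * depend on the stage, the chip class and whether NGG is used. */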
805 static void radv_postprocess_config(const struct radv_device *device,
806 const struct ac_shader_config *config_in,
807 const struct radv_shader_info *info,
808 gl_shader_stage stage,
809 struct ac_shader_config *config_out)
810 {
811 const struct radv_physical_device *pdevice = device->physical_device;
812 bool scratch_enabled = config_in->scratch_bytes_per_wave > 0;
813 bool trap_enabled = !!device->trap_handler_shader;
814 unsigned vgpr_comp_cnt = 0;
815 unsigned num_input_vgprs = info->num_input_vgprs;
816
817 if (stage == MESA_SHADER_FRAGMENT) {
818 num_input_vgprs = ac_get_fs_input_vgpr_cnt(config_in, NULL, NULL);
819 }
820
821 unsigned num_vgprs = MAX2(config_in->num_vgprs, num_input_vgprs);
822 /* +3 for scratch wave offset and VCC */
823 unsigned num_sgprs = MAX2(config_in->num_sgprs, info->num_input_sgprs + 3);
824 unsigned num_shared_vgprs = config_in->num_shared_vgprs;
825 /* shared VGPRs are introduced in Navi and are allocated in blocks of 8 (RDNA ref 3.6.5) */
826 assert((pdevice->rad_info.chip_class >= GFX10 && num_shared_vgprs % 8 == 0)
827 || (pdevice->rad_info.chip_class < GFX10 && num_shared_vgprs == 0));
828 unsigned num_shared_vgpr_blocks = num_shared_vgprs / 8;
829 unsigned excp_en = 0;
830
831 *config_out = *config_in;
832 config_out->num_vgprs = num_vgprs;
833 config_out->num_sgprs = num_sgprs;
834 config_out->num_shared_vgprs = num_shared_vgprs;
835
836 config_out->rsrc2 = S_00B12C_USER_SGPR(info->num_user_sgprs) |
837 S_00B12C_SCRATCH_EN(scratch_enabled) |
838 S_00B12C_TRAP_PRESENT(trap_enabled);
839
840 if (trap_enabled) {
841 /* Configure the shader exceptions like memory violation, etc.
842 * TODO: Enable (and validate) more exceptions.
843 */
844 excp_en = 1 << 8; /* mem_viol */
845 }
846
847 if (!pdevice->use_ngg_streamout) {
848 config_out->rsrc2 |= S_00B12C_SO_BASE0_EN(!!info->so.strides[0]) |
849 S_00B12C_SO_BASE1_EN(!!info->so.strides[1]) |
850 S_00B12C_SO_BASE2_EN(!!info->so.strides[2]) |
851 S_00B12C_SO_BASE3_EN(!!info->so.strides[3]) |
852 S_00B12C_SO_EN(!!info->so.num_outputs);
853 }
854
855 config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) /
856 (info->wave_size == 32 ? 8 : 4)) |
857 S_00B848_DX10_CLAMP(1) |
858 S_00B848_FLOAT_MODE(config_out->float_mode);
859
860 if (pdevice->rad_info.chip_class >= GFX10) {
861 config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(info->num_user_sgprs >> 5);
862 } else {
863 config_out->rsrc1 |= S_00B228_SGPRS((num_sgprs - 1) / 8);
864 config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(info->num_user_sgprs >> 5);
865 }
866
867 switch (stage) {
868 case MESA_SHADER_TESS_EVAL:
869 if (info->is_ngg) {
870 config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
871 config_out->rsrc2 |= S_00B22C_OC_LDS_EN(1) |
872 S_00B22C_EXCP_EN(excp_en);
873 } else if (info->tes.as_es) {
874 assert(pdevice->rad_info.chip_class <= GFX8);
875 vgpr_comp_cnt = info->uses_prim_id ? 3 : 2;
876
877 config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) |
878 S_00B12C_EXCP_EN(excp_en);
879 } else {
880 bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id;
881 vgpr_comp_cnt = enable_prim_id ? 3 : 2;
882
883 config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
884 config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) |
885 S_00B12C_EXCP_EN(excp_en);
886 }
887 config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
888 break;
889 case MESA_SHADER_TESS_CTRL:
890 if (pdevice->rad_info.chip_class >= GFX9) {
891 /* We need at least 2 components for LS.
892 * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
893 * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
894 */
895 if (pdevice->rad_info.chip_class >= GFX10) {
896 vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 1;
897 config_out->rsrc2 |= S_00B42C_LDS_SIZE_GFX10(info->tcs.num_lds_blocks) |
898 S_00B42C_EXCP_EN_GFX6(excp_en);
899 } else {
900 vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1;
901 config_out->rsrc2 |= S_00B42C_LDS_SIZE_GFX9(info->tcs.num_lds_blocks) |
902 S_00B42C_EXCP_EN_GFX9(excp_en);
903 }
904 } else {
905 config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) |
906 S_00B12C_EXCP_EN(excp_en);
907 }
908 config_out->rsrc1 |= S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) |
909 S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10);
910 config_out->rsrc2 |= S_00B42C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
911 break;
912 case MESA_SHADER_VERTEX:
913 if (info->is_ngg) {
914 config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
915 } else if (info->vs.as_ls) {
916 assert(pdevice->rad_info.chip_class <= GFX8);
917 /* We need at least 2 components for LS.
918 * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
919 * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
920 */
921 vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1;
922 } else if (info->vs.as_es) {
923 assert(pdevice->rad_info.chip_class <= GFX8);
924 /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
925 vgpr_comp_cnt = info->vs.needs_instance_id ? 1 : 0;
926 } else {
927 /* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID)
928 * If PrimID is disabled, InstanceID / StepRate1 is loaded instead.
929 * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
930 */
931 if (info->vs.needs_instance_id && pdevice->rad_info.chip_class >= GFX10) {
932 vgpr_comp_cnt = 3;
933 } else if (info->vs.export_prim_id) {
934 vgpr_comp_cnt = 2;
935 } else if (info->vs.needs_instance_id) {
936 vgpr_comp_cnt = 1;
937 } else {
938 vgpr_comp_cnt = 0;
939 }
940
941 config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
942 }
943 config_out->rsrc2 |= S_00B12C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) |
944 S_00B12C_EXCP_EN(excp_en);
945 break;
946 case MESA_SHADER_FRAGMENT:
947 config_out->rsrc1 |= S_00B028_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
948 config_out->rsrc2 |= S_00B02C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) |
949 S_00B02C_TRAP_PRESENT(1) |
950 S_00B02C_EXCP_EN(excp_en);
951 break;
952 case MESA_SHADER_GEOMETRY:
953 config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) |
954 S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10);
955 config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) |
956 S_00B22C_EXCP_EN(excp_en);
957 break;
958 case MESA_SHADER_COMPUTE:
959 config_out->rsrc1 |= S_00B848_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) |
960 S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10);
961 config_out->rsrc2 |=
962 S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) |
963 S_00B84C_TGID_Y_EN(info->cs.uses_block_id[1]) |
964 S_00B84C_TGID_Z_EN(info->cs.uses_block_id[2]) |
965 S_00B84C_TIDIG_COMP_CNT(info->cs.uses_thread_id[2] ? 2 :
966 info->cs.uses_thread_id[1] ? 1 : 0) |
967 S_00B84C_TG_SIZE_EN(info->cs.uses_local_invocation_idx) |
968 S_00B84C_LDS_SIZE(config_in->lds_size) |
969 S_00B84C_EXCP_EN(excp_en);
970 config_out->rsrc3 |= S_00B8A0_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
971
972 break;
973 default:
974 unreachable("unsupported shader type");
975 break;
976 }
977
978 if (pdevice->rad_info.chip_class >= GFX10 && info->is_ngg &&
979 (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_GEOMETRY)) {
980 unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
981 gl_shader_stage es_stage = stage;
982 if (stage == MESA_SHADER_GEOMETRY)
983 es_stage = info->gs.es_type;
984
985 /* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */
986 if (es_stage == MESA_SHADER_VERTEX) {
987 es_vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 0;
988 } else if (es_stage == MESA_SHADER_TESS_EVAL) {
989 bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id;
990 es_vgpr_comp_cnt = enable_prim_id ? 3 : 2;
991 } else
992 unreachable("Unexpected ES shader stage");
993
994 bool tes_triangles = stage == MESA_SHADER_TESS_EVAL &&
995 info->tes.primitive_mode >= 4; /* GL_TRIANGLES */
996 if (info->uses_invocation_id || stage == MESA_SHADER_VERTEX) {
997 gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
998 } else if (info->uses_prim_id) {
999 gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
1000 } else if (info->gs.vertices_in >= 3 || tes_triangles) {
1001 gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
1002 } else {
1003 gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
1004 }
1005
1006 config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt) |
1007 S_00B228_WGP_MODE(1);
1008 config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
1009 S_00B22C_LDS_SIZE(config_in->lds_size) |
1010 S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL);
1011 } else if (pdevice->rad_info.chip_class >= GFX9 &&
1012 stage == MESA_SHADER_GEOMETRY) {
1013 unsigned es_type = info->gs.es_type;
1014 unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
1015
1016 if (es_type == MESA_SHADER_VERTEX) {
1017 /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
1018 if (info->vs.needs_instance_id) {
1019 es_vgpr_comp_cnt = pdevice->rad_info.chip_class >= GFX10 ? 3 : 1;
1020 } else {
1021 es_vgpr_comp_cnt = 0;
1022 }
1023 } else if (es_type == MESA_SHADER_TESS_EVAL) {
1024 es_vgpr_comp_cnt = info->uses_prim_id ? 3 : 2;
1025 } else {
1026 unreachable("invalid shader ES type");
1027 }
1028
1029 /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
1030 * VGPR[0:4] are always loaded.
1031 */
1032 if (info->uses_invocation_id) {
1033 gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
1034 } else if (info->uses_prim_id) {
1035 gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
1036 } else if (info->gs.vertices_in >= 3) {
1037 gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
1038 } else {
1039 gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
1040 }
1041
1042 config_out->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
1043 config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
1044 S_00B22C_OC_LDS_EN(es_type == MESA_SHADER_TESS_EVAL);
1045 } else if (pdevice->rad_info.chip_class >= GFX9 &&
1046 stage == MESA_SHADER_TESS_CTRL) {
1047 config_out->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt);
1048 } else {
1049 config_out->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt);
1050 }
1051 }
1052
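/* Creates a shader variant from a compiled binary: reads back the hardware
 * config, post-processes it, sub-allocates GPU memory and uploads the code.
 * RTLD binaries are relocatable ELF objects resolved through ac_rtld;
 * legacy binaries are raw machine code followed by optional statistics,
 * IR and disassembly blobs. */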
1053 struct radv_shader_variant *
1054 radv_shader_variant_create(struct radv_device *device,
1055 const struct radv_shader_binary *binary,
1056 bool keep_shader_info)
1057 {
1058 struct ac_shader_config config = {0};
1059 struct ac_rtld_binary rtld_binary = {0};
1060 struct radv_shader_variant *variant = calloc(1, sizeof(struct radv_shader_variant));
1061 if (!variant)
1062 return NULL;
1063
1064 variant->ref_count = 1;
1065
1066 if (binary->type == RADV_BINARY_TYPE_RTLD) {
1067 struct ac_rtld_symbol lds_symbols[2];
1068 unsigned num_lds_symbols = 0;
1069 const char *elf_data = (const char *)((struct radv_shader_binary_rtld *)binary)->data;
1070 size_t elf_size = ((struct radv_shader_binary_rtld *)binary)->elf_size;
1071
1072 if (device->physical_device->rad_info.chip_class >= GFX9 &&
1073 (binary->stage == MESA_SHADER_GEOMETRY || binary->info.is_ngg) &&
1074 !binary->is_gs_copy_shader) {
1075 /* We add this symbol even on LLVM <= 8 to ensure that
1076 * shader->config.lds_size is set correctly below.
1077 */
1078 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
1079 sym->name = "esgs_ring";
1080 sym->size = binary->info.ngg_info.esgs_ring_size;
1081 sym->align = 64 * 1024;
1082 }
1083
1084 if (binary->info.is_ngg &&
1085 binary->stage == MESA_SHADER_GEOMETRY) {
1086 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
1087 sym->name = "ngg_emit";
1088 sym->size = binary->info.ngg_info.ngg_emit_size * 4;
1089 sym->align = 4;
1090 }
1091
1092 struct ac_rtld_open_info open_info = {
1093 .info = &device->physical_device->rad_info,
1094 .shader_type = binary->stage,
1095 .wave_size = binary->info.wave_size,
1096 .num_parts = 1,
1097 .elf_ptrs = &elf_data,
1098 .elf_sizes = &elf_size,
1099 .num_shared_lds_symbols = num_lds_symbols,
1100 .shared_lds_symbols = lds_symbols,
1101 };
1102
1103 if (!ac_rtld_open(&rtld_binary, open_info)) {
1104 free(variant);
1105 return NULL;
1106 }
1107
1108 if (!ac_rtld_read_config(&device->physical_device->rad_info,
1109 &rtld_binary, &config)) {
1110 ac_rtld_close(&rtld_binary);
1111 free(variant);
1112 return NULL;
1113 }
1114
1115 if (rtld_binary.lds_size > 0) {
1116 unsigned alloc_granularity = device->physical_device->rad_info.chip_class >= GFX7 ? 512 : 256;
1117 config.lds_size = align(rtld_binary.lds_size, alloc_granularity) / alloc_granularity;
1118 }
1119
1120 variant->code_size = rtld_binary.rx_size;
1121 variant->exec_size = rtld_binary.exec_size;
1122 } else {
1123 assert(binary->type == RADV_BINARY_TYPE_LEGACY);
1124 config = ((struct radv_shader_binary_legacy *)binary)->config;
1125 variant->code_size = radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size);
1126 variant->exec_size = ((struct radv_shader_binary_legacy *)binary)->exec_size;
1127 }
1128
1129 variant->info = binary->info;
1130 radv_postprocess_config(device, &config, &binary->info,
1131 binary->stage, &variant->config);
1132
1133 void *dest_ptr = radv_alloc_shader_memory(device, variant);
1134 if (!dest_ptr) {
1135 if (binary->type == RADV_BINARY_TYPE_RTLD)
1136 ac_rtld_close(&rtld_binary);
1137 free(variant);
1138 return NULL;
1139 }
1140
1141 if (binary->type == RADV_BINARY_TYPE_RTLD) {
1142 struct radv_shader_binary_rtld* bin = (struct radv_shader_binary_rtld *)binary;
1143 struct ac_rtld_upload_info info = {
1144 .binary = &rtld_binary,
1145 .rx_va = radv_buffer_get_va(variant->bo) + variant->bo_offset,
1146 .rx_ptr = dest_ptr,
1147 };
1148
1149 if (!ac_rtld_upload(&info)) {
1150 radv_shader_variant_destroy(device, variant);
1151 ac_rtld_close(&rtld_binary);
1152 return NULL;
1153 }
1154
1155 if (keep_shader_info ||
1156 (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) {
1157 const char *disasm_data;
1158 size_t disasm_size;
1159 if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm_data, &disasm_size)) {
1160 radv_shader_variant_destroy(device, variant);
1161 ac_rtld_close(&rtld_binary);
1162 return NULL;
1163 }
1164
1165 variant->ir_string = bin->llvm_ir_size ? strdup((const char*)(bin->data + bin->elf_size)) : NULL;
1166 variant->disasm_string = malloc(disasm_size + 1);
1167 memcpy(variant->disasm_string, disasm_data, disasm_size);
1168 variant->disasm_string[disasm_size] = 0;
1169 }
1170
1171 ac_rtld_close(&rtld_binary);
1172 } else {
1173 struct radv_shader_binary_legacy* bin = (struct radv_shader_binary_legacy *)binary;
1174 memcpy(dest_ptr, bin->data + bin->stats_size, bin->code_size);
1175
1176 /* Add end-of-code markers for the UMR disassembler. */
1177 uint32_t *ptr32 = (uint32_t *)dest_ptr + bin->code_size / 4;
1178 for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++)
1179 ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;
1180
1181 variant->ir_string = bin->ir_size ? strdup((const char*)(bin->data + bin->stats_size + bin->code_size)) : NULL;
1182 variant->disasm_string = bin->disasm_size ? strdup((const char*)(bin->data + bin->stats_size + bin->code_size + bin->ir_size)) : NULL;
1183
1184 if (bin->stats_size) {
1185 variant->statistics = calloc(bin->stats_size, 1);
1186 memcpy(variant->statistics, bin->data, bin->stats_size);
1187 }
1188 }
1189 return variant;
1190 }
1191
1192 static char *
1193 radv_dump_nir_shaders(struct nir_shader * const *shaders,
1194 int shader_count)
1195 {
1196 char *data = NULL;
1197 char *ret = NULL;
1198 size_t size = 0;
1199 FILE *f = open_memstream(&data, &size);
1200 if (f) {
1201 for (int i = 0; i < shader_count; ++i)
1202 nir_print_shader(shaders[i], f);
1203 fclose(f);
1204 }
1205
1206 ret = malloc(size + 1);
1207 if (ret) {
1208 memcpy(ret, data, size);
1209 ret[size] = 0;
1210 }
1211 free(data);
1212 return ret;
1213 }
1214
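/* Common compilation path for regular pipelines, the GS copy shader and the
 * trap handler: fills in the per-device compiler options, declares the
 * shader arguments, hands the NIR off to either the LLVM backend or ACO,
 * and wraps the resulting binary in a radv_shader_variant. */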
1215 static struct radv_shader_variant *
1216 shader_variant_compile(struct radv_device *device,
1217 struct radv_shader_module *module,
1218 struct nir_shader * const *shaders,
1219 int shader_count,
1220 gl_shader_stage stage,
1221 struct radv_shader_info *info,
1222 struct radv_nir_compiler_options *options,
1223 bool gs_copy_shader,
1224 bool trap_handler_shader,
1225 bool keep_shader_info,
1226 bool keep_statistic_info,
1227 struct radv_shader_binary **binary_out)
1228 {
1229 enum radeon_family chip_family = device->physical_device->rad_info.family;
1230 struct radv_shader_binary *binary = NULL;
1231
1232 struct radv_shader_debug_data debug_data = {
1233 .device = device,
1234 .module = module,
1235 };
1236
1237 options->family = chip_family;
1238 options->chip_class = device->physical_device->rad_info.chip_class;
1239 options->dump_shader = radv_can_dump_shader(device, module, gs_copy_shader);
1240 options->dump_preoptir = options->dump_shader &&
1241 device->instance->debug_flags & RADV_DEBUG_PREOPTIR;
1242 options->record_ir = keep_shader_info;
1243 options->record_stats = keep_statistic_info;
1244 options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
1245 options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
1246 options->address32_hi = device->physical_device->rad_info.address32_hi;
1247 options->has_ls_vgpr_init_bug = device->physical_device->rad_info.has_ls_vgpr_init_bug;
1248 options->use_ngg_streamout = device->physical_device->use_ngg_streamout;
1249 options->enable_mrt_output_nan_fixup = device->instance->enable_mrt_output_nan_fixup;
1250 options->debug.func = radv_compiler_debug;
1251 options->debug.private_data = &debug_data;
1252
1253 struct radv_shader_args args = {};
1254 args.options = options;
1255 args.shader_info = info;
1256 args.is_gs_copy_shader = gs_copy_shader;
1257 args.is_trap_handler_shader = trap_handler_shader;
1258
1259 radv_declare_shader_args(&args,
1260 gs_copy_shader ? MESA_SHADER_VERTEX
1261 : shaders[shader_count - 1]->info.stage,
1262 shader_count >= 2,
1263 shader_count >= 2 ? shaders[shader_count - 2]->info.stage
1264 : MESA_SHADER_VERTEX);
1265
1266 if (radv_use_llvm_for_stage(device, stage) ||
1267 options->dump_shader || options->record_ir)
1268 ac_init_llvm_once();
1269
1270 if (radv_use_llvm_for_stage(device, stage)) {
1271 llvm_compile_shader(device, shader_count, shaders, &binary, &args);
1272 } else {
1273 aco_compile_shader(shader_count, shaders, &binary, &args);
1274 }
1275
1276 binary->info = *info;
1277
1278 struct radv_shader_variant *variant = radv_shader_variant_create(device, binary,
1279 keep_shader_info);
1280 if (!variant) {
1281 free(binary);
1282 return NULL;
1283 }
1284
1285 if (options->dump_shader) {
1286 fprintf(stderr, "%s", radv_get_shader_name(info, shaders[0]->info.stage));
1287 for (int i = 1; i < shader_count; ++i)
1288 fprintf(stderr, " + %s", radv_get_shader_name(info, shaders[i]->info.stage));
1289
1290 fprintf(stderr, "\ndisasm:\n%s\n", variant->disasm_string);
1291 }
1292
1293
1294 if (keep_shader_info) {
1295 variant->nir_string = radv_dump_nir_shaders(shaders, shader_count);
1296 if (!gs_copy_shader && !trap_handler_shader && !module->nir) {
1297 variant->spirv = malloc(module->size);
1298 if (!variant->spirv) {
1299 free(variant);
1300 free(binary);
1301 return NULL;
1302 }
1303
1304 memcpy(variant->spirv, module->data, module->size);
1305 variant->spirv_size = module->size;
1306 }
1307 }
1308
1309 if (binary_out)
1310 *binary_out = binary;
1311 else
1312 free(binary);
1313
1314 return variant;
1315 }
1316
1317 struct radv_shader_variant *
1318 radv_shader_variant_compile(struct radv_device *device,
1319 struct radv_shader_module *module,
1320 struct nir_shader *const *shaders,
1321 int shader_count,
1322 struct radv_pipeline_layout *layout,
1323 const struct radv_shader_variant_key *key,
1324 struct radv_shader_info *info,
1325 bool keep_shader_info, bool keep_statistic_info,
1326 bool disable_optimizations,
1327 struct radv_shader_binary **binary_out)
1328 {
1329 gl_shader_stage stage = shaders[shader_count - 1]->info.stage;
1330 struct radv_nir_compiler_options options = {0};
1331
1332 options.layout = layout;
1333 if (key)
1334 options.key = *key;
1335
1336 options.explicit_scratch_args = !radv_use_llvm_for_stage(device, stage);
1337 options.robust_buffer_access = device->robust_buffer_access;
1338 options.disable_optimizations = disable_optimizations;
1339
1340 return shader_variant_compile(device, module, shaders, shader_count, stage, info,
1341 &options, false, false,
1342 keep_shader_info, keep_statistic_info, binary_out);
1343 }
1344
1345 struct radv_shader_variant *
1346 radv_create_gs_copy_shader(struct radv_device *device,
1347 struct nir_shader *shader,
1348 struct radv_shader_info *info,
1349 struct radv_shader_binary **binary_out,
1350 bool keep_shader_info, bool keep_statistic_info,
1351 bool multiview, bool disable_optimizations)
1352 {
1353 struct radv_nir_compiler_options options = {0};
1354 gl_shader_stage stage = MESA_SHADER_VERTEX;
1355
1356 options.explicit_scratch_args = !radv_use_llvm_for_stage(device, stage);
1357 options.key.has_multiview_view_index = multiview;
1358 options.disable_optimizations = disable_optimizations;
1359
1360 return shader_variant_compile(device, NULL, &shader, 1, stage,
1361 info, &options, true, false,
1362 keep_shader_info, keep_statistic_info, binary_out);
1363 }
1364
1365 struct radv_shader_variant *
1366 radv_create_trap_handler_shader(struct radv_device *device)
1367 {
1368 struct radv_nir_compiler_options options = {0};
1369 struct radv_shader_variant *shader = NULL;
1370 struct radv_shader_binary *binary = NULL;
1371 struct radv_shader_info info = {0};
1372
1373 nir_builder b;
1374 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
1375 b.shader->info.name = ralloc_strdup(b.shader, "meta_trap_handler");
1376
1377 options.explicit_scratch_args = true;
1378 info.wave_size = 64;
1379
1380 shader = shader_variant_compile(device, NULL, &b.shader, 1,
1381 MESA_SHADER_COMPUTE, &info, &options,
1382 false, true, true, false, &binary);
1383
1384 ralloc_free(b.shader);
1385 free(binary);
1386
1387 return shader;
1388 }
1389
1390 void
1391 radv_shader_variant_destroy(struct radv_device *device,
1392 struct radv_shader_variant *variant)
1393 {
1394 if (!p_atomic_dec_zero(&variant->ref_count))
1395 return;
1396
1397 mtx_lock(&device->shader_slab_mutex);
1398 list_del(&variant->slab_list);
1399 mtx_unlock(&device->shader_slab_mutex);
1400
1401 free(variant->spirv);
1402 free(variant->nir_string);
1403 free(variant->disasm_string);
1404 free(variant->ir_string);
1405 free(variant->statistics);
1406 free(variant);
1407 }
1408
1409 const char *
1410 radv_get_shader_name(struct radv_shader_info *info,
1411 gl_shader_stage stage)
1412 {
1413 switch (stage) {
1414 case MESA_SHADER_VERTEX:
1415 if (info->vs.as_ls)
1416 return "Vertex Shader as LS";
1417 else if (info->vs.as_es)
1418 return "Vertex Shader as ES";
1419 else if (info->is_ngg)
1420 return "Vertex Shader as ESGS";
1421 else
1422 return "Vertex Shader as VS";
1423 case MESA_SHADER_TESS_CTRL:
1424 return "Tessellation Control Shader";
1425 case MESA_SHADER_TESS_EVAL:
1426 if (info->tes.as_es)
1427 return "Tessellation Evaluation Shader as ES";
1428 else if (info->is_ngg)
1429 return "Tessellation Evaluation Shader as ESGS";
1430 else
1431 return "Tessellation Evaluation Shader as VS";
1432 case MESA_SHADER_GEOMETRY:
1433 return "Geometry Shader";
1434 case MESA_SHADER_FRAGMENT:
1435 return "Pixel Shader";
1436 case MESA_SHADER_COMPUTE:
1437 return "Compute Shader";
1438 default:
1439 return "Unknown shader";
1440 };
1441 }
1442
1443 unsigned
1444 radv_get_max_workgroup_size(enum chip_class chip_class,
1445 gl_shader_stage stage,
1446 const unsigned *sizes)
1447 {
1448 switch (stage) {
1449 case MESA_SHADER_TESS_CTRL:
1450 return chip_class >= GFX7 ? 128 : 64;
1451 case MESA_SHADER_GEOMETRY:
1452 return chip_class >= GFX9 ? 128 : 64;
1453 case MESA_SHADER_COMPUTE:
1454 break;
1455 default:
1456 return 0;
1457 }
1458
1459 unsigned max_workgroup_size = sizes[0] * sizes[1] * sizes[2];
1460 return max_workgroup_size;
1461 }
1462
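/* Estimates how many waves of this shader can run on one SIMD. We start
 * from the hardware maximum and clamp it by SGPR, VGPR and LDS usage. As an
 * illustration only (numbers vary per generation): on a GCN SIMD with 256
 * physical VGPRs, a shader using 96 VGPRs would be limited to 256 / 96 = 2
 * waves by register pressure alone. */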
1463 unsigned
1464 radv_get_max_waves(struct radv_device *device,
1465 struct radv_shader_variant *variant,
1466 gl_shader_stage stage)
1467 {
1468 enum chip_class chip_class = device->physical_device->rad_info.chip_class;
1469 unsigned lds_increment = chip_class >= GFX7 ? 512 : 256;
1470 uint8_t wave_size = variant->info.wave_size;
1471 struct ac_shader_config *conf = &variant->config;
1472 unsigned max_simd_waves;
1473 unsigned lds_per_wave = 0;
1474
1475 max_simd_waves = device->physical_device->rad_info.max_wave64_per_simd;
1476
1477 if (stage == MESA_SHADER_FRAGMENT) {
1478 lds_per_wave = conf->lds_size * lds_increment +
1479 align(variant->info.ps.num_interp * 48,
1480 lds_increment);
1481 } else if (stage == MESA_SHADER_COMPUTE) {
1482 unsigned max_workgroup_size =
1483 radv_get_max_workgroup_size(chip_class, stage, variant->info.cs.block_size);
1484 lds_per_wave = (conf->lds_size * lds_increment) /
1485 DIV_ROUND_UP(max_workgroup_size, wave_size);
1486 }
1487
1488 if (conf->num_sgprs) {
1489 unsigned sgprs = align(conf->num_sgprs, chip_class >= GFX8 ? 16 : 8);
1490 max_simd_waves =
1491 MIN2(max_simd_waves,
1492 device->physical_device->rad_info.num_physical_sgprs_per_simd /
1493 sgprs);
1494 }
1495
1496 if (conf->num_vgprs) {
1497 unsigned vgprs = align(conf->num_vgprs, wave_size == 32 ? 8 : 4);
1498 max_simd_waves =
1499 MIN2(max_simd_waves,
1500 device->physical_device->rad_info.num_physical_wave64_vgprs_per_simd / vgprs);
1501 }
1502
1503 unsigned max_lds_per_simd = device->physical_device->rad_info.lds_size_per_workgroup / device->physical_device->rad_info.num_simd_per_compute_unit;
1504 if (lds_per_wave)
1505 max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
1506
1507 return max_simd_waves;
1508 }
1509
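/* Implementation of vkGetShaderInfoAMD (VK_AMD_shader_info): returns either
 * register/LDS/scratch statistics or a textual disassembly for one stage of
 * an existing pipeline. The binary query is not implemented. */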
1510 VkResult
1511 radv_GetShaderInfoAMD(VkDevice _device,
1512 VkPipeline _pipeline,
1513 VkShaderStageFlagBits shaderStage,
1514 VkShaderInfoTypeAMD infoType,
1515 size_t* pInfoSize,
1516 void* pInfo)
1517 {
1518 RADV_FROM_HANDLE(radv_device, device, _device);
1519 RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
1520 gl_shader_stage stage = vk_to_mesa_shader_stage(shaderStage);
1521 struct radv_shader_variant *variant = pipeline->shaders[stage];
1522 VkResult result = VK_SUCCESS;
1523
1524 /* Spec doesn't indicate what to do if the stage is invalid, so just
1525 * return no info for this. */
1526 if (!variant)
1527 return vk_error(device->instance, VK_ERROR_FEATURE_NOT_PRESENT);
1528
1529 switch (infoType) {
1530 case VK_SHADER_INFO_TYPE_STATISTICS_AMD:
1531 if (!pInfo) {
1532 *pInfoSize = sizeof(VkShaderStatisticsInfoAMD);
1533 } else {
1534 unsigned lds_multiplier = device->physical_device->rad_info.chip_class >= GFX7 ? 512 : 256;
1535 struct ac_shader_config *conf = &variant->config;
1536
1537 VkShaderStatisticsInfoAMD statistics = {};
1538 statistics.shaderStageMask = shaderStage;
1539 statistics.numPhysicalVgprs = device->physical_device->rad_info.num_physical_wave64_vgprs_per_simd;
1540 statistics.numPhysicalSgprs = device->physical_device->rad_info.num_physical_sgprs_per_simd;
1541 statistics.numAvailableSgprs = statistics.numPhysicalSgprs;
1542
1543 if (stage == MESA_SHADER_COMPUTE) {
1544 unsigned *local_size = variant->info.cs.block_size;
1545 unsigned workgroup_size = local_size[0] * local_size[1] * local_size[2];
1546
1547 statistics.numAvailableVgprs = statistics.numPhysicalVgprs /
1548 ceil((double)workgroup_size / statistics.numPhysicalVgprs);
1549
1550 statistics.computeWorkGroupSize[0] = local_size[0];
1551 statistics.computeWorkGroupSize[1] = local_size[1];
1552 statistics.computeWorkGroupSize[2] = local_size[2];
1553 } else {
1554 statistics.numAvailableVgprs = statistics.numPhysicalVgprs;
1555 }
1556
1557 statistics.resourceUsage.numUsedVgprs = conf->num_vgprs;
1558 statistics.resourceUsage.numUsedSgprs = conf->num_sgprs;
1559 statistics.resourceUsage.ldsSizePerLocalWorkGroup = 32768;
1560 statistics.resourceUsage.ldsUsageSizeInBytes = conf->lds_size * lds_multiplier;
1561 statistics.resourceUsage.scratchMemUsageInBytes = conf->scratch_bytes_per_wave;
1562
1563 size_t size = *pInfoSize;
1564 *pInfoSize = sizeof(statistics);
1565
1566 memcpy(pInfo, &statistics, MIN2(size, *pInfoSize));
1567
1568 if (size < *pInfoSize)
1569 result = VK_INCOMPLETE;
1570 }
1571
1572 break;
1573 case VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD: {
1574 char *out;
1575 size_t outsize;
1576 FILE *memf = open_memstream(&out, &outsize);
1577
1578 fprintf(memf, "%s:\n", radv_get_shader_name(&variant->info, stage));
1579 fprintf(memf, "%s\n\n", variant->ir_string);
1580 fprintf(memf, "%s\n\n", variant->disasm_string);
1581 radv_dump_shader_stats(device, pipeline, stage, memf);
1582 fclose(memf);
1583
1584 /* Need to include the null terminator. */
1585 size_t length = outsize + 1;
1586
1587 if (!pInfo) {
1588 *pInfoSize = length;
1589 } else {
1590 size_t size = *pInfoSize;
1591 *pInfoSize = length;
1592
1593 memcpy(pInfo, out, MIN2(size, length));
1594
1595 if (size < length)
1596 result = VK_INCOMPLETE;
1597 }
1598
1599 free(out);
1600 break;
1601 }
1602 default:
1603 /* VK_SHADER_INFO_TYPE_BINARY_AMD unimplemented for now. */
1604 result = VK_ERROR_FEATURE_NOT_PRESENT;
1605 break;
1606 }
1607
1608 return result;
1609 }
1610
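/* Prints the statistics for one pipeline stage to 'output' by going through
 * our own VK_KHR_pipeline_executable_properties entry points, so the same
 * numbers are reported here and through the extension. */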
1611 VkResult
1612 radv_dump_shader_stats(struct radv_device *device,
1613 struct radv_pipeline *pipeline,
1614 gl_shader_stage stage, FILE *output)
1615 {
1616 struct radv_shader_variant *shader = pipeline->shaders[stage];
1617 VkPipelineExecutablePropertiesKHR *props = NULL;
1618 uint32_t prop_count = 0;
1619 VkResult result;
1620
1621 VkPipelineInfoKHR pipeline_info = {};
1622 pipeline_info.sType = VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR;
1623 pipeline_info.pipeline = radv_pipeline_to_handle(pipeline);
1624
1625 result = radv_GetPipelineExecutablePropertiesKHR(radv_device_to_handle(device),
1626 &pipeline_info,
1627 &prop_count, NULL);
1628 if (result != VK_SUCCESS)
1629 return result;
1630
1631 props = calloc(prop_count, sizeof(*props));
1632 if (!props)
1633 return VK_ERROR_OUT_OF_HOST_MEMORY;
1634
1635 result = radv_GetPipelineExecutablePropertiesKHR(radv_device_to_handle(device),
1636 &pipeline_info,
1637 &prop_count, props);
1638 if (result != VK_SUCCESS)
1639 goto fail;
1640
1641 for (unsigned i = 0; i < prop_count; i++) {
1642 if (!(props[i].stages & mesa_to_vk_shader_stage(stage)))
1643 continue;
1644
1645 VkPipelineExecutableStatisticKHR *stats = NULL;
1646 uint32_t stat_count = 0;
1647 /* Use the outer 'result' here so failures below propagate through the 'fail' label. */
1648
1649 VkPipelineExecutableInfoKHR exec_info = {};
1650 exec_info.pipeline = radv_pipeline_to_handle(pipeline);
1651 exec_info.executableIndex = i;
1652
1653 result = radv_GetPipelineExecutableStatisticsKHR(radv_device_to_handle(device),
1654 &exec_info,
1655 &stat_count, NULL);
1656 if (result != VK_SUCCESS)
1657 goto fail;
1658
1659 stats = calloc(stat_count, sizeof(*stats));
1660 if (!stats) {
1661 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1662 goto fail;
1663 }
1664
1665 result = radv_GetPipelineExecutableStatisticsKHR(radv_device_to_handle(device),
1666 &exec_info,
1667 &stat_count, stats);
1668 if (result != VK_SUCCESS) {
1669 free(stats);
1670 goto fail;
1671 }
1672
1673 fprintf(output, "\n%s:\n",
1674 radv_get_shader_name(&shader->info, stage));
1675 fprintf(output, "*** SHADER STATS ***\n");
1676
1677 for (unsigned i = 0; i < stat_count; i++) {
1678 fprintf(output, "%s: ", stats[i].name);
1679 switch (stats[i].format) {
1680 case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR:
1681 fprintf(output, "%s", stats[i].value.b32 == VK_TRUE ? "true" : "false");
1682 break;
1683 case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_INT64_KHR:
1684 fprintf(output, "%"PRIi64, stats[i].value.i64);
1685 break;
1686 case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR:
1687 fprintf(output, "%"PRIu64, stats[i].value.u64);
1688 break;
1689 case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_FLOAT64_KHR:
1690 fprintf(output, "%f", stats[i].value.f64);
1691 break;
1692 default:
1693 unreachable("Invalid pipeline statistic format");
1694 }
1695 fprintf(output, "\n");
1696 }
1697
1698 fprintf(output, "********************\n\n\n");
1699
1700 free(stats);
1701 }
1702
1703 fail:
1704 free(props);
1705 return result;
1706 }