X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_shader.c;h=729aabaf2729fb78eb81cc78ae91243df79ca87f;hb=71a67942003a96d90289f7f53f546af821e64a51;hp=0c3e375ee5e0fb1f9aaa1f26319538eca89b28cb;hpb=a69ab1b7d2a7b34188f38c5e130abd264518b8ad;p=mesa.git

diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 0c3e375ee5e..729aabaf272 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -80,6 +80,50 @@ static const struct nir_shader_compiler_options nir_options = {
 	.use_interpolated_input_intrinsics = true,
 };
 
+bool
+radv_can_dump_shader(struct radv_device *device,
+		     struct radv_shader_module *module,
+		     bool is_gs_copy_shader)
+{
+	if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS))
+		return false;
+
+	/* Only dump non-meta shaders, useful for debugging purposes. */
+	return (module && !module->nir) || is_gs_copy_shader;
+}
+
+bool
+radv_can_dump_shader_stats(struct radv_device *device,
+			   struct radv_shader_module *module)
+{
+	/* Only dump non-meta shader stats. */
+	return device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS &&
+	       module && !module->nir;
+}
+
+unsigned shader_io_get_unique_index(gl_varying_slot slot)
+{
+	/* handle patch indices separate */
+	if (slot == VARYING_SLOT_TESS_LEVEL_OUTER)
+		return 0;
+	if (slot == VARYING_SLOT_TESS_LEVEL_INNER)
+		return 1;
+	if (slot >= VARYING_SLOT_PATCH0 && slot <= VARYING_SLOT_TESS_MAX)
+		return 2 + (slot - VARYING_SLOT_PATCH0);
+	if (slot == VARYING_SLOT_POS)
+		return 0;
+	if (slot == VARYING_SLOT_PSIZ)
+		return 1;
+	if (slot == VARYING_SLOT_CLIP_DIST0)
+		return 2;
+	if (slot == VARYING_SLOT_CLIP_DIST1)
+		return 3;
+	/* 3 is reserved for clip dist as well */
+	if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31)
+		return 4 + (slot - VARYING_SLOT_VAR0);
+	unreachable("illegal slot in get unique index\n");
+}
+
 VkResult radv_CreateShaderModule(
 	VkDevice                                    _device,
 	const VkShaderModuleCreateInfo*             pCreateInfo,
@@ -203,7 +247,7 @@ radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively,
 		NIR_PASS(progress, shader, nir_opt_conditional_discard);
 		NIR_PASS(progress, shader, nir_opt_shrink_load);
-		NIR_PASS(progress, shader, nir_opt_move_load_ubo);
+		NIR_PASS(progress, shader, nir_opt_move, nir_move_load_ubo);
 	}
 
 nir_shader *
@@ -253,7 +297,7 @@ radv_shader_compile_to_nir(struct radv_device *device,
 		.lower_ubo_ssbo_access_to_offsets = true,
 		.caps = {
 			.amd_gcn_shader = true,
-			.amd_shader_ballot = device->instance->perftest_flags & RADV_PERFTEST_SHADER_BALLOT,
+			.amd_shader_ballot = device->physical_device->use_shader_ballot,
 			.amd_trinary_minmax = true,
 			.derivative_group = true,
 			.descriptor_array_dynamic_indexing = true,
@@ -398,6 +442,13 @@ radv_shader_compile_to_nir(struct radv_device *device,
 	 */
 	nir_lower_var_copies(nir);
 
+	/* Lower large variables that are always constant with load_constant
+	 * intrinsics, which get turned into PC-relative loads from a data
+	 * section next to the shader.
+	 */
+	NIR_PASS_V(nir, nir_opt_large_constants,
+		   glsl_get_natural_size_align_bytes, 16);
+
 	/* Indirect lowering must be called after the radv_optimize_nir() loop
 	 * has been called at least once. Otherwise indirect lowering can
 	 * bloat the instruction count of the loop and cause it to be
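Note on the nir_opt_large_constants call added above: the pass rewrites large shader variables whose contents are provably constant into load_constant intrinsics, and the backend then places the data in a read-only block next to the machine code so the loads become PC-relative. The third argument is the size threshold in bytes below which a variable is left inline. A minimal sketch of the transformation's effect, written as plain C rather than NIR (names hypothetical, illustration only):

    /* Before: the table is a mutable local, so every invocation would
     * have to materialize it with moves/stores before indexing it. */
    float lookup_before(unsigned i)
    {
        float tbl[8] = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 3.5f };
        return tbl[i & 7u];
    }

    /* After: the pass proves the contents never change and promotes the
     * table to read-only data placed next to the code, so each access is
     * a single PC-relative load instead of a build-then-index sequence. */
    static const float tbl_rodata[8] =
        { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 3.5f };
    float lookup_after(unsigned i)
    {
        return tbl_rodata[i & 7u];
    }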
@@ -409,53 +460,6 @@ radv_shader_compile_to_nir(struct radv_device *device,
 	return nir;
 }
 
-static void mark_16bit_fs_input(struct radv_shader_variant_info *shader_info,
-				const struct glsl_type *type,
-				int location)
-{
-	if (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)) {
-		unsigned attrib_count = glsl_count_attribute_slots(type, false);
-		if (glsl_type_is_16bit(type)) {
-			shader_info->fs.float16_shaded_mask |= ((1ull << attrib_count) - 1) << location;
-		}
-	} else if (glsl_type_is_array(type)) {
-		unsigned stride = glsl_count_attribute_slots(glsl_get_array_element(type), false);
-		for (unsigned i = 0; i < glsl_get_length(type); ++i) {
-			mark_16bit_fs_input(shader_info, glsl_get_array_element(type), location + i * stride);
-		}
-	} else {
-		assert(glsl_type_is_struct_or_ifc(type));
-		for (unsigned i = 0; i < glsl_get_length(type); i++) {
-			mark_16bit_fs_input(shader_info, glsl_get_struct_field(type, i), location);
-			location += glsl_count_attribute_slots(glsl_get_struct_field(type, i), false);
-		}
-	}
-}
-
-static void
-handle_fs_input_decl(struct radv_shader_variant_info *shader_info,
-		     struct nir_variable *variable)
-{
-	unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
-
-	if (variable->data.compact) {
-		unsigned component_count = variable->data.location_frac +
-					   glsl_get_length(variable->type);
-		attrib_count = (component_count + 3) / 4;
-	} else {
-		mark_16bit_fs_input(shader_info, variable->type,
-				    variable->data.driver_location);
-	}
-
-	uint64_t mask = ((1ull << attrib_count) - 1);
-
-	if (variable->data.interpolation == INTERP_MODE_FLAT)
-		shader_info->fs.flat_shaded_mask |= mask << variable->data.driver_location;
-
-	if (variable->data.location >= VARYING_SLOT_VAR0)
-		shader_info->fs.input_mask |= mask << (variable->data.location - VARYING_SLOT_VAR0);
-}
-
 static int
 type_size_vec4(const struct glsl_type *type, bool bindless)
 {
@@ -523,28 +527,13 @@ lower_view_index(nir_shader *nir)
 	return progress;
 }
 
-/* Gather information needed to setup the vs<->ps linking registers in
- * radv_pipeline_generate_ps_inputs().
- */
-
-static void
-handle_fs_inputs(nir_shader *nir, struct radv_shader_variant_info *shader_info)
-{
-	shader_info->fs.num_interp = nir->num_inputs;
-
-	nir_foreach_variable(variable, &nir->inputs)
-		handle_fs_input_decl(shader_info, variable);
-}
-
-static void
-lower_fs_io(nir_shader *nir, struct radv_shader_variant_info *shader_info)
+void
+radv_lower_fs_io(nir_shader *nir)
 {
 	NIR_PASS_V(nir, lower_view_index);
 	nir_assign_io_var_locations(&nir->inputs, &nir->num_inputs,
 				    MESA_SHADER_FRAGMENT);
 
-	handle_fs_inputs(nir, shader_info);
-
 	NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0);
 
 	/* This pass needs actual constants */
@@ -587,7 +576,7 @@ radv_alloc_shader_memory(struct radv_device *device,
 	slab->bo = device->ws->buffer_create(device->ws, slab->size, 256,
 					     RADEON_DOMAIN_VRAM,
 					     RADEON_FLAG_NO_INTERPROCESS_SHARING |
-					     (device->physical_device->cpdma_prefetch_writes_memory ?
+					     (device->physical_device->rad_info.cpdma_prefetch_writes_memory ?
 					      0 : RADEON_FLAG_READ_ONLY),
 					     RADV_BO_PRIORITY_SHADER);
 	slab->ptr = (char*)device->ws->buffer_map(slab->bo);
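Note on the slab-allocation hunk above: shader code is immutable once uploaded, so the slab BO is mapped read-only when possible; the ternary backs off only on chips whose CP DMA prefetch engine performs writes to the memory it prefetches, where a read-only mapping would fault. Restated as a standalone sketch (flag bit values are placeholders, illustration only):

    #include <stdbool.h>
    #include <stdint.h>

    #define RADEON_FLAG_NO_INTERPROCESS_SHARING (1u << 0)  /* placeholder bit */
    #define RADEON_FLAG_READ_ONLY               (1u << 1)  /* placeholder bit */

    /* Prefer a read-only shader BO; keep it writable only where the CP DMA
     * prefetch is known to write through it. */
    static uint32_t shader_bo_flags(bool cpdma_prefetch_writes_memory)
    {
        uint32_t flags = RADEON_FLAG_NO_INTERPROCESS_SHARING;
        if (!cpdma_prefetch_writes_memory)
            flags |= RADEON_FLAG_READ_ONLY;
        return flags;
    }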
@@ -699,7 +688,8 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
 			S_00B12C_SO_BASE3_EN(!!info->info.so.strides[3]) |
 			S_00B12C_SO_EN(!!info->info.so.num_outputs);
 
-	config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / 4) |
+	config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) /
+					   (info->info.wave_size == 32 ? 8 : 4)) |
 			    S_00B848_DX10_CLAMP(1) |
 			    S_00B848_FLOAT_MODE(config_out->float_mode);
 
@@ -764,10 +754,12 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
 		 * If PrimID is disabled. InstanceID / StepRate1 is loaded instead.
 		 * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded.
 		 */
-		if (info->vs.export_prim_id) {
+		if (info->info.vs.needs_instance_id && pdevice->rad_info.chip_class >= GFX10) {
+			vgpr_comp_cnt = 3;
+		} else if (info->vs.export_prim_id) {
 			vgpr_comp_cnt = 2;
 		} else if (info->info.vs.needs_instance_id) {
-			vgpr_comp_cnt = pdevice->rad_info.chip_class >= GFX10 ? 3 : 1;
+			vgpr_comp_cnt = 1;
 		} else {
 			vgpr_comp_cnt = 0;
 		}
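Two wave32-related details above are worth spelling out. First, the RSRC1 VGPR field counts allocation granules rather than registers: wave64 allocates VGPRs in groups of 4, wave32 in groups of 8, hence the divisor switch. Second, vgpr_comp_cnt selects how many extra input VGPRs the hardware preloads after VertexID, and on GFX10 InstanceID lands in VGPR3. A small self-checking sketch of the granule math (a sketch of the encoding as this diff uses it, not RADV's own code):

    #include <assert.h>

    /* RSRC1.VGPRS holds the number of allocation granules minus one,
     * computed as (num_vgprs - 1) / granule so partial granules round up.
     * The granule is 4 registers in wave64 and 8 in wave32. */
    static unsigned rsrc1_vgprs_field(unsigned num_vgprs, unsigned wave_size)
    {
        unsigned granule = (wave_size == 32) ? 8 : 4;
        return (num_vgprs - 1) / granule;
    }

    int main(void)
    {
        assert(rsrc1_vgprs_field(24, 64) == 5); /* 6 granules of 4 = 24 VGPRs */
        assert(rsrc1_vgprs_field(24, 32) == 2); /* 3 granules of 8 = 24 VGPRs */
        return 0;
    }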
@@ -874,48 +866,10 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
 	}
 }
 
-static void radv_init_llvm_target()
-{
-	LLVMInitializeAMDGPUTargetInfo();
-	LLVMInitializeAMDGPUTarget();
-	LLVMInitializeAMDGPUTargetMC();
-	LLVMInitializeAMDGPUAsmPrinter();
-
-	/* For inline assembly. */
-	LLVMInitializeAMDGPUAsmParser();
-
-	/* Workaround for bug in llvm 4.0 that causes image intrinsics
-	 * to disappear.
-	 * https://reviews.llvm.org/D26348
-	 *
-	 * Workaround for bug in llvm that causes the GPU to hang in presence
-	 * of nested loops because there is an exec mask issue. The proper
-	 * solution is to fix LLVM but this might require a bunch of work.
-	 * https://bugs.llvm.org/show_bug.cgi?id=37744
-	 *
-	 * "mesa" is the prefix for error messages.
-	 */
-	if (HAVE_LLVM >= 0x0800) {
-		const char *argv[2] = { "mesa", "-simplifycfg-sink-common=false" };
-		LLVMParseCommandLineOptions(2, argv, NULL);
-
-	} else {
-		const char *argv[3] = { "mesa", "-simplifycfg-sink-common=false",
-					"-amdgpu-skip-threshold=1" };
-		LLVMParseCommandLineOptions(3, argv, NULL);
-	}
-}
-
-static once_flag radv_init_llvm_target_once_flag = ONCE_FLAG_INIT;
-
-static void radv_init_llvm_once(void)
-{
-	call_once(&radv_init_llvm_target_once_flag, radv_init_llvm_target);
-}
-
 struct radv_shader_variant *
 radv_shader_variant_create(struct radv_device *device,
-			   const struct radv_shader_binary *binary)
+			   const struct radv_shader_binary *binary,
+			   bool keep_shader_info)
 {
 	struct ac_shader_config config = {0};
 	struct ac_rtld_binary rtld_binary = {0};
@@ -965,10 +919,11 @@ radv_shader_variant_create(struct radv_device *device,
 		if (binary->variant_info.is_ngg)
 			sym->size -= 32;
 	}
+
 	struct ac_rtld_open_info open_info = {
 		.info = &device->physical_device->rad_info,
 		.shader_type = binary->stage,
-		.wave_size = 64,
+		.wave_size = binary->variant_info.info.wave_size,
 		.num_parts = 1,
 		.elf_ptrs = &elf_data,
 		.elf_sizes = &elf_size,
@@ -993,10 +948,12 @@
 		}
 
 		variant->code_size = rtld_binary.rx_size;
+		variant->exec_size = rtld_binary.exec_size;
 	} else {
 		assert(binary->type == RADV_BINARY_TYPE_LEGACY);
 		config = ((struct radv_shader_binary_legacy *)binary)->config;
-		variant->code_size = radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size);
+		variant->code_size = radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size);
+		variant->exec_size = variant->code_size;
 	}
 
 	variant->info = binary->variant_info;
@@ -1019,7 +976,7 @@
 		return NULL;
 	}
 
-	if (device->keep_shader_info ||
+	if (keep_shader_info ||
 	    (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) {
 		const char *disasm_data;
 		size_t disasm_size;
@@ -1051,6 +1008,29 @@ radv_shader_variant_create(struct radv_device *device,
 	return variant;
 }
 
+static char *
+radv_dump_nir_shaders(struct nir_shader * const *shaders,
+		      int shader_count)
+{
+	char *data = NULL;
+	char *ret = NULL;
+	size_t size = 0;
+	FILE *f = open_memstream(&data, &size);
+	if (f) {
+		for (int i = 0; i < shader_count; ++i)
+			nir_print_shader(shaders[i], f);
+		fclose(f);
+	}
+
+	ret = malloc(size + 1);
+	if (ret) {
+		memcpy(ret, data, size);
+		ret[size] = 0;
+	}
+	free(data);
+	return ret;
+}
+
 static struct radv_shader_variant *
 shader_variant_compile(struct radv_device *device,
 		       struct radv_shader_module *module,
@@ -1059,6 +1039,7 @@ shader_variant_compile(struct radv_device *device,
 		       gl_shader_stage stage,
 		       struct radv_nir_compiler_options *options,
 		       bool gs_copy_shader,
+		       bool keep_shader_info,
 		       struct radv_shader_binary **binary_out)
 {
 	enum radeon_family chip_family = device->physical_device->rad_info.family;
@@ -1068,18 +1049,26 @@ shader_variant_compile(struct radv_device *device,
 	struct radv_shader_variant_info variant_info = {0};
 	bool thread_compiler;
 
-	if (shaders[0]->info.stage == MESA_SHADER_FRAGMENT)
-		lower_fs_io(shaders[0], &variant_info);
-
 	options->family = chip_family;
 	options->chip_class = device->physical_device->rad_info.chip_class;
 	options->dump_shader = radv_can_dump_shader(device, module, gs_copy_shader);
 	options->dump_preoptir = options->dump_shader &&
 				 device->instance->debug_flags & RADV_DEBUG_PREOPTIR;
-	options->record_llvm_ir = device->keep_shader_info;
+	options->record_llvm_ir = keep_shader_info;
 	options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
 	options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
 	options->address32_hi = device->physical_device->rad_info.address32_hi;
+	options->has_ls_vgpr_init_bug = device->physical_device->rad_info.has_ls_vgpr_init_bug;
+
+	if ((stage == MESA_SHADER_GEOMETRY && !options->key.vs_common_out.as_ngg) ||
+	    gs_copy_shader)
+		options->wave_size = 64;
+	else if (stage == MESA_SHADER_COMPUTE)
+		options->wave_size = device->physical_device->cs_wave_size;
+	else if (stage == MESA_SHADER_FRAGMENT)
+		options->wave_size = device->physical_device->ps_wave_size;
+	else
+		options->wave_size = device->physical_device->ge_wave_size;
 
 	if (options->supports_spill)
 		tm_options |= AC_TM_SUPPORTS_SPILL;
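The wave-size selection just added is the policy heart of the wave32 enabling: legacy (non-NGG) geometry shaders and the GS copy shader stay in wave64, likely because the ES/GS ring layouts assume it, while compute, fragment, and the remaining geometry-pipeline stages take per-device defaults. A condensed sketch of the same policy, standalone and with a simplified device struct (illustration only, not RADV's types):

    enum stage { STAGE_GS, STAGE_CS, STAGE_FS, STAGE_OTHER_GEOM };

    struct dev_cfg { unsigned cs_wave_size, ps_wave_size, ge_wave_size; };

    static unsigned pick_wave_size(const struct dev_cfg *dev, enum stage s,
                                   bool is_ngg, bool is_gs_copy)
    {
        /* Legacy GS and the GS copy shader are pinned to wave64. */
        if ((s == STAGE_GS && !is_ngg) || is_gs_copy)
            return 64;
        if (s == STAGE_CS)
            return dev->cs_wave_size;
        if (s == STAGE_FS)
            return dev->ps_wave_size;
        return dev->ge_wave_size; /* VS/TCS/TES and NGG GS */
    }

On GFX10 the three per-device values are presumably toggled by perftest flags (something like RADV_PERFTEST=cswave32); treat that spelling as an assumption from surrounding RADV code of the period, since this hunk does not show where the values come from.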
@@ -1091,10 +1080,11 @@
 		tm_options |= AC_TM_NO_LOAD_STORE_OPT;
 
 	thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM);
-	radv_init_llvm_once();
+	ac_init_llvm_once();
 	radv_init_llvm_compiler(&ac_llvm, thread_compiler,
-				chip_family, tm_options);
+				chip_family, tm_options,
+				options->wave_size);
 	if (gs_copy_shader) {
 		assert(shader_count == 1);
 		radv_compile_gs_copy_shader(&ac_llvm, *shaders, &binary,
@@ -1107,7 +1097,8 @@
 
 	radv_destroy_llvm_compiler(&ac_llvm, thread_compiler);
 
-	struct radv_shader_variant *variant = radv_shader_variant_create(device, binary);
+	struct radv_shader_variant *variant = radv_shader_variant_create(device, binary,
+									  keep_shader_info);
 	if (!variant) {
 		free(binary);
 		return NULL;
@@ -1118,9 +1109,9 @@
 	}
 
-	if (device->keep_shader_info) {
+	if (keep_shader_info) {
+		variant->nir_string = radv_dump_nir_shaders(shaders, shader_count);
 		if (!gs_copy_shader && !module->nir) {
-			variant->nir = *shaders;
 			variant->spirv = (uint32_t *)module->data;
 			variant->spirv_size = module->size;
 		}
@@ -1141,6 +1132,7 @@ radv_shader_variant_compile(struct radv_device *device,
 		       int shader_count,
 		       struct radv_pipeline_layout *layout,
 		       const struct radv_shader_variant_key *key,
+		       bool keep_shader_info,
 		       struct radv_shader_binary **binary_out)
 {
 	struct radv_nir_compiler_options options = {0};
@@ -1151,15 +1143,17 @@ radv_shader_variant_compile(struct radv_device *device,
 	options.unsafe_math = !!(device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH);
 	options.supports_spill = true;
+	options.robust_buffer_access = device->robust_buffer_access;
 
 	return shader_variant_compile(device, module, shaders, shader_count,
 				      shaders[shader_count - 1]->info.stage,
-				      &options, false, binary_out);
+				      &options, false, keep_shader_info, binary_out);
 }
 
 struct radv_shader_variant *
 radv_create_gs_copy_shader(struct radv_device *device,
 			   struct nir_shader *shader,
 			   struct radv_shader_binary **binary_out,
+			   bool keep_shader_info,
 			   bool multiview)
 {
 	struct radv_nir_compiler_options options = {0};
@@ -1167,7 +1161,7 @@ radv_create_gs_copy_shader(struct radv_device *device,
 
 	options.key.has_multiview_view_index = multiview;
 
 	return shader_variant_compile(device, NULL, &shader, 1, MESA_SHADER_VERTEX,
-				      &options, true, binary_out);
+				      &options, true, keep_shader_info, binary_out);
 }
 
 void
@@ -1181,7 +1175,7 @@ radv_shader_variant_destroy(struct radv_device *device,
 	list_del(&variant->slab_list);
 	mtx_unlock(&device->shader_slab_mutex);
 
-	ralloc_free(variant->nir);
+	free(variant->nir_string);
 	free(variant->disasm_string);
 	free(variant->llvm_ir_string);
 	free(variant);
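Note on the nir_string change visible above: instead of keeping a live nir_shader around (previously freed with ralloc_free), the variant now stores a pre-printed string produced by the radv_dump_nir_shaders helper via open_memstream, so plain free() suffices. The same capture pattern in isolation (POSIX.1-2008 open_memstream; the wrapper name is hypothetical):

    #define _POSIX_C_SOURCE 200809L
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Capture anything printed to a FILE* into a malloc'ed, NUL-terminated
     * string, as radv_dump_nir_shaders does around nir_print_shader(). */
    static char *capture_output(void (*print)(FILE *))
    {
        char *data = NULL;
        size_t size = 0;
        FILE *f = open_memstream(&data, &size);
        if (!f)
            return NULL;
        print(f);
        fclose(f); /* flushes and finalizes data/size */

        char *ret = malloc(size + 1);
        if (ret) {
            memcpy(ret, data, size);
            ret[size] = 0;
        }
        free(data);
        return ret;
    }

open_memstream already NUL-terminates its buffer; the extra copy mirrors the helper above, which right-sizes the allocation before releasing the stream's buffer.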
@@ -1221,31 +1215,49 @@ radv_get_shader_name(struct radv_shader_variant_info *info,
 	};
 }
 
-static void
-generate_shader_stats(struct radv_device *device,
-		      struct radv_shader_variant *variant,
-		      gl_shader_stage stage,
-		      struct _mesa_string_buffer *buf)
+unsigned
+radv_get_max_workgroup_size(enum chip_class chip_class,
+			    gl_shader_stage stage,
+			    const unsigned *sizes)
+{
+	switch (stage) {
+	case MESA_SHADER_TESS_CTRL:
+		return chip_class >= GFX7 ? 128 : 64;
+	case MESA_SHADER_GEOMETRY:
+		return chip_class >= GFX9 ? 128 : 64;
+	case MESA_SHADER_COMPUTE:
+		break;
+	default:
+		return 0;
+	}
+
+	unsigned max_workgroup_size = sizes[0] * sizes[1] * sizes[2];
+	return max_workgroup_size;
+}
+
+unsigned
+radv_get_max_waves(struct radv_device *device,
+		   struct radv_shader_variant *variant,
+		   gl_shader_stage stage)
 {
 	enum chip_class chip_class = device->physical_device->rad_info.chip_class;
 	unsigned lds_increment = chip_class >= GFX7 ? 512 : 256;
-	struct ac_shader_config *conf;
+	uint8_t wave_size = variant->info.info.wave_size;
+	struct ac_shader_config *conf = &variant->config;
 	unsigned max_simd_waves;
 	unsigned lds_per_wave = 0;
 
 	max_simd_waves = ac_get_max_simd_waves(device->physical_device->rad_info.family);
 
-	conf = &variant->config;
-
 	if (stage == MESA_SHADER_FRAGMENT) {
 		lds_per_wave = conf->lds_size * lds_increment +
-			       align(variant->info.fs.num_interp * 48,
+			       align(variant->info.info.ps.num_interp * 48,
 				     lds_increment);
 	} else if (stage == MESA_SHADER_COMPUTE) {
 		unsigned max_workgroup_size =
-			radv_nir_get_max_workgroup_size(chip_class, stage, variant->nir);
+			radv_get_max_workgroup_size(chip_class, stage, variant->info.cs.block_size);
 		lds_per_wave = (conf->lds_size * lds_increment) /
-			       DIV_ROUND_UP(max_workgroup_size, 64);
+			       DIV_ROUND_UP(max_workgroup_size, wave_size);
 	}
 
 	if (conf->num_sgprs)
@@ -1264,6 +1276,18 @@ generate_shader_stats(struct radv_device *device,
 	if (lds_per_wave)
 		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
 
+	return max_simd_waves;
+}
+
+static void
+generate_shader_stats(struct radv_device *device,
+		      struct radv_shader_variant *variant,
+		      gl_shader_stage stage,
+		      struct _mesa_string_buffer *buf)
+{
+	struct ac_shader_config *conf = &variant->config;
+	unsigned max_simd_waves = radv_get_max_waves(device, variant, stage);
+
 	if (stage == MESA_SHADER_FRAGMENT) {
 		_mesa_string_buffer_printf(buf, "*** SHADER CONFIG ***\n"
 					   "SPI_PS_INPUT_ADDR = 0x%04x\n"
@@ -1284,7 +1308,7 @@ generate_shader_stats(struct radv_device *device,
 				   "********************\n\n\n",
 				   conf->num_sgprs, conf->num_vgprs,
 				   conf->spilled_sgprs, conf->spilled_vgprs,
-				   variant->info.private_mem_vgprs, variant->code_size,
+				   variant->info.private_mem_vgprs, variant->exec_size,
 				   conf->lds_size, conf->scratch_bytes_per_wave,
 				   max_simd_waves);
 	}
@@ -1340,7 +1364,7 @@ radv_GetShaderInfoAMD(VkDevice _device,
 		statistics.numAvailableSgprs = statistics.numPhysicalSgprs;
 
 		if (stage == MESA_SHADER_COMPUTE) {
-			unsigned *local_size = variant->nir->info.cs.local_size;
+			unsigned *local_size = variant->info.cs.block_size;
 			unsigned workgroup_size = local_size[0] * local_size[1] * local_size[2];
 
 			statistics.numAvailableVgprs = statistics.numPhysicalVgprs /
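The radv_get_max_waves hunk above (the diff is truncated mid-way through the final radv_GetShaderInfoAMD hunk) computes occupancy limits from the variant's config instead of from a retained NIR shader, and the wave32-relevant change is that a compute shader's LDS is now charged across DIV_ROUND_UP(workgroup, wave_size) waves rather than a hard-coded 64. A worked example as a small self-checking C program, with the constants taken from the hunk (512-byte LDS granularity on GFX7+, 16384-byte budget in the final clamp; illustration only):

    #include <assert.h>

    #define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
    #define MIN2(a, b) ((a) < (b) ? (a) : (b))

    /* Max waves per SIMD as limited by LDS, following radv_get_max_waves:
     * the workgroup's LDS allocation is shared by all of its waves, so it
     * is divided by the number of waves in the group. */
    static unsigned max_waves_lds_limit(unsigned lds_size_granules,
                                        unsigned workgroup_size,
                                        unsigned wave_size,
                                        unsigned max_simd_waves)
    {
        unsigned lds_increment = 512; /* GFX7+ granularity, per the hunk */
        unsigned waves_per_group = DIV_ROUND_UP(workgroup_size, wave_size);
        unsigned lds_per_wave = (lds_size_granules * lds_increment) / waves_per_group;
        if (lds_per_wave)
            max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
        return max_simd_waves;
    }

    int main(void)
    {
        /* 8 KiB of LDS (16 granules), 256-thread workgroup, cap of 10. */
        assert(max_waves_lds_limit(16, 256, 64, 10) == 8);  /* 4 waves, 2048 B each */
        assert(max_waves_lds_limit(16, 256, 32, 10) == 10); /* 8 waves, 1024 B each */
        return 0;
    }

Halving the wave size doubles the wave count per workgroup, so the per-wave LDS charge halves and the LDS-imposed occupancy ceiling rises accordingly.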