radv: Track scratch usage across pipelines & command buffers.

[mesa.git] / src / amd / vulkan / radv_pipeline.c
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c

index 75785ec921d040ff59869fada72ed879f53b4a37..e332877e2ba6b11633a11e44db32281533a8f875 100644 (file)
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -104,6 +104,19 @@ void radv_DestroyShaderModule(
         vk_free2(&device->alloc, pAllocator, module);
  }
  
+
+/**
+ * Tear down a pipeline object.
+ *
+ * Every shader variant slot that is populated is handed to
+ * radv_shader_variant_destroy(); afterwards the pipeline structure itself
+ * is released through vk_free2() using the device allocator together with
+ * the caller-supplied allocation callbacks.
+ *
+ * \param device     device the pipeline was created on
+ * \param pipeline   pipeline to destroy
+ * \param allocator  allocation callbacks passed through to vk_free2()
+ */
+static void
+radv_pipeline_destroy(struct radv_device *device,
+                      struct radv_pipeline *pipeline,
+                      const VkAllocationCallbacks* allocator)
+{
+       unsigned stage;
+
+       for (stage = 0; stage < MESA_SHADER_STAGES; stage++) {
+               /* Stages the pipeline does not use have no variant. */
+               if (!pipeline->shaders[stage])
+                       continue;
+
+               radv_shader_variant_destroy(device, pipeline->shaders[stage]);
+       }
+
+       vk_free2(&device->alloc, allocator, pipeline);
+}
+
  void radv_DestroyPipeline(
         VkDevice                                    _device,
         VkPipeline                                  _pipeline,
@@ -115,11 +128,7 @@ void radv_DestroyPipeline(
         if (!_pipeline)
                 return;
  
-       for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
-               if (pipeline->shaders[i])
-                       radv_shader_variant_destroy(device, pipeline->shaders[i]);
-
-       vk_free2(&device->alloc, pAllocator, pipeline);
+       radv_pipeline_destroy(device, pipeline, pAllocator);
  }
  
  
@@ -188,7 +197,10 @@ radv_shader_compile_to_nir(struct radv_device *device,
                                 assert(data + entry.size <= spec_info->pData + spec_info->dataSize);
  
                                 spec_entries[i].id = spec_info->pMapEntries[i].constantID;
-                               spec_entries[i].data = *(const uint32_t *)data;
+                               if (spec_info->dataSize == 8)
+                                       spec_entries[i].data64 = *(const uint64_t *)data;
+                               else
+                                       spec_entries[i].data32 = *(const uint32_t *)data;
                         }
                 }
                 const struct nir_spirv_supported_extensions supported_ext = {
@@ -202,11 +214,13 @@ radv_shader_compile_to_nir(struct radv_device *device,
  
                 free(spec_entries);
  
-               nir_lower_returns(nir);
-               nir_validate_shader(nir);
-
-               nir_inline_functions(nir);
-               nir_validate_shader(nir);
+               /* We have to lower away local constant initializers right before we
+                * inline functions.  That way they get properly initialized at the top
+                * of the function and not at the top of its caller.
+                */
+               NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_local);
+               NIR_PASS_V(nir, nir_lower_returns);
+               NIR_PASS_V(nir, nir_inline_functions);
  
                 /* Pick off the single entrypoint that we want */
                 foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
@@ -216,13 +230,14 @@ radv_shader_compile_to_nir(struct radv_device *device,
                 assert(exec_list_length(&nir->functions) == 1);
                 entry_point->name = ralloc_strdup(entry_point, "main");
  
-               nir_remove_dead_variables(nir, nir_var_shader_in);
-               nir_remove_dead_variables(nir, nir_var_shader_out);
-               nir_remove_dead_variables(nir, nir_var_system_value);
-               nir_validate_shader(nir);
+               NIR_PASS_V(nir, nir_remove_dead_variables,
+                          nir_var_shader_in | nir_var_shader_out | nir_var_system_value);
  
-               nir_lower_system_values(nir);
-               nir_validate_shader(nir);
+               /* Now that we've deleted all but the main function, we can go ahead and
+                * lower the rest of the constant initializers.
+                */
+               NIR_PASS_V(nir, nir_lower_constant_initializers, ~0);
+               NIR_PASS_V(nir, nir_lower_system_values);
         }
  
         /* Vulkan uses the separate-shader linking model */
@@ -272,7 +287,7 @@ static const char *radv_get_shader_name(struct radv_shader_variant *var,
  }
  static void radv_dump_pipeline_stats(struct radv_device *device, struct radv_pipeline *pipeline)
  {
-       unsigned lds_increment = device->instance->physicalDevice.rad_info.chip_class >= CIK ? 512 : 256;
+       unsigned lds_increment = device->physical_device->rad_info.chip_class >= CIK ? 512 : 256;
         struct radv_shader_variant *var;
         struct ac_shader_config *conf;
         int i;
@@ -293,7 +308,7 @@ static void radv_dump_pipeline_stats(struct radv_device *device, struct radv_pip
                 }
  
                 if (conf->num_sgprs) {
-                       if (device->instance->physicalDevice.rad_info.chip_class >= VI)
+                       if (device->physical_device->rad_info.chip_class >= VI)
                                 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
                         else
                                 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
@@ -403,7 +418,7 @@ static struct radv_shader_variant *radv_shader_variant_create(struct radv_device
                                                               bool dump)
  {
         struct radv_shader_variant *variant = calloc(1, sizeof(struct radv_shader_variant));
-       enum radeon_family chip_family = device->instance->physicalDevice.rad_info.family;
+       enum radeon_family chip_family = device->physical_device->rad_info.family;
         LLVMTargetMachineRef tm;
         if (!variant)
                 return NULL;
@@ -415,10 +430,11 @@ static struct radv_shader_variant *radv_shader_variant_create(struct radv_device
  
         struct ac_shader_binary binary;
  
-       options.unsafe_math = env_var_as_boolean("RADV_UNSAFE_MATH", false);
+       options.unsafe_math = !!(device->debug_flags & RADV_DEBUG_UNSAFE_MATH);
         options.family = chip_family;
-       options.chip_class = device->instance->physicalDevice.rad_info.chip_class;
-       tm = ac_create_target_machine(chip_family);
+       options.chip_class = device->physical_device->rad_info.chip_class;
+       options.supports_spill = false;
+       tm = ac_create_target_machine(chip_family, false);
         ac_compile_nir_shader(tm, &binary, &variant->config,
                               &variant->info, shader, &options, dump);
         LLVMDisposeTargetMachine(tm);
@@ -448,14 +464,14 @@ radv_pipeline_compile(struct radv_pipeline *pipeline,
                       gl_shader_stage stage,
                       const VkSpecializationInfo *spec_info,
                       struct radv_pipeline_layout *layout,
-                     const union ac_shader_variant_key *key,
-                     bool dump)
+                     const union ac_shader_variant_key *key)
  {
         unsigned char sha1[20];
         struct radv_shader_variant *variant;
         nir_shader *nir;
         void *code = NULL;
         unsigned code_size = 0;
+       bool dump = (pipeline->device->debug_flags & RADV_DEBUG_DUMP_SHADERS);
  
         if (module->nir)
                 _mesa_sha1_compute(module->nir->info->name,
@@ -492,6 +508,48 @@ radv_pipeline_compile(struct radv_pipeline *pipeline,
         return variant;
  }
  
+/**
+ * Compute the scratch requirements of a pipeline.
+ *
+ * Walks all shader stages present in the pipeline and records on the
+ * pipeline:
+ *   - scratch_bytes_per_wave: the largest per-wave scratch size requested
+ *     by any stage;
+ *   - max_waves: the largest per-stage wave limit, itself capped by
+ *     device->scratch_waves and by a VGPR-based bound.
+ *
+ * \param device    device providing scratch_waves and the hardware info
+ * \param pipeline  pipeline whose shaders[] are already compiled; its
+ *                  scratch_bytes_per_wave and max_waves are written on
+ *                  success only
+ *
+ * \return VK_SUCCESS, or VK_ERROR_OUT_OF_DEVICE_MEMORY when scratch is
+ *         needed but fewer than the minimum number of waves can be served.
+ */
+static VkResult
+radv_pipeline_scratch_init(struct radv_device *device,
+                           struct radv_pipeline *pipeline)
+{
+       unsigned scratch_bytes_per_wave = 0;
+       unsigned max_waves = 0;
+       unsigned min_waves = 1;
+
+       for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
+               unsigned max_stage_waves = device->scratch_waves;
+
+               if (!pipeline->shaders[i])
+                       continue;
+
+               scratch_bytes_per_wave = MAX2(scratch_bytes_per_wave,
+                                             pipeline->shaders[i]->config.scratch_bytes_per_wave);
+
+               /* Bound the wave count by register usage.  A shader that
+                * reports zero VGPRs imposes no such bound; skipping the
+                * clamp also avoids dividing by zero.
+                * NOTE(review): the constants presumably mean 4 SIMDs per
+                * compute unit with 256 VGPRs each - confirm.
+                */
+               if (pipeline->shaders[i]->config.num_vgprs) {
+                       max_stage_waves = MIN2(max_stage_waves,
+                                 4 * device->physical_device->rad_info.num_good_compute_units *
+                                 (256 / pipeline->shaders[i]->config.num_vgprs));
+               }
+               max_waves = MAX2(max_waves, max_stage_waves);
+       }
+
+       if (pipeline->shaders[MESA_SHADER_COMPUTE]) {
+               unsigned group_size = pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[0] *
+                                     pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[1] *
+                                     pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[2];
+               /* NOTE(review): presumably the number of 64-wide waves one
+                * workgroup occupies - confirm against round_up_u32().
+                */
+               min_waves = MAX2(min_waves, round_up_u32(group_size, 64));
+       }
+
+       /* Keep max_waves * scratch_bytes_per_wave representable in 32 bits. */
+       if (scratch_bytes_per_wave)
+               max_waves = MIN2(max_waves, 0xffffffffu / scratch_bytes_per_wave);
+
+       if (scratch_bytes_per_wave && max_waves < min_waves) {
+               /* Not really true at this moment, but will be true on first
+                * execution. Avoid having hanging shaders. */
+               return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+       }
+       pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
+       pipeline->max_waves = max_waves;
+       return VK_SUCCESS;
+}
+
  static uint32_t si_translate_blend_function(VkBlendOp op)
  {
         switch (op) {
@@ -1028,7 +1086,7 @@ radv_pipeline_init_multisample_state(struct radv_pipeline *pipeline,
         const VkPipelineMultisampleStateCreateInfo *vkms = pCreateInfo->pMultisampleState;
         struct radv_blend_state *blend = &pipeline->graphics.blend;
         struct radv_multisample_state *ms = &pipeline->graphics.ms;
-       unsigned num_tile_pipes = pipeline->device->instance->physicalDevice.rad_info.num_tile_pipes;
+       unsigned num_tile_pipes = pipeline->device->physical_device->rad_info.num_tile_pipes;
         int ps_iter_samples = 1;
         uint32_t mask = 0xffff;
  
@@ -1306,8 +1364,8 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
                    const VkAllocationCallbacks *alloc)
  {
         struct radv_shader_module fs_m = {0};
+       VkResult result;
  
-       bool dump = getenv("RADV_DUMP_SHADERS");
         if (alloc == NULL)
                 alloc = &device->alloc;
  
@@ -1334,7 +1392,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
                                                pStages[MESA_SHADER_VERTEX]->pName,
                                                MESA_SHADER_VERTEX,
                                                pStages[MESA_SHADER_VERTEX]->pSpecializationInfo,
-                                              pipeline->layout, &key, dump);
+                                              pipeline->layout, &key);
  
                 pipeline->active_stages |= mesa_to_vk_shader_stage(MESA_SHADER_VERTEX);
         }
@@ -1359,7 +1417,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
                                                stage ? stage->pName : "main",
                                                MESA_SHADER_FRAGMENT,
                                                stage ? stage->pSpecializationInfo : NULL,
-                                              pipeline->layout, &key, dump);
+                                              pipeline->layout, &key);
                 pipeline->active_stages |= mesa_to_vk_shader_stage(MESA_SHADER_FRAGMENT);
         }
  
@@ -1411,11 +1469,12 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
                 pipeline->binding_stride[desc->binding] = desc->stride;
         }
  
-       if (device->shader_stats_dump) {
+       if (device->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) {
                 radv_dump_pipeline_stats(device, pipeline);
         }
  
-       return VK_SUCCESS;
+       result = radv_pipeline_scratch_init(device, pipeline);
+       return result;
  }
  
  VkResult
@@ -1441,7 +1500,7 @@ radv_graphics_pipeline_create(
         result = radv_pipeline_init(pipeline, device, cache,
                                     pCreateInfo, extra, pAllocator);
         if (result != VK_SUCCESS) {
-               vk_free2(&device->alloc, pAllocator, pipeline);
+               radv_pipeline_destroy(device, pipeline, pAllocator);
                 return result;
         }
  
@@ -1487,7 +1546,7 @@ static VkResult radv_compute_pipeline_create(
         RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
         RADV_FROM_HANDLE(radv_shader_module, module, pCreateInfo->stage.module);
         struct radv_pipeline *pipeline;
-       bool dump = getenv("RADV_DUMP_SHADERS");
+       VkResult result;
  
         pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -1503,11 +1562,18 @@ static VkResult radv_compute_pipeline_create(
                                        pCreateInfo->stage.pName,
                                        MESA_SHADER_COMPUTE,
                                        pCreateInfo->stage.pSpecializationInfo,
-                                      pipeline->layout, NULL, dump);
+                                      pipeline->layout, NULL);
+
+
+       result = radv_pipeline_scratch_init(device, pipeline);
+       if (result != VK_SUCCESS) {
+               radv_pipeline_destroy(device, pipeline, pAllocator);
+               return result;
+       }
  
         *pPipeline = radv_pipeline_to_handle(pipeline);
  
-       if (device->shader_stats_dump) {
+       if (device->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) {
                 radv_dump_pipeline_stats(device, pipeline);
         }
         return VK_SUCCESS;