radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
pipeline->graphics.prim_restart_enable);
+ cmd_buffer->scratch_size_needed =
+ MAX2(cmd_buffer->scratch_size_needed,
+ pipeline->max_waves * pipeline->scratch_bytes_per_wave);
+
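+	/* SPI_TMPRING_SIZE.WAVESIZE is in units of 256 dwords (1 KiB), hence
+	 * the >> 10 on the per-wave byte count. */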
+ radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE,
+ S_0286E8_WAVES(pipeline->max_waves) |
+ S_0286E8_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
cmd_buffer->state.emitted_pipeline = pipeline;
}
free(up);
}
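+	/* Reset the scratch requirements for this recording; they grow again
+	 * as pipelines are bound. */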
+ cmd_buffer->scratch_size_needed = 0;
+ cmd_buffer->compute_scratch_size_needed = 0;
if (cmd_buffer->upload.upload_bo)
cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
cmd_buffer->upload.upload_bo, 8);
radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
+
+ cmd_buffer->compute_scratch_size_needed =
+ MAX2(cmd_buffer->compute_scratch_size_needed,
+ pipeline->max_waves * pipeline->scratch_bytes_per_wave);
+
-	/* change these once we have scratch support */
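+	/* Like SPI_TMPRING_SIZE, WAVESIZE here is in 256-dword (1 KiB)
+	 * units. */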
radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
- S_00B860_WAVES(32) | S_00B860_WAVESIZE(0));
+ S_00B860_WAVES(pipeline->max_waves) |
+ S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
radeon_emit(cmd_buffer->cs,
for (uint32_t i = 0; i < commandBufferCount; i++) {
RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
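+		/* Propagate the secondaries' scratch requirements to the
+		 * primary so that enough scratch is allocated when the
+		 * primary is submitted. */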
+ primary->scratch_size_needed = MAX2(primary->scratch_size_needed,
+ secondary->scratch_size_needed);
+ primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
+ secondary->compute_scratch_size_needed);
+
primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
}
}
}
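+	/* Spilling needs compiler-side support; it is assumed here that only
+	 * LLVM >= 4.0 can spill correctly for radv. */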
+#if HAVE_LLVM < 0x0400
+ device->llvm_supports_spill = false;
+#else
+ device->llvm_supports_spill = true;
+#endif
+
+ /* The maximum number of scratch waves. Scratch space isn't divided
+ * evenly between CUs. The number is only a function of the number of CUs.
+ * We can decrease the constant to decrease the scratch buffer size.
+ *
+	 * scratch_waves must be >= the maximum possible size of
+ * 1 threadgroup, so that the hw doesn't hang from being unable
+ * to start any.
+ *
+ * The recommended value is 4 per CU at most. Higher numbers don't
+ * bring much benefit, but they still occupy chip resources (think
+ * async compute). I've seen ~2% performance difference between 4 and 32.
+ */
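+	/* For example: a 36-CU part gets 32 * 36 = 1152 scratch waves, while
+	 * the MAX2 lower bound of 2048 / 64 = 32 waves only matters for very
+	 * small parts. The 2048 is assumed to match the maximum threadgroup
+	 * size we advertise. */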
+ uint32_t max_threads_per_block = 2048;
+ device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units,
+ max_threads_per_block / 64);
+
result = radv_device_init_meta(device);
if (result != VK_SUCCESS)
goto fail;
vk_free2(&device->alloc, pAllocator, module);
}
+
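+/* Factored out of radv_DestroyPipeline so that pipeline creation error
+ * paths below can free the shader variants too, not just the pipeline
+ * struct. */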
+static void
+radv_pipeline_destroy(struct radv_device *device,
+ struct radv_pipeline *pipeline,
+ const VkAllocationCallbacks* allocator)
+{
+ for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
+ if (pipeline->shaders[i])
+ radv_shader_variant_destroy(device, pipeline->shaders[i]);
+
+ vk_free2(&device->alloc, allocator, pipeline);
+}
+
void radv_DestroyPipeline(
VkDevice _device,
VkPipeline _pipeline,
if (!_pipeline)
return;
- for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
- if (pipeline->shaders[i])
- radv_shader_variant_destroy(device, pipeline->shaders[i]);
-
- vk_free2(&device->alloc, pAllocator, pipeline);
+ radv_pipeline_destroy(device, pipeline, pAllocator);
}
return variant;
}
+
+static VkResult
+radv_pipeline_scratch_init(struct radv_device *device,
+ struct radv_pipeline *pipeline)
+{
+ unsigned scratch_bytes_per_wave = 0;
+ unsigned max_waves = 0;
+ unsigned min_waves = 1;
+
+ for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
+ if (pipeline->shaders[i]) {
+ unsigned max_stage_waves = device->scratch_waves;
+
+ scratch_bytes_per_wave = MAX2(scratch_bytes_per_wave,
+ pipeline->shaders[i]->config.scratch_bytes_per_wave);
+
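+			/* A GCN CU has 4 SIMDs, each with a 256-entry VGPR
+			 * file, so a shader using num_vgprs VGPRs limits each
+			 * CU to roughly 4 * (256 / num_vgprs) waves in
+			 * flight. */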
+ max_stage_waves = MIN2(max_stage_waves,
+ 4 * device->physical_device->rad_info.num_good_compute_units *
+ (256 / pipeline->shaders[i]->config.num_vgprs));
+ max_waves = MAX2(max_waves, max_stage_waves);
+ }
+ }
+
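+	/* With 64-lane waves, one threadgroup needs group_size / 64 waves,
+	 * rounded up, and all of them must be able to run at once. */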
+ if (pipeline->shaders[MESA_SHADER_COMPUTE]) {
+ unsigned group_size = pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[0] *
+ pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[1] *
+ pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[2];
+ min_waves = MAX2(min_waves, round_up_u32(group_size, 64));
+ }
+
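+	/* Clamp so that max_waves * scratch_bytes_per_wave cannot overflow
+	 * 32 bits. */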
+ if (scratch_bytes_per_wave)
+ max_waves = MIN2(max_waves, 0xffffffffu / scratch_bytes_per_wave);
+
+ if (scratch_bytes_per_wave && max_waves < min_waves) {
+		/* Not really out of device memory yet, but the scratch
+		 * allocation would become unsatisfiable on first execution.
+		 * Fail now instead of letting shaders hang later. */
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ }
+ pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
+ pipeline->max_waves = max_waves;
+ return VK_SUCCESS;
+}
+
static uint32_t si_translate_blend_function(VkBlendOp op)
{
switch (op) {
const VkAllocationCallbacks *alloc)
{
struct radv_shader_module fs_m = {0};
+ VkResult result;
if (alloc == NULL)
alloc = &device->alloc;
radv_dump_pipeline_stats(device, pipeline);
}
- return VK_SUCCESS;
+ result = radv_pipeline_scratch_init(device, pipeline);
+ return result;
}
VkResult
result = radv_pipeline_init(pipeline, device, cache,
pCreateInfo, extra, pAllocator);
if (result != VK_SUCCESS) {
- vk_free2(&device->alloc, pAllocator, pipeline);
+ radv_pipeline_destroy(device, pipeline, pAllocator);
return result;
}
RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
RADV_FROM_HANDLE(radv_shader_module, module, pCreateInfo->stage.module);
struct radv_pipeline *pipeline;
+ VkResult result;
pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
pCreateInfo->stage.pSpecializationInfo,
pipeline->layout, NULL);
+
+ result = radv_pipeline_scratch_init(device, pipeline);
+ if (result != VK_SUCCESS) {
+ radv_pipeline_destroy(device, pipeline, pAllocator);
+ return result;
+ }
+
*pPipeline = radv_pipeline_to_handle(pipeline);
if (device->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) {