From: Kenneth Graunke Date: Thu, 9 Jun 2016 05:21:22 +0000 (-0700) Subject: i965: Fix shared local memory size for Gen9+. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=87d062a94080373995170f51063a9649c96c6dea;p=mesa.git i965: Fix shared local memory size for Gen9+. Skylake changes the representation of shared local memory size: Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB | ------------------------------------------------------------------- Gen7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 | ------------------------------------------------------------------- Gen9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | The old formula would substantially underallocate the amount of space. This fixes GPU hangs on Skylake when running with full thread counts. v2: Fix the Vulkan driver too, use a helper function, and fix the table in the comments and commit message. Cc: "12.0" Signed-off-by: Kenneth Graunke Reviewed-by: Francisco Jerez Reviewed-by: Jordan Justen --- diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c index ffd1571c507..ba4c8b6b0a7 100644 --- a/src/intel/vulkan/gen7_cmd_buffer.c +++ b/src/intel/vulkan/gen7_cmd_buffer.c @@ -241,15 +241,7 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) } } - assert(prog_data->total_shared <= 64 * 1024); - uint32_t slm_size = 0; - if (prog_data->total_shared > 0) { - /* slm_size is in 4k increments, but must be a power of 2. */ - slm_size = 4 * 1024; - while (slm_size < prog_data->total_shared) - slm_size <<= 1; - slm_size /= 4 * 1024; - } + const uint32_t slm_size = encode_slm_size(GEN_GEN, prog_data->total_shared); struct anv_state state = anv_state_pool_emit(&device->dynamic_state_pool, diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c index 1fdc7d29ea0..df4036acbd4 100644 --- a/src/intel/vulkan/gen8_cmd_buffer.c +++ b/src/intel/vulkan/gen8_cmd_buffer.c @@ -326,15 +326,7 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) } } - assert(prog_data->total_shared <= 64 * 1024); - uint32_t slm_size = 0; - if (prog_data->total_shared > 0) { - /* slm_size is in 4k increments, but must be a power of 2. */ - slm_size = 4 * 1024; - while (slm_size < prog_data->total_shared) - slm_size <<= 1; - slm_size /= 4 * 1024; - } + const uint32_t slm_size = encode_slm_size(GEN_GEN, prog_data->total_shared); struct anv_state state = anv_state_pool_emit(&device->dynamic_state_pool, diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index f55b7f37ffe..c944effda0a 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -26,6 +26,7 @@ #include #include "brw_device_info.h" #include "main/mtypes.h" +#include "main/macros.h" #ifdef __cplusplus extern "C" { @@ -831,6 +832,38 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, unsigned *final_assembly_size, char **error_str); +static inline uint32_t +encode_slm_size(const struct brw_device_info *devinfo, uint32_t bytes) +{ + uint32_t slm_size = 0; + + /* Shared Local Memory is specified as powers of two, and encoded in + * INTERFACE_DESCRIPTOR_DATA with the following representations: + * + * Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB | + * ------------------------------------------------------------------- + * Gen7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 | + * ------------------------------------------------------------------- + * Gen9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + */ + assert(bytes <= 64 * 1024); + + if (bytes > 0) { + /* Shared Local Memory Size is specified as powers of two. */ + slm_size = util_next_power_of_two(bytes); + + if (devinfo->gen >= 9) { + /* Use a minimum of 1kB; turn an exponent of 10 (1024 kB) into 1. */ + slm_size = ffs(MAX2(slm_size, 1024)) - 10; + } else { + /* Use a minimum of 4kB; convert to the pre-Gen9 representation. */ + slm_size = MAX2(slm_size, 4096) / 4096; + } + } + + return slm_size; +} + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c index 750aa2ccdf1..a71a5957191 100644 --- a/src/mesa/drivers/dri/i965/gen7_cs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c @@ -45,6 +45,7 @@ brw_upload_cs_state(struct brw_context *brw) struct brw_stage_state *stage_state = &brw->cs.base; struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data; struct brw_stage_prog_data *prog_data = &cs_prog_data->base; + const struct brw_device_info *devinfo = brw->intelScreen->devinfo; if (INTEL_DEBUG & DEBUG_SHADER_TIME) { brw->vtbl.emit_buffer_surface_state( @@ -147,15 +148,7 @@ brw_upload_cs_state(struct brw_context *brw) SET_FIELD(cs_prog_data->threads, MEDIA_GPGPU_THREAD_COUNT); assert(cs_prog_data->threads <= brw->max_cs_threads); - assert(prog_data->total_shared <= 64 * 1024); - uint32_t slm_size = 0; - if (prog_data->total_shared > 0) { - /* slm_size is in 4k increments, but must be a power of 2. */ - slm_size = 4 * 1024; - while (slm_size < prog_data->total_shared) - slm_size <<= 1; - slm_size /= 4 * 1024; - } + const uint32_t slm_size = encode_slm_size(devinfo, prog_data->total_shared); desc[dw++] = SET_FIELD(cs_prog_data->uses_barrier, MEDIA_BARRIER_ENABLE) |