From ff360a52e6700d1085876169ed3b328910b394de Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 11 Feb 2016 15:49:34 +0100 Subject: [PATCH] radeonsi: implement binary shaders & shader cache in memory (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit v2: handle _mesa_hash_table_insert failure other cosmetic changes Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/si_pipe.c | 5 +- src/gallium/drivers/radeonsi/si_pipe.h | 16 ++ src/gallium/drivers/radeonsi/si_shader.h | 4 +- src/gallium/drivers/radeonsi/si_state.h | 2 + .../drivers/radeonsi/si_state_shaders.c | 239 +++++++++++++++++- 5 files changed, 259 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 30f3ec0753a..37fd4a25d59 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -564,7 +564,7 @@ static void si_destroy_screen(struct pipe_screen* pscreen) } } pipe_mutex_destroy(sscreen->shader_parts_mutex); - + si_destroy_shader_cache(sscreen); r600_destroy_common_screen(&sscreen->b); } @@ -612,7 +612,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) sscreen->b.b.resource_create = r600_resource_create_common; if (!r600_common_screen_init(&sscreen->b, ws) || - !si_init_gs_info(sscreen)) { + !si_init_gs_info(sscreen) || + !si_init_shader_cache(sscreen)) { FREE(sscreen); return NULL; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 1ac7bc4bd85..ef860a58b83 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -80,6 +80,7 @@ #define SI_MAX_BORDER_COLORS 4096 struct si_compute; +struct hash_table; struct si_screen { struct r600_common_screen b; @@ -94,6 +95,21 @@ struct si_screen { struct si_shader_part *tcs_epilogs; struct si_shader_part *ps_prologs; struct si_shader_part *ps_epilogs; + + /* Shader cache in memory. + * + * Design & limitations: + * - The shader cache is per screen (= per process), never saved to + * disk, and skips redundant shader compilations from TGSI to bytecode. + * - It can only be used with one-variant-per-shader support, in which + * case only the main (typically middle) part of shaders is cached. + * - Only VS, TCS, TES, PS are cached, out of which only the hw VS + * variants of VS and TES are cached, so LS and ES aren't. + * - GS and CS aren't cached, but it's certainly possible to cache + * those as well. + */ + pipe_mutex shader_cache_mutex; + struct hash_table *shader_cache; }; struct si_blend_color { diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b299b7b2c0a..ff5c24d8918 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -364,8 +364,10 @@ struct si_shader { struct r600_resource *bo; struct r600_resource *scratch_bo; union si_shader_key key; - struct radeon_shader_binary binary; bool is_binary_shared; + + /* The following data is all that's needed for binary shaders. */ + struct radeon_shader_binary binary; struct si_shader_config config; struct si_shader_info info; }; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index f64c4d45f1b..40792cbc1d5 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -280,6 +280,8 @@ si_create_sampler_view_custom(struct pipe_context *ctx, /* si_state_shader.c */ bool si_update_shaders(struct si_context *sctx); void si_init_shader_functions(struct si_context *sctx); +bool si_init_shader_cache(struct si_screen *sscreen); +void si_destroy_shader_cache(struct si_screen *sscreen); /* si_state_draw.c */ void si_emit_cache_flush(struct si_context *sctx, struct r600_atom *atom); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index c62cbb72c91..a6753a7a528 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -32,10 +32,221 @@ #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_ureg.h" +#include "util/hash_table.h" +#include "util/u_hash.h" #include "util/u_memory.h" #include "util/u_prim.h" #include "util/u_simple_shaders.h" +/* SHADER_CACHE */ + +/** + * Return the TGSI binary in a buffer. The first 4 bytes contain its size as + * integer. + */ +static void *si_get_tgsi_binary(struct si_shader_selector *sel) +{ + unsigned tgsi_size = tgsi_num_tokens(sel->tokens) * + sizeof(struct tgsi_token); + unsigned size = 4 + tgsi_size + sizeof(sel->so); + char *result = (char*)MALLOC(size); + + if (!result) + return NULL; + + *((uint32_t*)result) = size; + memcpy(result + 4, sel->tokens, tgsi_size); + memcpy(result + 4 + tgsi_size, &sel->so, sizeof(sel->so)); + return result; +} + +/** Copy "data" to "ptr" and return the next dword following copied data. */ +static uint32_t *write_data(uint32_t *ptr, const void *data, unsigned size) +{ + memcpy(ptr, data, size); + ptr += DIV_ROUND_UP(size, 4); + return ptr; +} + +/** Read data from "ptr". Return the next dword following the data. */ +static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size) +{ + memcpy(data, ptr, size); + ptr += DIV_ROUND_UP(size, 4); + return ptr; +} + +/** + * Write the size as uint followed by the data. Return the next dword + * following the copied data. + */ +static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size) +{ + *ptr++ = size; + return write_data(ptr, data, size); +} + +/** + * Read the size as uint followed by the data. Return both via parameters. + * Return the next dword following the data. + */ +static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size) +{ + *size = *ptr++; + assert(*data == NULL); + *data = malloc(*size); + return read_data(ptr, *data, *size); +} + +/** + * Return the shader binary in a buffer. The first 4 bytes contain its size + * as integer. + */ +static void *si_get_shader_binary(struct si_shader *shader) +{ + /* There is always a size of data followed by the data itself. */ + unsigned relocs_size = shader->binary.reloc_count * + sizeof(shader->binary.relocs[0]); + unsigned disasm_size = strlen(shader->binary.disasm_string) + 1; + unsigned size = + 4 + /* total size */ + 4 + /* CRC32 of the data below */ + align(sizeof(shader->config), 4) + + align(sizeof(shader->info), 4) + + 4 + align(shader->binary.code_size, 4) + + 4 + align(shader->binary.rodata_size, 4) + + 4 + align(relocs_size, 4) + + 4 + align(disasm_size, 4); + void *buffer = CALLOC(1, size); + uint32_t *ptr = (uint32_t*)buffer; + + if (!buffer) + return NULL; + + *ptr++ = size; + ptr++; /* CRC32 is calculated at the end. */ + + ptr = write_data(ptr, &shader->config, sizeof(shader->config)); + ptr = write_data(ptr, &shader->info, sizeof(shader->info)); + ptr = write_chunk(ptr, shader->binary.code, shader->binary.code_size); + ptr = write_chunk(ptr, shader->binary.rodata, shader->binary.rodata_size); + ptr = write_chunk(ptr, shader->binary.relocs, relocs_size); + ptr = write_chunk(ptr, shader->binary.disasm_string, disasm_size); + assert((char *)ptr - (char *)buffer == size); + + /* Compute CRC32. */ + ptr = (uint32_t*)buffer; + ptr++; + *ptr = util_hash_crc32(ptr + 1, size - 8); + + return buffer; +} + +static bool si_load_shader_binary(struct si_shader *shader, void *binary) +{ + uint32_t *ptr = (uint32_t*)binary; + uint32_t size = *ptr++; + uint32_t crc32 = *ptr++; + unsigned chunk_size; + + if (util_hash_crc32(ptr, size - 8) != crc32) { + fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n"); + return false; + } + + ptr = read_data(ptr, &shader->config, sizeof(shader->config)); + ptr = read_data(ptr, &shader->info, sizeof(shader->info)); + ptr = read_chunk(ptr, (void**)&shader->binary.code, + &shader->binary.code_size); + ptr = read_chunk(ptr, (void**)&shader->binary.rodata, + &shader->binary.rodata_size); + ptr = read_chunk(ptr, (void**)&shader->binary.relocs, &chunk_size); + shader->binary.reloc_count = chunk_size / sizeof(shader->binary.relocs[0]); + ptr = read_chunk(ptr, (void**)&shader->binary.disasm_string, &chunk_size); + + return true; +} + +/** + * Insert a shader into the cache. It's assumed the shader is not in the cache. + * Use si_shader_cache_load_shader before calling this. + * + * Returns false on failure, in which case the tgsi_binary should be freed. + */ +static bool si_shader_cache_insert_shader(struct si_screen *sscreen, + void *tgsi_binary, + struct si_shader *shader) +{ + void *hw_binary = si_get_shader_binary(shader); + + if (!hw_binary) + return false; + + if (_mesa_hash_table_insert(sscreen->shader_cache, tgsi_binary, + hw_binary) == NULL) { + FREE(hw_binary); + return false; + } + + return true; +} + +static bool si_shader_cache_load_shader(struct si_screen *sscreen, + void *tgsi_binary, + struct si_shader *shader) +{ + struct hash_entry *entry = + _mesa_hash_table_search(sscreen->shader_cache, tgsi_binary); + if (!entry) + return false; + + return si_load_shader_binary(shader, entry->data); +} + +static uint32_t si_shader_cache_key_hash(const void *key) +{ + /* The first dword is the key size. */ + return util_hash_crc32(key, *(uint32_t*)key); +} + +static bool si_shader_cache_key_equals(const void *a, const void *b) +{ + uint32_t *keya = (uint32_t*)a; + uint32_t *keyb = (uint32_t*)b; + + /* The first dword is the key size. */ + if (*keya != *keyb) + return false; + + return memcmp(keya, keyb, *keya) == 0; +} + +static void si_destroy_shader_cache_entry(struct hash_entry *entry) +{ + FREE((void*)entry->key); + FREE(entry->data); +} + +bool si_init_shader_cache(struct si_screen *sscreen) +{ + pipe_mutex_init(sscreen->shader_cache_mutex); + sscreen->shader_cache = + _mesa_hash_table_create(NULL, + si_shader_cache_key_hash, + si_shader_cache_key_equals); + return sscreen->shader_cache != NULL; +} + +void si_destroy_shader_cache(struct si_screen *sscreen) +{ + if (sscreen->shader_cache) + _mesa_hash_table_destroy(sscreen->shader_cache, + si_destroy_shader_cache_entry); + pipe_mutex_destroy(sscreen->shader_cache_mutex); +} + +/* SHADER STATES */ + static void si_set_tesseval_regs(struct si_shader *shader, struct si_pm4_state *pm4) { @@ -936,17 +1147,37 @@ static void *si_create_shader_selector(struct pipe_context *ctx, if (sel->type != PIPE_SHADER_GEOMETRY && !sscreen->use_monolithic_shaders) { struct si_shader *shader = CALLOC_STRUCT(si_shader); + void *tgsi_binary; if (!shader) goto error; shader->selector = sel; - if (si_compile_tgsi_shader(sscreen, sctx->tm, shader, false, - &sctx->b.debug) != 0) { - FREE(shader); - goto error; + tgsi_binary = si_get_tgsi_binary(sel); + + /* Try to load the shader from the shader cache. */ + pipe_mutex_lock(sscreen->shader_cache_mutex); + + if (tgsi_binary && + si_shader_cache_load_shader(sscreen, tgsi_binary, shader)) { + FREE(tgsi_binary); + } else { + /* Compile the shader if it hasn't been loaded from the cache. */ + if (si_compile_tgsi_shader(sscreen, sctx->tm, shader, false, + &sctx->b.debug) != 0) { + FREE(shader); + FREE(tgsi_binary); + pipe_mutex_unlock(sscreen->shader_cache_mutex); + goto error; + } + + if (tgsi_binary && + !si_shader_cache_insert_shader(sscreen, tgsi_binary, shader)) + FREE(tgsi_binary); } + pipe_mutex_unlock(sscreen->shader_cache_mutex); + sel->main_shader_part = shader; } -- 2.30.2