From: Jordan Justen Date: Thu, 28 Aug 2014 22:27:22 +0000 (-0700) Subject: i965/cs: Emit compute shader code and upload programs X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=932045061b5850368e8a4a5b3e6609eba6ed8d66;p=mesa.git i965/cs: Emit compute shader code and upload programs v2: * Don't bother checking for 'gen > 5' (krh) * Populate sampler data in key (krh) v3: * Drop no8 support, and simplify code in several places (Ken) Signed-off-by: Jordan Justen Reviewed-by: Kenneth Graunke --- diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index c0a2fc1811f..10e954362a8 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -148,6 +148,7 @@ struct brw_vs_prog_key; struct brw_vue_prog_key; struct brw_wm_prog_key; struct brw_wm_prog_data; +struct brw_cs_prog_key; struct brw_cs_prog_data; enum brw_pipeline { diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp index 80211475fb5..1b88fd40f3a 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.cpp +++ b/src/mesa/drivers/dri/i965/brw_cs.cpp @@ -22,8 +22,15 @@ */ +#include "util/ralloc.h" #include "brw_context.h" #include "brw_cs.h" +#include "brw_fs.h" +#include "brw_eu.h" +#include "brw_wm.h" +#include "intel_mipmap_tree.h" +#include "brw_state.h" +#include "intel_batchbuffer.h" extern "C" bool @@ -46,3 +53,204 @@ brw_cs_prog_data_compare(const void *in_a, const void *in_b) return true; } + + +static const unsigned * +brw_cs_emit(struct brw_context *brw, + void *mem_ctx, + const struct brw_cs_prog_key *key, + struct brw_cs_prog_data *prog_data, + struct gl_compute_program *cp, + struct gl_shader_program *prog, + unsigned *final_assembly_size) +{ + bool start_busy = false; + double start_time = 0; + + if (unlikely(brw->perf_debug)) { + start_busy = (brw->batch.last_bo && + drm_intel_bo_busy(brw->batch.last_bo)); + start_time = get_time(); + } + + struct brw_shader *shader = + (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE]; + + if (unlikely(INTEL_DEBUG & DEBUG_CS)) + brw_dump_ir("compute", prog, &shader->base, &cp->Base); + + prog_data->local_size[0] = cp->LocalSize[0]; + prog_data->local_size[1] = cp->LocalSize[1]; + prog_data->local_size[2] = cp->LocalSize[2]; + int local_workgroup_size = + cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2]; + + cfg_t *cfg = NULL; + const char *fail_msg = NULL; + + /* Now the main event: Visit the shader IR and generate our CS IR for it. + */ + fs_visitor v8(brw, mem_ctx, key, prog_data, prog, cp, 8); + if (!v8.run_cs()) { + fail_msg = v8.fail_msg; + } else if (local_workgroup_size <= 8 * brw->max_cs_threads) { + cfg = v8.cfg; + prog_data->simd_size = 8; + } + + fs_visitor v16(brw, mem_ctx, key, prog_data, prog, cp, 16); + if (likely(!(INTEL_DEBUG & DEBUG_NO16)) && + !fail_msg && !v8.simd16_unsupported && + local_workgroup_size <= 16 * brw->max_cs_threads) { + /* Try a SIMD16 compile */ + v16.import_uniforms(&v8); + if (!v16.run_cs()) { + perf_debug("SIMD16 shader failed to compile: %s", v16.fail_msg); + if (!cfg) { + fail_msg = + "Couldn't generate SIMD16 program and not " + "enough threads for SIMD8"; + } + } else { + cfg = v16.cfg; + prog_data->simd_size = 16; + } + } + + if (unlikely(cfg == NULL)) { + assert(fail_msg); + prog->LinkStatus = false; + ralloc_strcat(&prog->InfoLog, fail_msg); + _mesa_problem(NULL, "Failed to compile compute shader: %s\n", + fail_msg); + return NULL; + } + + fs_generator g(brw, mem_ctx, (void*) key, &prog_data->base, &cp->Base, + v8.promoted_constants, v8.runtime_check_aads_emit, "CS"); + if (INTEL_DEBUG & DEBUG_CS) { + char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d", + prog->Label ? prog->Label : "unnamed", + prog->Name); + g.enable_debug(name); + } + + g.generate_code(cfg, prog_data->simd_size); + + if (unlikely(brw->perf_debug) && shader) { + if (shader->compiled_once) { + _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles"); + } + shader->compiled_once = true; + + if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { + perf_debug("CS compile took %.03f ms and stalled the GPU\n", + (get_time() - start_time) * 1000); + } + } + + return g.get_assembly(final_assembly_size); +} + +static bool +brw_codegen_cs_prog(struct brw_context *brw, + struct gl_shader_program *prog, + struct brw_compute_program *cp, + struct brw_cs_prog_key *key) +{ + struct gl_context *ctx = &brw->ctx; + const GLuint *program; + void *mem_ctx = ralloc_context(NULL); + GLuint program_size; + struct brw_cs_prog_data prog_data; + + struct gl_shader *cs = prog->_LinkedShaders[MESA_SHADER_COMPUTE]; + assert (cs); + + memset(&prog_data, 0, sizeof(prog_data)); + + /* Allocate the references to the uniforms that will end up in the + * prog_data associated with the compiled program, and which will be freed + * by the state cache. + */ + int param_count = cs->num_uniform_components; + + /* The backend also sometimes adds params for texture size. */ + param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits; + prog_data.base.param = + rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.pull_param = + rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.nr_params = param_count; + + program = brw_cs_emit(brw, mem_ctx, key, &prog_data, + &cp->program, prog, &program_size); + if (program == NULL) { + ralloc_free(mem_ctx); + return false; + } + + if (prog_data.base.total_scratch) { + brw_get_scratch_bo(brw, &brw->cs.base.scratch_bo, + prog_data.base.total_scratch * brw->max_cs_threads); + } + + if (unlikely(INTEL_DEBUG & DEBUG_CS)) + fprintf(stderr, "\n"); + + brw_upload_cache(&brw->cache, BRW_CACHE_CS_PROG, + key, sizeof(*key), + program, program_size, + &prog_data, sizeof(prog_data), + &brw->cs.base.prog_offset, &brw->cs.prog_data); + ralloc_free(mem_ctx); + + return true; +} + + +static void +brw_cs_populate_key(struct brw_context *brw, struct brw_cs_prog_key *key) +{ + struct gl_context *ctx = &brw->ctx; + /* BRW_NEW_COMPUTE_PROGRAM */ + const struct brw_compute_program *cp = + (struct brw_compute_program *) brw->compute_program; + const struct gl_program *prog = (struct gl_program *) cp; + + memset(key, 0, sizeof(*key)); + + /* The unique compute program ID */ + key->program_string_id = cp->id; +} + + +extern "C" +void +brw_upload_cs_prog(struct brw_context *brw) +{ + struct gl_context *ctx = &brw->ctx; + struct brw_cs_prog_key key; + struct brw_compute_program *cp = (struct brw_compute_program *) + brw->compute_program; + + if (!cp) + return; + + if (!brw_state_dirty(brw, 0, BRW_NEW_COMPUTE_PROGRAM)) + return; + + brw_cs_populate_key(brw, &key); + + if (!brw_search_cache(&brw->cache, BRW_CACHE_CS_PROG, + &key, sizeof(key), + &brw->cs.base.prog_offset, &brw->cs.prog_data)) { + bool success = + brw_codegen_cs_prog(brw, + ctx->Shader.CurrentProgram[MESA_SHADER_COMPUTE], + cp, &key); + (void) success; + assert(success); + } + brw->cs.base.prog_data = &brw->cs.prog_data->base; +} diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index 5c5420d2e82..d086f3983c0 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -40,6 +40,7 @@ #include "brw_ff_gs.h" #include "brw_gs.h" #include "brw_wm.h" +#include "brw_cs.h" static const struct brw_tracked_state *gen4_atoms[] = { @@ -618,6 +619,8 @@ brw_upload_programs(struct brw_context *brw, brw_upload_gs_prog(brw); brw_upload_wm_prog(brw); + } else if (pipeline == BRW_COMPUTE_PIPELINE) { + brw_upload_cs_prog(brw); } }