X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_cs.c;h=990acabc80b2058b860357ded280b989b3ce91e0;hb=bc4a127d6e10318d48fa8b540b9c1ff7d62c8d29;hp=dacb25e5eeafdd2c8aaf1dd370fd4f7de23af875;hpb=6bcc5c0c75226bb89f35d7529de11182051c729e;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c index dacb25e5eea..990acabc80b 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.c +++ b/src/mesa/drivers/dri/i965/brw_cs.c @@ -32,6 +32,36 @@ #include "brw_program.h" #include "compiler/glsl/ir_uniform.h" +struct brw_cs_parameters +brw_cs_get_parameters(const struct brw_context *brw) +{ + assert(brw->cs.base.prog_data); + struct brw_cs_prog_data *cs_prog_data = + brw_cs_prog_data(brw->cs.base.prog_data); + + struct brw_cs_parameters params = {}; + + if (brw->compute.group_size) { + /* With ARB_compute_variable_group_size the group size is set at + * dispatch time, so we can't use the one provided by the compiler. + */ + params.group_size = brw->compute.group_size[0] * + brw->compute.group_size[1] * + brw->compute.group_size[2]; + } else { + params.group_size = cs_prog_data->local_size[0] * + cs_prog_data->local_size[1] * + cs_prog_data->local_size[2]; + } + + params.simd_size = + brw_cs_simd_size_for_group_size(&brw->screen->devinfo, + cs_prog_data, params.group_size); + params.threads = DIV_ROUND_UP(params.group_size, params.simd_size); + + return params; +} + static void assign_cs_binding_table_offsets(const struct gen_device_info *devinfo, const struct gl_program *prog, @@ -55,15 +85,15 @@ brw_codegen_cs_prog(struct brw_context *brw, const struct gen_device_info *devinfo = &brw->screen->devinfo; const GLuint *program; void *mem_ctx = ralloc_context(NULL); - GLuint program_size; struct brw_cs_prog_data prog_data; bool start_busy = false; double start_time = 0; + nir_shader *nir = nir_shader_clone(mem_ctx, cp->program.nir); memset(&prog_data, 0, sizeof(prog_data)); if (cp->program.info.cs.shared_size > 64 * 1024) { - cp->program.sh.data->LinkStatus = linking_failure; + cp->program.sh.data->LinkStatus = LINKING_FAILURE; const char *error_str = "Compute shader used more than 64KB of shared variables"; ralloc_strcat(&cp->program.sh.data->InfoLog, error_str); @@ -71,24 +101,12 @@ brw_codegen_cs_prog(struct brw_context *brw, ralloc_free(mem_ctx); return false; - } else { - prog_data.base.total_shared = cp->program.info.cs.shared_size; } assign_cs_binding_table_offsets(devinfo, &cp->program, &prog_data); - /* Allocate the references to the uniforms that will end up in the - * prog_data associated with the compiled program, and which will be freed - * by the state cache. - */ - int param_count = cp->program.nir->num_uniforms / 4; - - prog_data.base.param = rzalloc_array(NULL, uint32_t, param_count); - prog_data.base.pull_param = rzalloc_array(NULL, uint32_t, param_count); - prog_data.base.nr_params = param_count; - - brw_nir_setup_glsl_uniforms(cp->program.nir, &cp->program,&prog_data.base, - true); + brw_nir_setup_glsl_uniforms(mem_ctx, nir, + &cp->program, &prog_data.base, true); if (unlikely(brw->perf_debug)) { start_busy = (brw->batch.last_bo && @@ -100,12 +118,13 @@ brw_codegen_cs_prog(struct brw_context *brw, if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, &cp->program, ST_CS, true); + brw_nir_lower_cs_intrinsics(nir); + char *error_str; program = brw_compile_cs(brw->screen->compiler, brw, mem_ctx, key, - &prog_data, cp->program.nir, st_index, - &program_size, &error_str); + &prog_data, nir, st_index, NULL, &error_str); if (program == NULL) { - cp->program.sh.data->LinkStatus = linking_failure; + cp->program.sh.data->LinkStatus = LINKING_FAILURE; ralloc_strcat(&cp->program.sh.data->InfoLog, error_str); _mesa_problem(NULL, "Failed to compile compute shader: %s\n", error_str); @@ -115,7 +134,8 @@ brw_codegen_cs_prog(struct brw_context *brw, if (unlikely(brw->perf_debug)) { if (cp->compiled_once) { - _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles"); + brw_debug_recompile(brw, MESA_SHADER_COMPUTE, cp->program.Id, + &key->base); } cp->compiled_once = true; @@ -125,33 +145,14 @@ brw_codegen_cs_prog(struct brw_context *brw, } } - const unsigned subslices = MAX2(brw->screen->subslice_total, 1); - - /* WaCSScratchSize:hsw - * - * Haswell's scratch space address calculation appears to be sparse - * rather than tightly packed. The Thread ID has bits indicating - * which subslice, EU within a subslice, and thread within an EU - * it is. There's a maximum of two slices and two subslices, so these - * can be stored with a single bit. Even though there are only 10 EUs - * per subslice, this is stored in 4 bits, so there's an effective - * maximum value of 16 EUs. Similarly, although there are only 7 - * threads per EU, this is stored in a 3 bit number, giving an effective - * maximum value of 8 threads per EU. - * - * This means that we need to use 16 * 8 instead of 10 * 7 for the - * number of threads per subslice. - */ - const unsigned scratch_ids_per_subslice = - devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads; - - brw_alloc_stage_scratch(brw, &brw->cs.base, - prog_data.base.total_scratch, - scratch_ids_per_subslice * subslices); + brw_alloc_stage_scratch(brw, &brw->cs.base, prog_data.base.total_scratch); + /* The param and pull_param arrays will be freed by the shader cache. */ + ralloc_steal(NULL, prog_data.base.param); + ralloc_steal(NULL, prog_data.base.pull_param); brw_upload_cache(&brw->cache, BRW_CACHE_CS_PROG, key, sizeof(*key), - program, program_size, + program, prog_data.base.program_size, &prog_data, sizeof(prog_data), &brw->cs.base.prog_offset, &brw->cs.base.prog_data); ralloc_free(mem_ctx); @@ -160,22 +161,18 @@ brw_codegen_cs_prog(struct brw_context *brw, } -static void +void brw_cs_populate_key(struct brw_context *brw, struct brw_cs_prog_key *key) { struct gl_context *ctx = &brw->ctx; /* BRW_NEW_COMPUTE_PROGRAM */ const struct brw_program *cp = (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE]; - const struct gl_program *prog = (struct gl_program *) cp; memset(key, 0, sizeof(*key)); /* _NEW_TEXTURE */ - brw_populate_sampler_prog_key_data(ctx, prog, &key->tex); - - /* The unique compute program ID */ - key->program_string_id = cp->id; + brw_populate_base_prog_key(ctx, cp, &key->base); } @@ -198,16 +195,30 @@ brw_upload_cs_prog(struct brw_context *brw) brw_cs_populate_key(brw, &key); - if (!brw_search_cache(&brw->cache, BRW_CACHE_CS_PROG, - &key, sizeof(key), - &brw->cs.base.prog_offset, - &brw->cs.base.prog_data)) { - bool success = brw_codegen_cs_prog(brw, cp, &key); - (void) success; - assert(success); - } + if (brw_search_cache(&brw->cache, BRW_CACHE_CS_PROG, &key, sizeof(key), + &brw->cs.base.prog_offset, &brw->cs.base.prog_data, + true)) + return; + + if (brw_disk_cache_upload_program(brw, MESA_SHADER_COMPUTE)) + return; + + cp = (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE]; + cp->id = key.base.program_string_id; + + ASSERTED bool success = brw_codegen_cs_prog(brw, cp, &key); + assert(success); } +void +brw_cs_populate_default_key(const struct brw_compiler *compiler, + struct brw_cs_prog_key *key, + struct gl_program *prog) +{ + const struct gen_device_info *devinfo = compiler->devinfo; + memset(key, 0, sizeof(*key)); + brw_populate_default_base_prog_key(devinfo, brw_program(prog), &key->base); +} bool brw_cs_precompile(struct gl_context *ctx, struct gl_program *prog) @@ -217,10 +228,7 @@ brw_cs_precompile(struct gl_context *ctx, struct gl_program *prog) struct brw_program *bcp = brw_program(prog); - memset(&key, 0, sizeof(key)); - key.program_string_id = bcp->id; - - brw_setup_tex_for_precompile(brw, &key.tex, prog); + brw_cs_populate_default_key(brw->screen->compiler, &key, prog); uint32_t old_prog_offset = brw->cs.base.prog_offset; struct brw_stage_prog_data *old_prog_data = brw->cs.base.prog_data;