From f9d5a7add42af5a2e4410526d1480a08f41317ae Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Tue, 31 Oct 2017 00:34:32 -0700 Subject: [PATCH] i965: Calculate thread_count in brw_alloc_stage_scratch Previously, thread_count was sent in from the stage after some stage specific calculations. Those stage specific calculations were moved into brw_alloc_stage_scratch, which will allow the shader cache to also use the same calculations. Signed-off-by: Jordan Justen Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_context.h | 3 +- src/mesa/drivers/dri/i965/brw_cs.c | 24 +--------- src/mesa/drivers/dri/i965/brw_gs.c | 3 +- src/mesa/drivers/dri/i965/brw_program.c | 64 +++++++++++++++++++++---- src/mesa/drivers/dri/i965/brw_tcs.c | 3 +- src/mesa/drivers/dri/i965/brw_tes.c | 3 +- src/mesa/drivers/dri/i965/brw_vs.c | 3 +- src/mesa/drivers/dri/i965/brw_wm.c | 4 +- 8 files changed, 62 insertions(+), 45 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 0102f154248..3bee3e99ed2 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1345,8 +1345,7 @@ void brw_get_scratch_bo(struct brw_context *brw, struct brw_bo **scratch_bo, int size); void brw_alloc_stage_scratch(struct brw_context *brw, struct brw_stage_state *stage_state, - unsigned per_thread_size, - unsigned thread_count); + unsigned per_thread_size); void brw_init_shader_time(struct brw_context *brw); int brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog, diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c index 1a0e9f62c63..1d34a8a79d1 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.c +++ b/src/mesa/drivers/dri/i965/brw_cs.c @@ -114,29 +114,7 @@ brw_codegen_cs_prog(struct brw_context *brw, } } - const unsigned subslices = MAX2(brw->screen->subslice_total, 1); - - /* WaCSScratchSize:hsw - * - * Haswell's scratch space address calculation appears to be sparse - * rather than tightly packed. The Thread ID has bits indicating - * which subslice, EU within a subslice, and thread within an EU - * it is. There's a maximum of two slices and two subslices, so these - * can be stored with a single bit. Even though there are only 10 EUs - * per subslice, this is stored in 4 bits, so there's an effective - * maximum value of 16 EUs. Similarly, although there are only 7 - * threads per EU, this is stored in a 3 bit number, giving an effective - * maximum value of 8 threads per EU. - * - * This means that we need to use 16 * 8 instead of 10 * 7 for the - * number of threads per subslice. - */ - const unsigned scratch_ids_per_subslice = - devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads; - - brw_alloc_stage_scratch(brw, &brw->cs.base, - prog_data.base.total_scratch, - scratch_ids_per_subslice * subslices); + brw_alloc_stage_scratch(brw, &brw->cs.base, prog_data.base.total_scratch); /* The param and pull_param arrays will be freed by the shader cache. */ ralloc_steal(NULL, prog_data.base.param); diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c index e6e757ce686..19eab2f5332 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.c +++ b/src/mesa/drivers/dri/i965/brw_gs.c @@ -138,8 +138,7 @@ brw_codegen_gs_prog(struct brw_context *brw, /* Scratch space is used for register spilling */ brw_alloc_stage_scratch(brw, stage_state, - prog_data.base.base.total_scratch, - devinfo->max_gs_threads); + prog_data.base.base.total_scratch); /* The param and pull_param arrays will be freed by the shader cache. */ ralloc_steal(NULL, prog_data.base.base.param); diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 02d095b06f6..7607bc38840 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -328,19 +328,65 @@ brw_get_scratch_bo(struct brw_context *brw, void brw_alloc_stage_scratch(struct brw_context *brw, struct brw_stage_state *stage_state, - unsigned per_thread_size, - unsigned thread_count) + unsigned per_thread_size) { - if (stage_state->per_thread_scratch < per_thread_size) { - stage_state->per_thread_scratch = per_thread_size; + if (stage_state->per_thread_scratch >= per_thread_size) + return; + + stage_state->per_thread_scratch = per_thread_size; - if (stage_state->scratch_bo) - brw_bo_unreference(stage_state->scratch_bo); + if (stage_state->scratch_bo) + brw_bo_unreference(stage_state->scratch_bo); + + const struct gen_device_info *devinfo = &brw->screen->devinfo; + unsigned thread_count; + switch(stage_state->stage) { + case MESA_SHADER_VERTEX: + thread_count = devinfo->max_vs_threads; + break; + case MESA_SHADER_TESS_CTRL: + thread_count = devinfo->max_tcs_threads; + break; + case MESA_SHADER_TESS_EVAL: + thread_count = devinfo->max_tes_threads; + break; + case MESA_SHADER_GEOMETRY: + thread_count = devinfo->max_gs_threads; + break; + case MESA_SHADER_FRAGMENT: + thread_count = devinfo->max_wm_threads; + break; + case MESA_SHADER_COMPUTE: { + const unsigned subslices = MAX2(brw->screen->subslice_total, 1); + + /* WaCSScratchSize:hsw + * + * Haswell's scratch space address calculation appears to be sparse + * rather than tightly packed. The Thread ID has bits indicating + * which subslice, EU within a subslice, and thread within an EU + * it is. There's a maximum of two slices and two subslices, so these + * can be stored with a single bit. Even though there are only 10 EUs + * per subslice, this is stored in 4 bits, so there's an effective + * maximum value of 16 EUs. Similarly, although there are only 7 + * threads per EU, this is stored in a 3 bit number, giving an effective + * maximum value of 8 threads per EU. + * + * This means that we need to use 16 * 8 instead of 10 * 7 for the + * number of threads per subslice. + */ + const unsigned scratch_ids_per_subslice = + devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads; - stage_state->scratch_bo = - brw_bo_alloc(brw->bufmgr, "shader scratch space", - per_thread_size * thread_count, 4096); + thread_count = scratch_ids_per_subslice * subslices; + break; } + default: + unreachable("Unsupported stage!"); + } + + stage_state->scratch_bo = + brw_bo_alloc(brw->bufmgr, "shader scratch space", + per_thread_size * thread_count, 4096); } void brwInitFragProgFuncs( struct dd_function_table *functions ) diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c index 5ac728ee68f..685336eb973 100644 --- a/src/mesa/drivers/dri/i965/brw_tcs.c +++ b/src/mesa/drivers/dri/i965/brw_tcs.c @@ -259,8 +259,7 @@ brw_codegen_tcs_prog(struct brw_context *brw, struct brw_program *tcp, /* Scratch space is used for register spilling */ brw_alloc_stage_scratch(brw, stage_state, - prog_data.base.base.total_scratch, - devinfo->max_tcs_threads); + prog_data.base.base.total_scratch); /* The param and pull_param arrays will be freed by the shader cache. */ ralloc_steal(NULL, prog_data.base.base.param); diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c index 21f44ae74af..225d2401c7e 100644 --- a/src/mesa/drivers/dri/i965/brw_tes.c +++ b/src/mesa/drivers/dri/i965/brw_tes.c @@ -129,8 +129,7 @@ brw_codegen_tes_prog(struct brw_context *brw, /* Scratch space is used for register spilling */ brw_alloc_stage_scratch(brw, stage_state, - prog_data.base.base.total_scratch, - devinfo->max_tes_threads); + prog_data.base.base.total_scratch); /* The param and pull_param arrays will be freed by the shader cache. */ ralloc_steal(NULL, prog_data.base.base.param); diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index d308bb83332..de0ba79f357 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -248,8 +248,7 @@ brw_codegen_vs_prog(struct brw_context *brw, /* Scratch space is used for register spilling */ brw_alloc_stage_scratch(brw, &brw->vs.base, - prog_data.base.base.total_scratch, - devinfo->max_vs_threads); + prog_data.base.base.total_scratch); /* The param and pull_param arrays will be freed by the shader cache. */ ralloc_steal(NULL, prog_data.base.base.param); diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 4144cd11ee4..34a3a1e5c16 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -209,9 +209,7 @@ brw_codegen_wm_prog(struct brw_context *brw, } } - brw_alloc_stage_scratch(brw, &brw->wm.base, - prog_data.base.total_scratch, - devinfo->max_wm_threads); + brw_alloc_stage_scratch(brw, &brw->wm.base, prog_data.base.total_scratch); if (unlikely((INTEL_DEBUG & DEBUG_WM) && fp->program.is_arb_asm)) fprintf(stderr, "\n"); -- 2.30.2