From e79bb746ab8525d56f71948ebf3d6dbf8a3bdd4d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 27 Jan 2016 00:50:29 +0100 Subject: [PATCH] radeonsi: add PS epilog MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/si_pipe.c | 1 + src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_shader.c | 290 ++++++++++++++++++++++- src/gallium/drivers/radeonsi/si_shader.h | 7 + 4 files changed, 297 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 4415f198c90..3c87dd690f7 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -542,6 +542,7 @@ static void si_destroy_screen(struct pipe_screen* pscreen) sscreen->vs_prologs, sscreen->vs_epilogs, sscreen->tcs_epilogs, + sscreen->ps_epilogs }; unsigned i; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index d9175b932ab..5d204ec6462 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -92,6 +92,7 @@ struct si_screen { struct si_shader_part *vs_prologs; struct si_shader_part *vs_epilogs; struct si_shader_part *tcs_epilogs; + struct si_shader_part *ps_epilogs; }; struct si_blend_color { diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index bcb5c17d39e..8e50d06b30f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -131,6 +131,11 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, LLVMTargetMachineRef tm, struct tgsi_shader_info *info); +/* Ideally pass the sample mask input to the PS epilog as v13, which + * is its usual location, so that the shader doesn't have to add v_mov. + */ +#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13 + /* The VS location of the PrimitiveID input is the same in the epilog, * so that the main shader part doesn't have to move it. */ @@ -2530,6 +2535,100 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) si_export_mrt_z(bld_base, depth, stencil, samplemask); } +/** + * Return PS outputs in this order: + * + * v[0:3] = color0.xyzw + * v[4:7] = color1.xyzw + * ... + * vN+0 = Depth + * vN+1 = Stencil + * vN+2 = SampleMask + * vN+3 = SampleMaskIn (used for OpenGL smoothing) + * + * The alpha-ref SGPR is returned via its original location. + */ +static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader *shader = ctx->shader; + struct lp_build_context *base = &bld_base->base; + struct tgsi_shader_info *info = &shader->selector->info; + LLVMBuilderRef builder = base->gallivm->builder; + unsigned i, j, first_vgpr, vgpr; + + LLVMValueRef color[8][4] = {}; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + LLVMValueRef ret; + + /* Read the output values. */ + for (i = 0; i < info->num_outputs; i++) { + unsigned semantic_name = info->output_semantic_name[i]; + unsigned semantic_index = info->output_semantic_index[i]; + + switch (semantic_name) { + case TGSI_SEMANTIC_COLOR: + assert(semantic_index < 8); + for (j = 0; j < 4; j++) { + LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j]; + LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); + color[semantic_index][j] = result; + } + break; + case TGSI_SEMANTIC_POSITION: + depth = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][2], ""); + break; + case TGSI_SEMANTIC_STENCIL: + stencil = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][1], ""); + break; + case TGSI_SEMANTIC_SAMPLEMASK: + samplemask = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][0], ""); + break; + default: + fprintf(stderr, "Warning: SI unhandled fs output type:%d\n", + semantic_name); + } + } + + /* Fill the return structure. */ + ret = ctx->return_value; + + /* Set SGPRs. */ + ret = LLVMBuildInsertValue(builder, ret, + bitcast(bld_base, TGSI_TYPE_SIGNED, + LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_ALPHA_REF)), + SI_SGPR_ALPHA_REF, ""); + + /* Set VGPRs */ + first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; + for (i = 0; i < ARRAY_SIZE(color); i++) { + if (!color[i][0]) + continue; + + for (j = 0; j < 4; j++) + ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); + } + if (depth) + ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); + if (stencil) + ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); + if (samplemask) + ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); + + /* Add the input sample mask for smoothing at the end. */ + if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) + vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; + ret = LLVMBuildInsertValue(builder, ret, + LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); + + ctx->return_value = ret; +} + static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data); @@ -3723,7 +3822,7 @@ static void create_function(struct si_shader_context *ctx) struct si_shader *shader = ctx->shader; LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32; LLVMTypeRef returns[16+32*4]; - unsigned i, last_array_pointer, last_sgpr, num_params; + unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs; unsigned num_returns = 0; v3i32 = LLVMVectorType(ctx->i32, 3); @@ -3869,6 +3968,27 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32; params[SI_PARAM_POS_FIXED_PT] = ctx->f32; num_params = SI_PARAM_POS_FIXED_PT+1; + + if (!ctx->is_monolithic) { + /* Outputs for the epilog. */ + num_return_sgprs = SI_SGPR_ALPHA_REF + 1; + num_returns = + num_return_sgprs + + util_bitcount(shader->selector->info.colors_written) * 4 + + shader->selector->info.writes_z + + shader->selector->info.writes_stencil + + shader->selector->info.writes_samplemask + + 1 /* SampleMaskIn */; + + num_returns = MAX2(num_returns, + num_return_sgprs + + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + for (i = 0; i < num_return_sgprs; i++) + returns[i] = ctx->i32; + for (; i < num_returns; i++) + returns[i] = ctx->f32; + } break; default: @@ -4664,7 +4784,10 @@ static int si_compile_tgsi_shader(struct si_screen *sscreen, break; case TGSI_PROCESSOR_FRAGMENT: ctx.radeon_bld.load_input = declare_input_fs; - bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; + if (is_monolithic) + bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; + else + bld_base->emit_epilogue = si_llvm_return_fs_outputs; break; default: assert(!"Unsupported shader type"); @@ -5181,6 +5304,159 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen, return shader->epilog != NULL; } +/** + * Compile the pixel shader epilog. This handles everything that must be + * emulated for pixel shader exports. (alpha-test, format conversions, etc) + */ +static bool si_compile_ps_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[16+8*4+3]; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + int last_array_pointer, last_sgpr, num_params, i; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm, NULL); + ctx.type = TGSI_PROCESSOR_FRAGMENT; + shader.key.ps.epilog = key->ps_epilog.states; + + /* Declare input SGPRs. */ + params[SI_PARAM_RW_BUFFERS] = ctx.i64; + params[SI_PARAM_CONST_BUFFERS] = ctx.i64; + params[SI_PARAM_SAMPLERS] = ctx.i64; + params[SI_PARAM_UNUSED] = ctx.i64; + params[SI_PARAM_ALPHA_REF] = ctx.f32; + last_array_pointer = -1; + last_sgpr = SI_PARAM_ALPHA_REF; + + /* Declare input VGPRs. */ + num_params = (last_sgpr + 1) + + util_bitcount(key->ps_epilog.colors_written) * 4 + + key->ps_epilog.writes_z + + key->ps_epilog.writes_stencil + + key->ps_epilog.writes_samplemask; + + num_params = MAX2(num_params, + last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + assert(num_params <= ARRAY_SIZE(params)); + + for (i = last_sgpr + 1; i < num_params; i++) + params[i] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + last_array_pointer, last_sgpr); + /* Disable elimination of unused inputs. */ + radeon_llvm_add_attribute(ctx.radeon_bld.main_fn, + "InitialPSInputAddr", 0xffffff); + + /* Process colors. */ + unsigned vgpr = last_sgpr + 1; + unsigned colors_written = key->ps_epilog.colors_written; + int last_color_export = -1; + + /* Find the last color export. */ + if (!key->ps_epilog.writes_z && + !key->ps_epilog.writes_stencil && + !key->ps_epilog.writes_samplemask) { + unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ + if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { + /* Just set this if any of the colorbuffers are enabled. */ + if (spi_format & + ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) + last_color_export = 0; + } else { + for (i = 0; i < 8; i++) + if (colors_written & (1 << i) && + (spi_format >> (i * 4)) & 0xf) + last_color_export = i; + } + } + + while (colors_written) { + LLVMValueRef color[4]; + int mrt = u_bit_scan(&colors_written); + + for (i = 0; i < 4; i++) + color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + + si_export_mrt_color(bld_base, color, mrt, + num_params - 1, + mrt == last_color_export); + } + + /* Process depth, stencil, samplemask. */ + if (key->ps_epilog.writes_z) + depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + if (key->ps_epilog.writes_stencil) + stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + if (key->ps_epilog.writes_samplemask) + samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + + if (depth || stencil || samplemask) + si_export_mrt_z(bld_base, depth, stencil, samplemask); + else if (last_color_export == -1) + si_export_null(bld_base); + + /* Compile. */ + LLVMBuildRetVoid(gallivm->builder); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Fragment Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Select and compile (or reuse) pixel shader parts (prolog & epilog). + */ +static bool si_shader_select_ps_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + struct tgsi_shader_info *info = &shader->selector->info; + union si_shader_part_key epilog_key; + + /* Get the epilog. */ + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.ps_epilog.colors_written = info->colors_written; + epilog_key.ps_epilog.writes_z = info->writes_z; + epilog_key.ps_epilog.writes_stencil = info->writes_stencil; + epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask; + epilog_key.ps_epilog.states = shader->key.ps.epilog; + + shader->epilog = + si_get_shader_part(sscreen, &sscreen->ps_epilogs, + &epilog_key, tm, debug, + si_compile_ps_epilog); + if (!shader->epilog) + return false; + + /* The sample mask input is always enabled, because the API shader always + * passes it through to the epilog. Disable it here if it's unused. + */ + if (!shader->key.ps.epilog.poly_line_smoothing && + !shader->selector->info.reads_samplemask) + shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; + + return true; +} + int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, struct si_shader *shader, struct pipe_debug_callback *debug) @@ -5207,6 +5483,16 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, if (!si_shader_select_tes_parts(sscreen, tm, shader, debug)) return -1; break; + case PIPE_SHADER_FRAGMENT: + if (!si_shader_select_ps_parts(sscreen, tm, shader, debug)) + return -1; + + /* Make sure we have at least as many VGPRs as there + * are allocated inputs. + */ + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->num_input_vgprs); + break; } /* Update SGPR and VGPR counts. */ diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index e2b9534683e..928cb2e18e9 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -281,6 +281,13 @@ union si_shader_part_key { struct { struct si_tcs_epilog_bits states; } tcs_epilog; + struct { + struct si_ps_epilog_bits states; + unsigned colors_written:8; + unsigned writes_z:1; + unsigned writes_stencil:1; + unsigned writes_samplemask:1; + } ps_epilog; }; union si_shader_key { -- 2.30.2