From 3ec9975555d1cc5365413ad9062f412904f944a3 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 18 Oct 2016 15:20:22 +0200 Subject: [PATCH] radeonsi: eliminate trivial constant VS outputs MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit These constant value VS PARAM exports: - 0,0,0,0 - 0,0,0,1 - 1,1,1,0 - 1,1,1,1 can be loaded into PS inputs using the DEFAULT_VAL field, and the VS exports can be removed from the IR to save export & parameter memory. After LLVM optimizations, analyze the IR to see which exports are equal to the ones listed above (or undef) and remove them if they are. Targeted use cases: - All DX9 eON ports always clear 10 VS outputs to 0.0 even if most of them are unused by PS (such as Witcher 2 below). - VS output arrays with unused elements that the GLSL compiler can't eliminate (such as Batman below). The shader-db deltas are quite interesting: (not from upstream si-report.py, it won't be upstreamed) PERCENTAGE DELTAS Shaders PARAM exports (affected only) batman_arkham_origins 589 -67.17 % bioshock-infinite 1769 -0.47 % dirt-showdown 548 -2.68 % dota2 1747 -3.36 % f1-2015 776 -4.94 % left_4_dead_2 1762 -0.07 % metro_2033_redux 2670 -0.43 % portal 474 -0.22 % talos_principle 324 -3.63 % warsow 176 -2.20 % witcher2 1040 -73.78 % ---------------------------------------- All affected 991 -65.37 % ... 9681 -> 3353 ---------------------------------------- Total 26725 -10.82 % ... 58490 -> 52162 v2: treat Undef as both 0 and 1 Reviewed-by: Nicolai Hähnle (v1) Tested-by: Edmondo Tommasina (v1) --- src/gallium/drivers/radeonsi/si_shader.c | 160 ++++++++++++++++++ src/gallium/drivers/radeonsi/si_shader.h | 11 ++ .../drivers/radeonsi/si_state_shaders.c | 17 +- 3 files changed, 186 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 6a42a8f5dd9..a810d9ae8fc 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -6523,6 +6523,159 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier; } +/* Return true if the PARAM export has been eliminated. */ +static bool si_eliminate_const_output(struct si_shader_context *ctx, + LLVMValueRef inst, unsigned offset) +{ + struct si_shader *shader = ctx->shader; + unsigned num_outputs = shader->selector->info.num_outputs; + unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */ + bool is_zero[4] = {}, is_one[4] = {}; + + for (i = 0; i < 4; i++) { + LLVMBool loses_info; + LLVMValueRef p = LLVMGetOperand(inst, 5 + i); + if (!LLVMIsConstant(p)) + return false; + + /* It's a constant expression. Undef outputs are eliminated too. */ + if (LLVMIsUndef(p)) { + is_zero[i] = true; + is_one[i] = true; + } else { + double a = LLVMConstRealGetDouble(p, &loses_info); + + if (a == 0) + is_zero[i] = true; + else if (a == 1) + is_one[i] = true; + else + return false; /* other constant */ + } + } + + /* Only certain combinations of 0 and 1 can be eliminated. */ + if (is_zero[0] && is_zero[1] && is_zero[2]) + default_val = is_zero[3] ? 0 : 1; + else if (is_one[0] && is_one[1] && is_one[2]) + default_val = is_zero[3] ? 2 : 3; + else + return false; + + /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */ + LLVMInstructionEraseFromParent(inst); + + /* Change OFFSET to DEFAULT_VAL. */ + for (i = 0; i < num_outputs; i++) { + if (shader->info.vs_output_param_offset[i] == offset) { + shader->info.vs_output_param_offset[i] = + EXP_PARAM_DEFAULT_VAL_0000 + default_val; + break; + } + } + return true; +} + +struct si_vs_exports { + unsigned num; + unsigned offset[SI_MAX_VS_OUTPUTS]; + LLVMValueRef inst[SI_MAX_VS_OUTPUTS]; +}; + +static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx) +{ + struct si_shader *shader = ctx->shader; + struct tgsi_shader_info *info = &shader->selector->info; + LLVMBasicBlockRef bb; + struct si_vs_exports exports; + bool removed_any = false; + + exports.num = 0; + + if ((ctx->type == PIPE_SHADER_VERTEX && + (shader->key.vs.as_es || shader->key.vs.as_ls)) || + (ctx->type == PIPE_SHADER_TESS_EVAL && shader->key.tes.as_es)) + return; + + /* Process all LLVM instructions. */ + bb = LLVMGetFirstBasicBlock(ctx->main_fn); + while (bb) { + LLVMValueRef inst = LLVMGetFirstInstruction(bb); + + while (inst) { + LLVMValueRef cur = inst; + inst = LLVMGetNextInstruction(inst); + + if (LLVMGetInstructionOpcode(cur) != LLVMCall) + continue; + + LLVMValueRef callee = LLVMGetCalledValue(cur); + LLVMValueKind kind = LLVMGetValueKind(callee); + + if (kind != LLVMFunctionValueKind) + continue; + + const char *name = LLVMGetValueName(callee); + unsigned num_args = LLVMCountParams(callee); + + /* Check if this is an export instruction. */ + if (num_args != 9 || strcmp(name, "llvm.SI.export")) + continue; + + LLVMValueRef arg = LLVMGetOperand(cur, 3); + unsigned target = LLVMConstIntGetZExtValue(arg); + + if (target < V_008DFC_SQ_EXP_PARAM) + continue; + + target -= V_008DFC_SQ_EXP_PARAM; + + /* Eliminate constant value PARAM exports. */ + if (si_eliminate_const_output(ctx, cur, target)) { + removed_any = true; + } else { + exports.offset[exports.num] = target; + exports.inst[exports.num] = cur; + exports.num++; + } + } + bb = LLVMGetNextBasicBlock(bb); + } + + /* Remove holes in export memory due to removed PARAM exports. + * This is done by renumbering all PARAM exports. + */ + if (removed_any) { + ubyte current_offset[SI_MAX_VS_OUTPUTS]; + unsigned new_count = 0; + unsigned out, i; + + /* Make a copy of the offsets. We need the old version while + * we are modifying some of them. */ + assert(sizeof(current_offset) == + sizeof(shader->info.vs_output_param_offset)); + memcpy(current_offset, shader->info.vs_output_param_offset, + sizeof(current_offset)); + + for (i = 0; i < exports.num; i++) { + unsigned offset = exports.offset[i]; + + for (out = 0; out < info->num_outputs; out++) { + if (current_offset[out] != offset) + continue; + + LLVMSetOperand(exports.inst[i], 3, + LLVMConstInt(ctx->i32, + V_008DFC_SQ_EXP_PARAM + new_count, 0)); + shader->info.vs_output_param_offset[out] = new_count; + new_count++; + break; + } + } + shader->info.nr_param_exports = new_count; + } +} + int si_compile_tgsi_shader(struct si_screen *sscreen, LLVMTargetMachineRef tm, struct si_shader *shader, @@ -6546,6 +6699,9 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, si_init_shader_ctx(&ctx, sscreen, shader, tm); ctx.is_monolithic = is_monolithic; + memset(shader->info.vs_output_param_offset, 0xff, + sizeof(shader->info.vs_output_param_offset)); + shader->info.uses_instanceid = sel->info.uses_instanceid; bld_base = &ctx.soa.bld_base; @@ -6630,6 +6786,10 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, si_llvm_finalize_module(&ctx, r600_extra_shader_checks(&sscreen->b, ctx.type)); + /* Post-optimization transformations. */ + si_eliminate_const_vs_outputs(&ctx); + + /* Compile to bytecode. */ r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm, mod, debug, ctx.type, "TGSI shader"); if (r) { diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b07210c90f5..6c7a05f01bf 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -415,6 +415,17 @@ struct si_shader_config { unsigned rsrc2; }; +enum { + /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */ + EXP_PARAM_OFFSET_0 = 0, + EXP_PARAM_OFFSET_31 = 31, + /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */ + EXP_PARAM_DEFAULT_VAL_0000 = 64, + EXP_PARAM_DEFAULT_VAL_0001, + EXP_PARAM_DEFAULT_VAL_1110, + EXP_PARAM_DEFAULT_VAL_1111, +}; + /* GCN-specific shader info. */ struct si_shader_info { ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 137a5d1c842..f59bfcd7d41 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1567,7 +1567,7 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, unsigned index, unsigned interpolate) { struct tgsi_shader_info *vsinfo = &vs->selector->info; - unsigned j, ps_input_cntl = 0; + unsigned j, offset, ps_input_cntl = 0; if (interpolate == TGSI_INTERPOLATE_CONSTANT || (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade)) @@ -1582,7 +1582,20 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, for (j = 0; j < vsinfo->num_outputs; j++) { if (name == vsinfo->output_semantic_name[j] && index == vsinfo->output_semantic_index[j]) { - ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[j]); + offset = vs->info.vs_output_param_offset[j]; + + if (offset <= EXP_PARAM_OFFSET_31) { + /* The input is loaded from parameter memory. */ + ps_input_cntl |= S_028644_OFFSET(offset); + } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { + /* The input is a DEFAULT_VAL constant. */ + assert(offset >= EXP_PARAM_DEFAULT_VAL_0000 && + offset <= EXP_PARAM_DEFAULT_VAL_1111); + + offset -= EXP_PARAM_DEFAULT_VAL_0000; + ps_input_cntl = S_028644_OFFSET(0x20) | + S_028644_DEFAULT_VAL(offset); + } break; } } -- 2.30.2