From 4fee05b020af72ee802d4349de76fbc36cdd53a9 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Fri, 30 Nov 2012 18:29:34 -0800
Subject: [PATCH] i965/vs: Add a pass to set dependency control fields on instructions.

This is a more aggressive version of the old brw_optimize() path.

Reduces cycles spent in the vertex shader on minecraft by 18.6% +/- 10.0%
(n=15).

Reviewed-by: Kenneth Graunke
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp      | 115 ++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_vec4.h        |   2 +
 src/mesa/drivers/dri/i965/brw_vec4_emit.cpp |  15 +++
 3 files changed, 132 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 184eff9345b..c58fb444b94 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -22,6 +22,7 @@
  */
 
 #include "brw_vec4.h"
+#include "brw_cfg.h"
 #include "glsl/ir_print_visitor.h"
 
 extern "C" {
@@ -620,6 +621,118 @@ vec4_visitor::move_push_constants_to_pull_constants()
    pack_uniform_registers();
 }
 
+/**
+ * Sets the dependency control fields on instructions after register
+ * allocation and before the generator is run.
+ *
+ * When you have a sequence of instructions like:
+ *
+ *     DP4 temp.x vertex uniform[0]
+ *     DP4 temp.y vertex uniform[0]
+ *     DP4 temp.z vertex uniform[0]
+ *     DP4 temp.w vertex uniform[0]
+ *
+ * The hardware doesn't know that it can actually run the later instructions
+ * while the previous ones are in flight, producing stalls.  However, we have
+ * manual fields we can set in the instructions that let it do so.
+ */
+void
+vec4_visitor::opt_set_dependency_control()
+{
+   /* The *_channels_written entries are only read while the matching
+    * last_*_write entry is non-NULL, so they need no initialization.
+    */
+   vec4_instruction *last_grf_write[BRW_MAX_GRF];
+   uint8_t grf_channels_written[BRW_MAX_GRF];
+   vec4_instruction *last_mrf_write[BRW_MAX_GRF];
+   uint8_t mrf_channels_written[BRW_MAX_GRF];
+
+   cfg_t cfg(this);
+
+   assert(prog_data->total_grf || !"Must be called after register allocation");
+
+   for (int i = 0; i < cfg.num_blocks; i++) {
+      bblock_t *bblock = cfg.blocks[i];
+      vec4_instruction *inst;
+
+      memset(last_grf_write, 0, sizeof(last_grf_write));
+      memset(last_mrf_write, 0, sizeof(last_mrf_write));
+
+      for (inst = (vec4_instruction *)bblock->start;
+           inst != (vec4_instruction *)bblock->end->next;
+           inst = (vec4_instruction *)inst->next) {
+         /* If we read from a register that we were doing dependency control
+          * on, don't do dependency control across the read.
+          */
+         for (int j = 0; j < 3; j++) {
+            int reg = inst->src[j].reg + inst->src[j].reg_offset;
+            if (inst->src[j].file == GRF) {
+               last_grf_write[reg] = NULL;
+            } else if (inst->src[j].file == HW_REG) {
+               memset(last_grf_write, 0, sizeof(last_grf_write));
+               break;
+            }
+            assert(inst->src[j].file != MRF);
+         }
+
+         /* In the presence of send messages, totally interrupt dependency
+          * control.  They're long enough that the chance of dependency
+          * control around them just doesn't matter.
+          */
+         if (inst->mlen) {
+            memset(last_grf_write, 0, sizeof(last_grf_write));
+            memset(last_mrf_write, 0, sizeof(last_mrf_write));
+            continue;
+         }
+
+         /* It looks like setting dependency control on a predicated
+          * instruction hangs the GPU.
+          */
+         if (inst->predicate) {
+            memset(last_grf_write, 0, sizeof(last_grf_write));
+            memset(last_mrf_write, 0, sizeof(last_mrf_write));
+            continue;
+         }
+
+         /* Now, see if we can do dependency control for this instruction
+          * against a previous one writing to its destination.
+          */
+         int reg = inst->dst.reg + inst->dst.reg_offset;
+         if (inst->dst.file == GRF) {
+            if (last_grf_write[reg] &&
+                !(inst->dst.writemask & grf_channels_written[reg])) {
+               last_grf_write[reg]->no_dd_clear = true;
+               inst->no_dd_check = true;
+            } else {
+               grf_channels_written[reg] = 0;
+            }
+
+            last_grf_write[reg] = inst;
+            grf_channels_written[reg] |= inst->dst.writemask;
+         } else if (inst->dst.file == MRF) {
+            if (last_mrf_write[reg] &&
+                !(inst->dst.writemask & mrf_channels_written[reg])) {
+               last_mrf_write[reg]->no_dd_clear = true;
+               inst->no_dd_check = true;
+            } else {
+               mrf_channels_written[reg] = 0;
+            }
+
+            last_mrf_write[reg] = inst;
+            mrf_channels_written[reg] |= inst->dst.writemask;
+         } else if (inst->dst.file == HW_REG) {
+            /* Fixed hardware registers are not tracked by register number
+             * here, so conservatively forget every write in the same file.
+             */
+            if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE)
+               memset(last_grf_write, 0, sizeof(last_grf_write));
+            if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE)
+               memset(last_mrf_write, 0, sizeof(last_mrf_write));
+         }
+      }
+   }
+}
+
 bool
 vec4_instruction::can_reswizzle_dst(int dst_writemask,
                                     int swizzle,
@@ -1355,6 +1468,8 @@ vec4_visitor::run()
          break;
    }
 
+   opt_set_dependency_control();
+
    /* If any state parameters were appended, then ParameterValues could have
     * been realloced, in which case the driver uniform storage set up by
     * _mesa_associate_uniform_storage() would point to freed memory.
Make diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 1f832d19cd8..8f130e15428 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -172,6 +172,7 @@ public: bool saturate; bool force_writemask_all; + bool no_dd_clear, no_dd_check; int conditional_mod; /**< BRW_CONDITIONAL_* */ @@ -337,6 +338,7 @@ public: bool opt_copy_propagation(); bool opt_algebraic(); bool opt_register_coalesce(); + void opt_set_dependency_control(); bool can_do_source_mods(vec4_instruction *inst); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp index cb49a042390..e378f7fd5f0 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp @@ -742,6 +742,8 @@ vec4_generator::generate_code(exec_list *instructions) brw_set_saturate(p, inst->saturate); brw_set_mask_control(p, inst->force_writemask_all); + unsigned pre_emit_nr_insn = p->nr_insn; + switch (inst->opcode) { case BRW_OPCODE_MOV: brw_MOV(p, dst, src[0]); @@ -868,6 +870,19 @@ vec4_generator::generate_code(exec_list *instructions) break; } + if (inst->no_dd_clear || inst->no_dd_check) { + assert(p->nr_insn == pre_emit_nr_insn + 1 || + !"no_dd_check or no_dd_clear set for IR emitting more " + "than 1 instruction"); + + struct brw_instruction *last = &p->store[pre_emit_nr_insn]; + + if (inst->no_dd_clear) + last->header.dependency_control |= BRW_DEPENDENCY_NOTCLEARED; + if (inst->no_dd_check) + last->header.dependency_control |= BRW_DEPENDENCY_NOTCHECKED; + } + if (unlikely(INTEL_DEBUG & DEBUG_VS)) { brw_dump_compile(p, stdout, last_native_insn_offset, p->next_insn_offset); -- 2.30.2