From c9e48e5b083b6cf97ecdb2d17c874ea631203b06 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 1 Aug 2012 19:35:18 -0700 Subject: [PATCH] i965: Generalize VS compute-to-MRF for compute-to-another-GRF, too. No statistically significant performance difference on glbenchmark 2.7 (n=60). It reduces cycles spent in the vertex shader by 3.3% +/- 0.8% (n=5), but that's only about .3% of all cycles spent according to the fixed shader_time. Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_vec4.cpp | 129 ++++++++++-------- src/mesa/drivers/dri/i965/brw_vec4.h | 2 +- .../dri/i965/test_vec4_register_coalesce.cpp | 58 +++++++- 3 files changed, 128 insertions(+), 61 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 7ab37e7ca9f..079bbab51ec 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -680,12 +680,12 @@ vec4_instruction::reswizzle_dst(int dst_writemask, int swizzle) } /* - * Tries to reduce extra MOV instructions by taking GRFs that get just - * written and then MOVed into an MRF and making the original write of - * the GRF write directly to the MRF instead. + * Tries to reduce extra MOV instructions by taking temporary GRFs that get + * just written and then MOVed into another reg and making the original write + * of the GRF write directly to the final destination instead. 
*/ bool -vec4_visitor::opt_compute_to_mrf() +vec4_visitor::opt_register_coalesce() { bool progress = false; int next_ip = 0; @@ -699,24 +699,25 @@ vec4_visitor::opt_compute_to_mrf() next_ip++; if (inst->opcode != BRW_OPCODE_MOV || + (inst->dst.file != GRF && inst->dst.file != MRF) || inst->predicate || - inst->dst.file != MRF || inst->src[0].file != GRF || + inst->src[0].file != GRF || inst->dst.type != inst->src[0].type || inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr) continue; - int mrf = inst->dst.reg; + bool to_mrf = (inst->dst.file == MRF); - /* Can't compute-to-MRF this GRF if someone else was going to + /* Can't coalesce this GRF if someone else was going to * read it later. */ if (this->virtual_grf_use[inst->src[0].reg] > ip) continue; - /* We need to check interference with the MRF between this - * instruction and the earliest instruction involved in writing - * the GRF we're eliminating. To do that, keep track of which - * of our source channels we've seen initialized. + /* We need to check interference with the final destination between this + * instruction and the earliest instruction involved in writing the GRF + * we're eliminating. To do that, keep track of which of our source + * channels we've seen initialized. */ bool chans_needed[4] = {false, false, false, false}; int chans_remaining = 0; @@ -735,8 +736,9 @@ vec4_visitor::opt_compute_to_mrf() } } - /* Now walk up the instruction stream trying to see if we can - * rewrite everything writing to the GRF into the MRF instead. + /* Now walk up the instruction stream trying to see if we can rewrite + * everything writing to the temporary to write into the destination + * instead. 
*/ vec4_instruction *scan_inst; for (scan_inst = (vec4_instruction *)inst->prev; @@ -745,22 +747,21 @@ vec4_visitor::opt_compute_to_mrf() if (scan_inst->dst.file == GRF && scan_inst->dst.reg == inst->src[0].reg && scan_inst->dst.reg_offset == inst->src[0].reg_offset) { - /* Found something writing to the reg we want to turn into - * a compute-to-MRF. - */ - - /* SEND instructions can't have MRF as a destination. */ - if (scan_inst->mlen) - break; - - if (intel->gen >= 6) { - /* gen6 math instructions must have the destination be - * GRF, so no compute-to-MRF for them. - */ - if (scan_inst->is_math()) { - break; - } - } + /* Found something writing to the reg we want to coalesce away. */ + if (to_mrf) { + /* SEND instructions can't have MRF as a destination. */ + if (scan_inst->mlen) + break; + + if (intel->gen >= 6) { + /* gen6 math instructions must have the destination be + * GRF, so no compute-to-MRF for them. + */ + if (scan_inst->is_math()) { + break; + } + } + } /* If we can't handle the swizzle, bail. */ if (!scan_inst->can_reswizzle_dst(inst->dst.writemask, @@ -784,9 +785,8 @@ vec4_visitor::opt_compute_to_mrf() break; } - /* We don't handle flow control here. Most computation of - * values that end up in MRFs are shortly before the MRF - * write anyway. + /* We don't handle flow control here. Most computation of values + * that could be coalesced happens just before their use. */ if (scan_inst->opcode == BRW_OPCODE_DO || scan_inst->opcode == BRW_OPCODE_WHILE || @@ -795,9 +795,11 @@ vec4_visitor::opt_compute_to_mrf() break; } - /* You can't read from an MRF, so if someone else reads our - * MRF's source GRF that we wanted to rewrite, that stops us. - */ + /* You can't read from an MRF, so if someone else reads our MRF's + * source GRF that we wanted to rewrite, that stops us. If it's a + * GRF we're trying to coalesce to, we don't actually handle + * rewriting sources so bail in that case as well. 
+ */ bool interfered = false; for (int i = 0; i < 3; i++) { if (scan_inst->src[i].file == GRF && @@ -809,30 +811,41 @@ vec4_visitor::opt_compute_to_mrf() if (interfered) break; - /* If somebody else writes our MRF here, we can't - * compute-to-MRF before that. - */ - if (scan_inst->dst.file == MRF && mrf == scan_inst->dst.reg) + /* If somebody else writes our destination here, we can't coalesce + * before that. + */ + if (scan_inst->dst.file == inst->dst.file && + scan_inst->dst.reg == inst->dst.reg) { break; + } - if (scan_inst->mlen > 0) { - /* Found a SEND instruction, which means that there are - * live values in MRFs from base_mrf to base_mrf + - * scan_inst->mlen - 1. Don't go pushing our MRF write up - * above it. - */ - if (mrf >= scan_inst->base_mrf && - mrf < scan_inst->base_mrf + scan_inst->mlen) { - break; - } - } + /* Check for reads of the register we're trying to coalesce into. We + * can't go rewriting instructions above that to put some other value + * in the register instead (matched on the destination's reg_offset). + */ + if (to_mrf && scan_inst->mlen > 0) { + if (inst->dst.reg >= scan_inst->base_mrf && + inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) { + break; + } + } else { + for (int i = 0; i < 3; i++) { + if (scan_inst->src[i].file == inst->dst.file && + scan_inst->src[i].reg == inst->dst.reg && + scan_inst->src[i].reg_offset == inst->dst.reg_offset) { + interfered = true; + } + } + if (interfered) + break; + } } if (chans_remaining == 0) { - /* If we've made it here, we have an inst we want to - * compute-to-MRF, and a scan_inst pointing to the earliest - * instruction involved in computing the value. Now go - * rewrite the instruction stream between the two. + /* If we've made it here, we have an MOV we want to coalesce out, and + * a scan_inst pointing to the earliest instruction involved in + * computing the value. Now go rewrite the instruction stream + * between the two. 
*/ while (scan_inst != inst) { @@ -841,9 +854,9 @@ vec4_visitor::opt_compute_to_mrf() scan_inst->dst.reg_offset == inst->src[0].reg_offset) { scan_inst->reswizzle_dst(inst->dst.writemask, inst->src[0].swizzle); - scan_inst->dst.file = MRF; - scan_inst->dst.reg = mrf; - scan_inst->dst.reg_offset = 0; + scan_inst->dst.file = inst->dst.file; + scan_inst->dst.reg = inst->dst.reg; + scan_inst->dst.reg_offset = inst->dst.reg_offset; scan_inst->saturate |= inst->saturate; } scan_inst = (vec4_instruction *)scan_inst->next; @@ -1277,7 +1290,7 @@ vec4_visitor::run() progress = dead_code_eliminate() || progress; progress = opt_copy_propagation() || progress; progress = opt_algebraic() || progress; - progress = opt_compute_to_mrf() || progress; + progress = opt_register_coalesce() || progress; } while (progress); diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 6da44d4080a..359a5aed041 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -334,7 +334,7 @@ public: bool virtual_grf_interferes(int a, int b); bool opt_copy_propagation(); bool opt_algebraic(); - bool opt_compute_to_mrf(); + bool opt_register_coalesce(); vec4_instruction *emit(vec4_instruction *inst); diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp index fa9c155655f..45be376fed2 100644 --- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp +++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp @@ -70,7 +70,7 @@ _register_coalesce(vec4_visitor *v, const char *func) v->dump_instructions(); } - v->opt_compute_to_mrf(); + v->opt_register_coalesce(); if (print) { printf("%s: instructions after:\n", func); @@ -78,7 +78,7 @@ _register_coalesce(vec4_visitor *v, const char *func) } } -TEST_F(register_coalesce_test, test_easy_success) +TEST_F(register_coalesce_test, test_compute_to_mrf) { src_reg something = src_reg(v, 
glsl_type::float_type); dst_reg temp = dst_reg(v, glsl_type::float_type); @@ -143,3 +143,57 @@ TEST_F(register_coalesce_test, test_dp4_mrf) EXPECT_EQ(dp4->dst.file, MRF); EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y); } + +TEST_F(register_coalesce_test, test_dp4_grf) +{ + src_reg some_src_1 = src_reg(v, glsl_type::vec4_type); + src_reg some_src_2 = src_reg(v, glsl_type::vec4_type); + dst_reg init; + + dst_reg to = dst_reg(v, glsl_type::vec4_type); + dst_reg temp = dst_reg(v, glsl_type::float_type); + + vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2)); + to.writemask = WRITEMASK_Y; + v->emit(v->MOV(to, src_reg(temp))); + + /* if we don't do something with the result, the automatic dead code + * elimination will remove all our instructions. + */ + src_reg src = src_reg(to); + src.negate = true; + v->emit(v->MOV(dst_reg(MRF, 0), src)); + + register_coalesce(v); + + EXPECT_EQ(dp4->dst.reg, to.reg); + EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y); +} + +TEST_F(register_coalesce_test, test_channel_mul_grf) +{ + src_reg some_src_1 = src_reg(v, glsl_type::vec4_type); + src_reg some_src_2 = src_reg(v, glsl_type::vec4_type); + dst_reg init; + + dst_reg to = dst_reg(v, glsl_type::vec4_type); + dst_reg temp = dst_reg(v, glsl_type::float_type); + + vec4_instruction *mul = v->emit(v->MUL(temp, some_src_1, some_src_2)); + to.writemask = WRITEMASK_Y; + v->emit(v->MOV(to, src_reg(temp))); + + /* if we don't do something with the result, the automatic dead code + * elimination will remove all our instructions. + */ + src_reg src = src_reg(to); + src.negate = true; + v->emit(v->MOV(dst_reg(MRF, 0), src)); + + register_coalesce(v); + + /* This path isn't supported yet in the reswizzling code, so we're checking + * that we haven't done anything bad to scalar non-DP[234]s. + */ + EXPECT_NE(mul->dst.reg, to.reg); +} -- 2.30.2