From 483f5b348b0f3c0ca7082fd2047c354e8af285e7 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 22 Aug 2011 10:35:24 -0700 Subject: [PATCH] i965/vs: Add support for pull constant loads for uniform arrays. v2: reworked the instruction emit and made use of gen6_resolve_implied_move, from Ken's review --- src/mesa/drivers/dri/i965/brw_defines.h | 1 + src/mesa/drivers/dri/i965/brw_eu.h | 3 + src/mesa/drivers/dri/i965/brw_eu_emit.c | 2 +- src/mesa/drivers/dri/i965/brw_vec4.h | 11 ++ src/mesa/drivers/dri/i965/brw_vec4_emit.cpp | 46 +++++- .../drivers/dri/i965/brw_vec4_visitor.cpp | 137 +++++++++++++++++- 6 files changed, 195 insertions(+), 5 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index d1799c0ab4f..5f349392ae9 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -643,6 +643,7 @@ enum opcode { VS_OPCODE_URB_WRITE, VS_OPCODE_SCRATCH_READ, VS_OPCODE_SCRATCH_WRITE, + VS_OPCODE_PULL_CONSTANT_LOAD, }; #define BRW_PREDICATE_NONE 0 diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index af50305fc2b..31334ce8c49 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -807,6 +807,9 @@ void brw_set_dest(struct brw_compile *p, struct brw_instruction *insn, void brw_set_src0(struct brw_compile *p, struct brw_instruction *insn, struct brw_reg reg); +void gen6_resolve_implied_move(struct brw_compile *p, + struct brw_reg *src, + GLuint msg_reg_nr); /* Helpers for regular instructions: */ diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index e8d09983405..f9f8d49a0d0 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -58,7 +58,7 @@ static void guess_execution_size(struct brw_compile *p, * On Sandybridge, this is no longer the case. 
This function performs the * explicit move; it should be called before emitting a SEND instruction. */ -static void +void gen6_resolve_implied_move(struct brw_compile *p, struct brw_reg *src, GLuint msg_reg_nr) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index c6071544f61..1bb15016b52 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -370,6 +370,7 @@ public: */ dst_reg output_reg[VERT_RESULT_MAX]; int uniform_size[MAX_UNIFORMS]; + int uniform_vector_size[MAX_UNIFORMS]; int uniforms; struct hash_table *variable_ht; @@ -386,6 +387,7 @@ public: void reg_allocate_trivial(); void reg_allocate(); void move_grf_array_access_to_scratch(); + void move_uniform_array_access_to_pull_constants(); void calculate_live_intervals(); bool dead_code_eliminate(); bool virtual_grf_interferes(int a, int b); @@ -447,6 +449,8 @@ public: src_reg get_scratch_offset(vec4_instruction *inst, src_reg *reladdr, int reg_offset); + src_reg get_pull_constant_offset(vec4_instruction *inst, + src_reg *reladdr, int reg_offset); void emit_scratch_read(vec4_instruction *inst, dst_reg dst, src_reg orig_src, @@ -455,6 +459,10 @@ public: src_reg temp, dst_reg orig_dst, int base_offset); + void emit_pull_constant_load(vec4_instruction *inst, + dst_reg dst, + src_reg orig_src, + int base_offset); GLboolean try_emit_sat(ir_expression *ir); @@ -490,6 +498,9 @@ public: void generate_scratch_read(vec4_instruction *inst, struct brw_reg dst, struct brw_reg index); + void generate_pull_constant_load(vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg index); }; } /* namespace brw */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp index 11d0278ccd0..49514070f34 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp @@ -114,7 +114,7 @@ vec4_visitor::setup_uniforms(int reg) * matter what, or the GPU would hang. 
*/ if (intel->gen < 6 && this->uniforms == 0) { - this->uniform_size[this->uniforms] = 1; + this->uniform_vector_size[this->uniforms] = 1; for (unsigned int i = 0; i < 4; i++) { unsigned int slot = this->uniforms * 4 + i; @@ -229,6 +229,9 @@ vec4_instruction::get_src(int i) brw_reg = brw_abs(brw_reg); if (src[i].negate) brw_reg = negate(brw_reg); + + /* This should have been moved to pull constants: move_uniform_array_access_to_pull_constants() rewrites UNIFORM sources that carry a reladdr and clears the reladdr. */ + assert(!src[i].reladdr); break; case HW_REG: @@ -487,6 +490,42 @@ vec4_visitor::generate_scratch_write(vec4_instruction *inst, false /* commit */); } +void +vec4_visitor::generate_pull_constant_load(vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg index) +{ + struct brw_reg header = brw_vec8_grf(0, 0); /* GRF 0 is the source of the message header */ + + gen6_resolve_implied_move(p, &header, inst->base_mrf); + + brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_D), + index); /* the offset operand goes in the message register right after base_mrf */ + + uint32_t msg_type; /* OWORD dual block read; the enum name differs per hardware generation */ + + if (intel->gen >= 6) + msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else if (intel->gen == 5 || intel->is_g4x) + msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else + msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + + /* Each of the 8 channel enables is considered for whether each + * dword is written.
+ */ + struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, dst); + brw_set_src0(p, send, header); + brw_set_dp_read_message(p, send, + SURF_INDEX_VERT_CONST_BUFFER, + BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, + msg_type, + BRW_DATAPORT_READ_TARGET_DATA_CACHE, + 2, /* mlen: header at base_mrf plus the offset at base_mrf + 1 */ + 1 /* rlen: one response register, presumably the vec4 landing in dst */); +} + void vec4_visitor::generate_vs_instruction(vec4_instruction *instruction, struct brw_reg dst, @@ -529,6 +568,10 @@ vec4_visitor::generate_vs_instruction(vec4_instruction *instruction, generate_scratch_write(inst, dst, src[0], src[1]); break; + case VS_OPCODE_PULL_CONSTANT_LOAD: + generate_pull_constant_load(inst, dst, src[0]); + break; + default: if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) { fail("unsupported opcode in `%s' in VS\n", @@ -556,6 +599,7 @@ vec4_visitor::run() * often do repeated subexpressions for those. */ move_grf_array_access_to_scratch(); + move_uniform_array_access_to_pull_constants(); bool progress; do { diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 912145538a6..c4a3bbadd40 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -381,7 +381,10 @@ vec4_visitor::setup_uniform_values(int loc, const glsl_type *type) c->prog_data.param[this->uniforms * 4 + i] = &zero; } - this->uniform_size[this->uniforms] = type->vector_elements; + /* Track the size of this uniform vector, for future packing of + * uniforms. + */ + this->uniform_vector_size[this->uniforms] = type->vector_elements; this->uniforms++; return 1; @@ -429,7 +432,7 @@ vec4_visitor::setup_builtin_uniform_values(ir_variable *ir) (gl_state_index *)slots[i].tokens); float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f; - this->uniform_size[this->uniforms] = 0; + this->uniform_vector_size[this->uniforms] = 0; /* Add each of the unique swizzled channels of the element.
* This will end up matching the size of the glsl_type of this field. */ @@ -440,7 +443,7 @@ vec4_visitor::setup_builtin_uniform_values(ir_variable *ir) c->prog_data.param[this->uniforms * 4 + j] = &values[swiz]; if (swiz <= last_swiz) - this->uniform_size[this->uniforms]++; + this->uniform_vector_size[this->uniforms]++; } this->uniforms++; } @@ -677,6 +680,11 @@ vec4_visitor::visit(ir_variable *ir) case ir_var_uniform: reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms); + /* Track how big the whole uniform variable is, in case we need to put a + * copy of its data into pull constants for array access (move_uniform_array_access_to_pull_constants() reads this to size the copy). + */ + this->uniform_size[this->uniforms] = type_size(ir->type); + if (!strncmp(ir->name, "gl_", 3)) { setup_builtin_uniform_values(ir); } else { @@ -1947,6 +1955,42 @@ vec4_visitor::get_scratch_offset(vec4_instruction *inst, } } +src_reg +vec4_visitor::get_pull_constant_offset(vec4_instruction *inst, + src_reg *reladdr, int reg_offset) +{ + if (reladdr) { + src_reg index = src_reg(this, glsl_type::int_type); /* temporary that receives reladdr + reg_offset */ + + vec4_instruction *add = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, + dst_reg(index), + *reladdr, + src_reg(reg_offset)); + add->ir = inst->ir; + add->annotation = inst->annotation; + inst->insert_before(add); + + /* Pre-gen6, the message header uses byte offsets instead of vec4 + * (16-byte) offset units, so the vec4 index is multiplied by 16 there. + */ + if (intel->gen < 6) { + vec4_instruction *mul = new(mem_ctx) vec4_instruction(this, + BRW_OPCODE_MUL, + dst_reg(index), + index, + src_reg(16)); + mul->ir = inst->ir; + mul->annotation = inst->annotation; + inst->insert_before(mul); + } + + return index; + } else { + int message_header_scale = intel->gen < 6 ? 16 : 1; /* constant offsets follow the same unit rule: bytes before gen6, vec4s from gen6 on */ + return src_reg(reg_offset * message_header_scale); + } +} + /** * Emits an instruction before @inst to load the value named by @orig_src * from scratch space at @base_offset to @temp.
@@ -2072,6 +2116,93 @@ vec4_visitor::move_grf_array_access_to_scratch() } } +/** + * Emits an instruction before @inst to load the value named by @orig_src + * from the pull constant buffer (surface) at @base_offset to @temp. @base_offset is in vec4 units; get_pull_constant_offset() folds in @orig_src's reg_offset and any reladdr. + */ +void +vec4_visitor::emit_pull_constant_load(vec4_instruction *inst, + dst_reg temp, src_reg orig_src, + int base_offset) +{ + int reg_offset = base_offset + orig_src.reg_offset; + src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset); + vec4_instruction *load; + + load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD, + temp, index); + load->annotation = inst->annotation; + load->ir = inst->ir; + load->base_mrf = 14; + load->mlen = 1; /* NOTE(review): generate_pull_constant_load() sends mlen 2 (base_mrf and base_mrf + 1) -- confirm 1 is intended here */ + inst->insert_before(load); +} + +/** + * Implements array access of uniforms by inserting a + * PULL_CONSTANT_LOAD instruction. + * + * Unlike temporary GRF array access (where we don't support it due to + * the difficulty of doing relative addressing on instruction + * destinations), we could potentially do array access of uniforms + * that were loaded in GRF space as push constants. In real-world + * usage we've seen, though, the arrays being used are always larger + * than we could load as push constants, so just always move all + * uniform array access out to a pull constant buffer. + */ +void +vec4_visitor::move_uniform_array_access_to_pull_constants() +{ + int pull_constant_loc[this->uniforms]; /* vec4 offset of each uniform's copy in the pull buffer, -1 until copied; NOTE(review): runtime-sized array is a compiler extension in C++ */ + + for (int i = 0; i < this->uniforms; i++) { + pull_constant_loc[i] = -1; + } + + /* Walk through and find array access of uniforms. Put a copy of that + * uniform in the pull constant buffer. + * + * Note that we don't move constant-indexed accesses to arrays. No + * testing has been done of the performance impact of this choice.
+ */ + foreach_list_safe(node, &this->instructions) { + vec4_instruction *inst = (vec4_instruction *)node; + + for (int i = 0 ; i < 3; i++) { + if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr) + continue; + + int uniform = inst->src[i].reg; + + /* If this array isn't already present in the pull constant buffer, + * add it. + */ + if (pull_constant_loc[uniform] == -1) { + const float **values = &prog_data->param[uniform * 4]; + + pull_constant_loc[uniform] = prog_data->nr_pull_params / 4; /* nr_pull_params counts floats, but the load offset is in vec4 units */ + + for (int j = 0; j < uniform_size[uniform] * 4; j++) { + prog_data->pull_param[prog_data->nr_pull_params++] = values[j]; + } + } + + /* Set up the annotation tracking for newly generated instructions. */ + base_ir = inst->ir; + current_annotation = inst->annotation; + + dst_reg temp = dst_reg(this, glsl_type::vec4_type); + + emit_pull_constant_load(inst, temp, inst->src[i], + pull_constant_loc[uniform]); + + inst->src[i].file = temp.file; + inst->src[i].reg = temp.reg; + inst->src[i].reg_offset = temp.reg_offset; + inst->src[i].reladdr = NULL; /* the relative address was consumed by the load emitted above */ + } + } +} vec4_visitor::vec4_visitor(struct brw_vs_compile *c, struct gl_shader_program *prog, -- 2.30.2