From: Eric Anholt Date: Fri, 22 Oct 2010 19:57:00 +0000 (-0700) Subject: i965: Add support for pull constants to the new FS backend. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=07cd8f46acc34b04308f81de2faf05ba33da264b;p=mesa.git i965: Add support for pull constants to the new FS backend. Fixes glsl-fs-uniform-array-5, but not 6 which fails in ir_to_mesa. --- diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index f205c07a727..4a0709b4468 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -173,8 +173,6 @@ struct brw_fragment_program { GLuint id; /**< serial no. to identify frag progs, never re-used */ GLboolean isGLSL; /**< really, any IF/LOOP/CONT/BREAK instructions */ - GLboolean use_const_buffer; - /** for debugging, which texture units are referenced */ GLbitfield tex_units_used; }; @@ -204,12 +202,14 @@ struct brw_wm_prog_data { GLuint total_scratch; GLuint nr_params; /**< number of float params/constants */ + GLuint nr_pull_params; GLboolean error; /* Pointer to tracked values (only valid once * _mesa_load_state_parameters has been called at runtime). */ - const GLfloat *param[BRW_MAX_CURBE]; + const GLfloat *param[MAX_UNIFORMS * 4]; /* should be: BRW_MAX_CURBE */ + const GLfloat *pull_param[MAX_UNIFORMS * 4]; }; struct brw_sf_prog_data { diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index 0e3ccfa46c8..15c2f23d6a5 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -900,14 +900,19 @@ void brw_math2(struct brw_compile *p, void brw_oword_block_read(struct brw_compile *p, struct brw_reg dest, struct brw_reg mrf, - int num_regs, - GLuint offset); + uint32_t offset, + uint32_t bind_table_index); -void brw_dp_READ_4( struct brw_compile *p, - struct brw_reg dest, - GLboolean relAddr, - GLuint location, - GLuint bind_table_index ); +void brw_oword_block_read_scratch(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg mrf, + int num_regs, + GLuint offset); + +void brw_oword_block_write_scratch(struct brw_compile *p, + struct brw_reg mrf, + int num_regs, + GLuint offset); void brw_dp_READ_4_vs( struct brw_compile *p, struct brw_reg dest, @@ -920,11 +925,6 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p, GLuint offset, GLuint bind_table_index); -void brw_oword_block_write(struct brw_compile *p, - struct brw_reg mrf, - int num_regs, - GLuint offset); - /* If/else/endif. Works by manipulating the execution flags on each * channel. */ diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 6fbc39672f1..fe3a0299925 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -1359,10 +1359,10 @@ void brw_math_16( struct brw_compile *p, * The offset must be aligned to oword size (16 bytes). Used for * register spilling. */ -void brw_oword_block_write(struct brw_compile *p, - struct brw_reg mrf, - int num_regs, - GLuint offset) +void brw_oword_block_write_scratch(struct brw_compile *p, + struct brw_reg mrf, + int num_regs, + GLuint offset) { struct intel_context *intel = &p->brw->intel; uint32_t msg_control; @@ -1458,11 +1458,11 @@ void brw_oword_block_write(struct brw_compile *p, * spilling. */ void -brw_oword_block_read(struct brw_compile *p, - struct brw_reg dest, - struct brw_reg mrf, - int num_regs, - GLuint offset) +brw_oword_block_read_scratch(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg mrf, + int num_regs, + GLuint offset) { uint32_t msg_control; int rlen; @@ -1517,65 +1517,57 @@ brw_oword_block_read(struct brw_compile *p, } } - /** * Read a float[4] vector from the data port Data Cache (const buffer). * Location (in buffer) should be a multiple of 16. * Used for fetching shader constants. - * If relAddr is true, we'll do an indirect fetch using the address register. */ -void brw_dp_READ_4( struct brw_compile *p, - struct brw_reg dest, - GLboolean relAddr, - GLuint location, - GLuint bind_table_index ) +void brw_oword_block_read(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg mrf, + uint32_t offset, + uint32_t bind_table_index) { - /* XXX: relAddr not implemented */ - GLuint msg_reg_nr = 1; - { - struct brw_reg b; - brw_push_insn_state(p); - brw_set_predicate_control(p, BRW_PREDICATE_NONE); - brw_set_compression_control(p, BRW_COMPRESSION_NONE); - brw_set_mask_control(p, BRW_MASK_DISABLE); + mrf = retype(mrf, BRW_REGISTER_TYPE_UD); - /* Setup MRF[1] with location/offset into const buffer */ - b = brw_message_reg(msg_reg_nr); - b = retype(b, BRW_REGISTER_TYPE_UD); - /* XXX I think we're setting all the dwords of MRF[1] to 'location'. - * when the docs say only dword[2] should be set. Hmmm. But it works. - */ - brw_MOV(p, b, brw_imm_ud(location)); - brw_pop_insn_state(p); - } + brw_push_insn_state(p); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); - { - struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); - - insn->header.predicate_control = BRW_PREDICATE_NONE; - insn->header.compression_control = BRW_COMPRESSION_NONE; - insn->header.destreg__conditionalmod = msg_reg_nr; - insn->header.mask_control = BRW_MASK_DISABLE; - - /* cast dest to a uword[8] vector */ - dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW); + brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - brw_set_dest(insn, dest); - brw_set_src0(insn, brw_null_reg()); + /* set message header global offset field (reg 0, element 2) */ + brw_MOV(p, + retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, + mrf.nr, + 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(offset)); - brw_set_dp_read_message(p->brw, - insn, - bind_table_index, - 0, /* msg_control (0 means 1 Oword) */ - BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ - 0, /* source cache = data cache */ - 1, /* msg_length */ - 1, /* response_length (1 Oword) */ - 0); /* eot */ - } + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + insn->header.destreg__conditionalmod = mrf.nr; + + /* cast dest to a uword[8] vector */ + dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW); + + brw_set_dest(insn, dest); + brw_set_src0(insn, brw_null_reg()); + + brw_set_dp_read_message(p->brw, + insn, + bind_table_index, + BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW, + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, + 0, /* source cache = data cache */ + 1, /* msg_length */ + 1, /* response_length (1 reg, 2 owords!) */ + 0); /* eot */ + + brw_pop_insn_state(p); } + /** * Read float[4] constant(s) from VS constant buffer. * For relative addressing, two float[4] constants will be read into 'dest'. diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index bade5e4cc49..c687fde8738 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -286,6 +286,7 @@ fs_visitor::setup_uniform_values(int loc, const glsl_type *type) case GLSL_TYPE_BOOL: vec_values = fp->Base.Parameters->ParameterValues[loc]; for (unsigned int i = 0; i < type->vector_elements; i++) { + assert(c->prog_data.nr_params < ARRAY_SIZE(c->prog_data.param)); c->prog_data.param[c->prog_data.nr_params++] = &vec_values[i]; } return 1; @@ -2230,7 +2231,8 @@ fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), retype(src, BRW_REGISTER_TYPE_UD)); - brw_oword_block_write(p, brw_message_reg(inst->base_mrf), 1, inst->offset); + brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1, + inst->offset); } void @@ -2251,8 +2253,39 @@ fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) if (intel->gen == 4 && !intel->is_g4x) brw_MOV(p, brw_null_reg(), dst); - brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 1, - inst->offset); + brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1, + inst->offset); + + if (intel->gen == 4 && !intel->is_g4x) { + /* gen4 errata: destination from a send can't be used as a + * destination until it's been read. Just read it so we don't + * have to worry. + */ + brw_MOV(p, brw_null_reg(), dst); + } +} + + +void +fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst) +{ + assert(inst->mlen != 0); + + /* Clear any post destination dependencies that would be ignored by + * the block read. See the B-Spec for pre-gen5 send instruction. + * + * This could use a better solution, since texture sampling and + * math reads could potentially run into it as well -- anywhere + * that we have a SEND with a destination that is a register that + * was written but not read within the last N instructions (what's + * N? unsure). This is rare because of dead code elimination, but + * not impossible. + */ + if (intel->gen == 4 && !intel->is_g4x) + brw_MOV(p, brw_null_reg(), dst); + + brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), + inst->offset, SURF_INDEX_FRAG_CONST_BUFFER); if (intel->gen == 4 && !intel->is_g4x) { /* gen4 errata: destination from a send can't be used as a @@ -2433,6 +2466,66 @@ fs_visitor::split_virtual_grfs() } } +/** + * Choose accesses from the UNIFORM file to demote to using the pull + * constant buffer. + * + * We allow a fragment shader to have more than the specified minimum + * maximum number of fragment shader uniform components (64). If + * there are too many of these, they'd fill up all of register space. + * So, this will push some of them out to the pull constant buffer and + * update the program to load them. + */ +void +fs_visitor::setup_pull_constants() +{ + /* Only allow 16 registers (128 uniform components) as push constants. */ + unsigned int max_uniform_components = 16 * 8; + if (c->prog_data.nr_params <= max_uniform_components) + return; + + /* Just demote the end of the list. We could probably do better + * here, demoting things that are rarely used in the program first. + */ + int pull_uniform_base = max_uniform_components; + int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; + + foreach_iter(exec_list_iterator, iter, this->instructions) { + fs_inst *inst = (fs_inst *)iter.get(); + + for (int i = 0; i < 3; i++) { + if (inst->src[i].file != UNIFORM) + continue; + + int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; + if (uniform_nr < pull_uniform_base) + continue; + + fs_reg dst = fs_reg(this, glsl_type::float_type); + fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, + dst); + pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; + pull->ir = inst->ir; + pull->annotation = inst->annotation; + pull->base_mrf = 14; + pull->mlen = 1; + + inst->insert_before(pull); + + inst->src[i].file = GRF; + inst->src[i].reg = dst.reg; + inst->src[i].reg_offset = 0; + inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; + } + } + + for (int i = 0; i < pull_uniform_count; i++) { + c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; + } + c->prog_data.nr_params -= pull_uniform_count; + c->prog_data.nr_pull_params = pull_uniform_count; +} + void fs_visitor::calculate_live_intervals() { @@ -2721,6 +2814,7 @@ fs_visitor::register_coalesce() scan_inst->src[i].reg_offset = inst->src[0].reg_offset; scan_inst->src[i].abs |= inst->src[0].abs; scan_inst->src[i].negate ^= inst->src[0].negate; + scan_inst->src[i].smear = inst->src[0].smear; } } } @@ -2749,7 +2843,7 @@ fs_visitor::compute_to_mrf() inst->predicated || inst->dst.file != MRF || inst->src[0].file != GRF || inst->dst.type != inst->src[0].type || - inst->src[0].abs || inst->src[0].negate) + inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) continue; /* Can't compute-to-MRF this GRF if someone else was going to @@ -2897,8 +2991,13 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) case GRF: case ARF: case MRF: - brw_reg = brw_vec8_reg(reg->file, - reg->hw_reg, 0); + if (reg->smear == -1) { + brw_reg = brw_vec8_reg(reg->file, + reg->hw_reg, 0); + } else { + brw_reg = brw_vec1_reg(reg->file, + reg->hw_reg, reg->smear); + } brw_reg = retype(brw_reg, reg->type); break; case IMM: @@ -3136,6 +3235,10 @@ fs_visitor::generate_code() generate_unspill(inst, dst); break; + case FS_OPCODE_PULL_CONSTANT_LOAD: + generate_pull_constant_load(inst, dst); + break; + case FS_OPCODE_FB_WRITE: generate_fb_write(inst); break; @@ -3221,6 +3324,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) v.emit_fb_writes(); v.split_virtual_grfs(); + v.setup_pull_constants(); v.assign_curb_setup(); v.assign_urb_setup(); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index b571c235305..9b7fcde8580 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -76,6 +76,7 @@ enum fs_opcodes { FS_OPCODE_DISCARD_AND, FS_OPCODE_SPILL, FS_OPCODE_UNSPILL, + FS_OPCODE_PULL_CONSTANT_LOAD, }; @@ -100,6 +101,7 @@ public: this->negate = 0; this->abs = 0; this->hw_reg = -1; + this->smear = -1; } /** Generic unset register constructor. */ @@ -162,6 +164,7 @@ public: bool negate; bool abs; struct brw_reg fixed_hw_reg; + int smear; /* -1, or a channel of the reg to smear to all channels. */ /** Value for file == BRW_IMMMEDIATE_FILE */ union { @@ -366,6 +369,7 @@ public: int choose_spill_reg(struct ra_graph *g); void spill_reg(int spill_reg); void split_virtual_grfs(); + void setup_pull_constants(); void calculate_live_intervals(); bool propagate_constants(); bool register_coalesce(); @@ -384,6 +388,7 @@ public: void generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src); void generate_spill(fs_inst *inst, struct brw_reg src); void generate_unspill(fs_inst *inst, struct brw_reg dst); + void generate_pull_constant_load(fs_inst *inst, struct brw_reg dst); void emit_dummy_fs(); fs_reg *emit_fragcoord_interpolation(ir_variable *ir); diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index b5bfd00d5fe..d7acc30fc46 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -401,6 +401,7 @@ fs_visitor::spill_reg(int spill_reg) spill_src.reg_offset = 0; spill_src.abs = false; spill_src.negate = false; + spill_src.smear = -1; for (int chan = 0; chan < size; chan++) { fs_inst *spill_inst = new(mem_ctx) fs_inst(FS_OPCODE_SPILL, diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c index 88bc64e5dd9..d06c49fd5be 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_emit.c +++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c @@ -1576,7 +1576,7 @@ static void emit_spill( struct brw_wm_compile *c, mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask } send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 } */ - brw_oword_block_write(p, brw_message_reg(1), 2, slot); + brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot); } diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c index 55aceea9b5c..d325f85ce00 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c +++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c @@ -307,21 +307,20 @@ static void prealloc_reg(struct brw_wm_compile *c) /* use a real constant buffer, or just use a section of the GRF? */ /* XXX this heuristic may need adjustment... */ - if ((nr_params + nr_temps) * 4 + reg_index > 80) - c->fp->use_const_buffer = GL_TRUE; - else - c->fp->use_const_buffer = GL_FALSE; - /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/ + if ((nr_params + nr_temps) * 4 + reg_index > 80) { + for (i = 0; i < nr_params; i++) { + float *pv = c->fp->program.Base.Parameters->ParameterValues[i]; + for (j = 0; j < 4; j++) { + c->prog_data.pull_param[c->prog_data.nr_pull_params] = &pv[j]; + c->prog_data.nr_pull_params++; + } + } - if (c->fp->use_const_buffer) { - /* We'll use a real constant buffer and fetch constants from - * it with a dataport read message. - */ + c->prog_data.nr_params = 0; + } + /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/ - /* number of float constants in CURBE */ - c->prog_data.nr_params = 0; - } - else { + if (!c->prog_data.nr_pull_params) { const struct gl_program_parameter_list *plist = c->fp->program.Base.Parameters; int index = 0; @@ -463,7 +462,7 @@ static void prealloc_reg(struct brw_wm_compile *c) * They'll be found in these registers. * XXX alloc these on demand! */ - if (c->fp->use_const_buffer) { + if (c->prog_data.nr_pull_params) { for (i = 0; i < 3; i++) { c->current_const[i].index = -1; c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0); @@ -501,12 +500,11 @@ static void fetch_constants(struct brw_wm_compile *c, #endif /* need to fetch the constant now */ - brw_dp_READ_4(p, - c->current_const[i].reg, /* writeback dest */ - src->RelAddr, /* relative indexing? */ - 16 * src->Index, /* byte offset */ - SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */ - ); + brw_oword_block_read(p, + c->current_const[i].reg, + brw_message_reg(1), + 16 * src->Index, + SURF_INDEX_FRAG_CONST_BUFFER); } } } @@ -606,7 +604,7 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c, } } - if (c->fp->use_const_buffer && + if (c->prog_data.nr_pull_params && (src->File == PROGRAM_STATE_VAR || src->File == PROGRAM_CONSTANT || src->File == PROGRAM_UNIFORM)) { @@ -729,7 +727,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) #endif /* fetch any constants that this instruction needs */ - if (c->fp->use_const_buffer) + if (c->prog_data.nr_pull_params) fetch_constants(c, inst); if (inst->Opcode != OPCODE_ARL) { diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index 5588702afc3..dd5ddea9204 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -319,13 +319,14 @@ prepare_wm_constants(struct brw_context *brw) struct intel_context *intel = &brw->intel; struct brw_fragment_program *fp = (struct brw_fragment_program *) brw->fragment_program; - const struct gl_program_parameter_list *params = fp->program.Base.Parameters; - const int size = params->NumParameters * 4 * sizeof(GLfloat); + const int size = brw->wm.prog_data->nr_pull_params * sizeof(float); + float *constants; + unsigned int i; _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); /* BRW_NEW_FRAGMENT_PROGRAM */ - if (!fp->use_const_buffer) { + if (brw->wm.prog_data->nr_pull_params == 0) { if (brw->wm.const_bo) { drm_intel_bo_unreference(brw->wm.const_bo); brw->wm.const_bo = NULL; @@ -335,11 +336,18 @@ prepare_wm_constants(struct brw_context *brw) } drm_intel_bo_unreference(brw->wm.const_bo); - brw->wm.const_bo = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer", + brw->wm.const_bo = drm_intel_bo_alloc(intel->bufmgr, "WM const bo", size, 64); /* _NEW_PROGRAM_CONSTANTS */ - drm_intel_bo_subdata(brw->wm.const_bo, 0, size, params->ParameterValues); + drm_intel_gem_bo_map_gtt(brw->wm.const_bo); + constants = brw->wm.const_bo->virtual; + for (i = 0; i < brw->wm.prog_data->nr_pull_params; i++) { + constants[i] = *brw->wm.prog_data->pull_param[i]; + } + drm_intel_gem_bo_unmap_gtt(brw->wm.const_bo); + + brw->state.dirty.brw |= BRW_NEW_WM_CONSTBUF; } const struct brw_tracked_state brw_wm_constants = { diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index 21059be9657..ce489f06862 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -87,7 +87,7 @@ upload_wm_state(struct brw_context *brw) brw_fragment_program_const(brw->fragment_program); uint32_t dw2, dw4, dw5, dw6; - if (fp->use_const_buffer || brw->wm.prog_data->nr_params == 0) { + if (brw->wm.prog_data->nr_params == 0) { /* Disable the push constant buffers. */ BEGIN_BATCH(5); OUT_BATCH(CMD_3D_CONSTANT_PS_STATE << 16 | (5 - 2));