From 42ad2f0b9b6a18f1613f6d915a46b4a4a89c5aa2 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 14 Mar 2011 10:29:12 -0700 Subject: [PATCH] i965/fs: Add support for 16-wide dispatch on gen5. Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_fs.cpp | 73 +++++++++++++++++++++--- src/mesa/drivers/dri/i965/brw_fs.h | 2 + src/mesa/drivers/dri/i965/brw_wm_state.c | 30 +++++++++- 3 files changed, 93 insertions(+), 12 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 8785957b6e6..4e3adbc0a69 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2039,6 +2039,59 @@ fs_visitor::emit_interpolation_setup_gen6() this->current_annotation = NULL; } +void +fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color) +{ + int reg_width = c->dispatch_width / 8; + + if (c->dispatch_width == 8 || intel->gen == 6) { + /* SIMD8 write looks like: + * m + 0: r0 + * m + 1: g0 + * m + 2: b0 + * m + 3: a0 + * + * gen6 SIMD16 DP write looks like: + * m + 0: r0 + * m + 1: r1 + * m + 2: g0 + * m + 3: g1 + * m + 4: b0 + * m + 5: b1 + * m + 6: a0 + * m + 7: a1 + */ + emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width), + color); + } else { + /* pre-gen6 SIMD16 single source DP write looks like: + * m + 0: r0 + * m + 1: g0 + * m + 2: b0 + * m + 3: a0 + * m + 4: r1 + * m + 5: g1 + * m + 6: b1 + * m + 7: a1 + * + * By setting the high bit of the MRF register number, + * we could indicate that we want COMPR4 mode - instead + * of doing the usual destination + 1 for the second + * half we would get destination + 4. We would need to + * clue the optimizer into that, though. 
+ */ + push_force_uncompressed(); + emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color); + pop_force_uncompressed(); + + push_force_sechalf(); + color.sechalf = true; + emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color); + pop_force_sechalf(); + color.sechalf = false; + } +} + void fs_visitor::emit_fb_writes() { @@ -2113,7 +2166,7 @@ fs_visitor::emit_fb_writes() target); if (this->frag_color || this->frag_data) { for (int i = 0; i < 4; i++) { - emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i * reg_width), color); + emit_color_write(i, color_mrf, color); color.reg_offset++; } } @@ -2137,7 +2190,7 @@ fs_visitor::emit_fb_writes() * renderbuffer. */ color.reg_offset += 3; - emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color); + emit_color_write(3, color_mrf, color); } fs_inst *inst = emit(FS_OPCODE_FB_WRITE); @@ -2330,7 +2383,7 @@ fs_visitor::generate_math(fs_inst *inst, brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); } } - } else { + } else /* gen <= 5 */{ assert(inst->mlen >= 1); brw_set_compression_control(p, BRW_COMPRESSION_NONE); @@ -2351,6 +2404,7 @@ fs_visitor::generate_math(fs_inst *inst, inst->base_mrf + 1, sechalf(src[0]), BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL); + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); } } @@ -3528,6 +3582,8 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) reg->hw_reg, reg->smear); } brw_reg = retype(brw_reg, reg->type); + if (reg->sechalf) + brw_reg = sechalf(brw_reg); break; case IMM: switch (reg->type) { @@ -3881,7 +3937,7 @@ fs_visitor::run() /* Haven't hooked in support for uniforms through the 16-wide * version yet. */ - return GL_FALSE; + return false; } /* align to 64 byte boundary. 
*/ @@ -3957,11 +4013,10 @@ fs_visitor::run() assert(force_uncompressed_stack == 0); assert(force_sechalf_stack == 0); - if (!failed) - generate_code(); - if (failed) - return GL_FALSE; + return false; + + generate_code(); if (c->dispatch_width == 8) { c->prog_data.total_grf = grf_used; @@ -4005,7 +4060,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) return false; } - if (intel->gen >= 6) { + if (intel->gen >= 5) { c->dispatch_width = 16; fs_visitor v2(c, shader); v2.run(); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index b158992071e..60398ac870e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -178,6 +178,7 @@ public: int type; bool negate; bool abs; + bool sechalf; struct brw_reg fixed_hw_reg; int smear; /* -1, or a channel of the reg to smear to all channels. */ @@ -521,6 +522,7 @@ public: void emit_if_gen6(ir_if *ir); void emit_unspill(fs_inst *inst, fs_reg reg, uint32_t spill_offset); + void emit_color_write(int index, int first_color_mrf, fs_reg color); void emit_fb_writes(); void emit_assignment_writes(fs_reg &l, fs_reg &r, const glsl_type *type, bool predicated); diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index be4b260a5ff..9d0a7a8d27d 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -41,10 +41,11 @@ */ struct brw_wm_unit_key { - unsigned int total_grf, total_scratch; + unsigned int total_grf, total_grf_16, total_scratch; unsigned int urb_entry_read_length; unsigned int curb_entry_read_length; unsigned int dispatch_grf_start_reg; + uint32_t prog_offset_16; unsigned int curbe_offset; @@ -92,10 +93,21 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key) /* CACHE_NEW_WM_PROG */ key->total_grf = brw->wm.prog_data->total_grf; + key->total_grf_16 = brw->wm.prog_data->total_grf_16; key->urb_entry_read_length = 
brw->wm.prog_data->urb_read_length; key->curb_entry_read_length = brw->wm.prog_data->curb_read_length; key->dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf; key->total_scratch = brw->wm.prog_data->total_scratch; + key->prog_offset_16 = brw->wm.prog_data->prog_offset_16; + + if (key->prog_offset_16) { + /* These two fields should be the same pre-gen6, which is why we + * only have one hardware field to program for both dispatch + * widths. + */ + assert(brw->wm.prog_data->first_curbe_grf == + brw->wm.prog_data->first_curbe_grf_16); + } /* BRW_NEW_CURBE_OFFSETS */ key->curbe_offset = brw->curbe.wm_start; @@ -166,7 +178,10 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, memset(&wm, 0, sizeof(wm)); wm.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1; + wm.wm9.grf_reg_count_2 = ALIGN(key->total_grf_16, 16) / 16 - 1; wm.thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */ + wm.wm9.kernel_start_pointer_2 = (brw->wm.prog_bo->offset + + key->prog_offset_16) >> 6; /* reloc */ wm.thread1.depth_coef_urb_read_offset = 1; wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; @@ -206,9 +221,11 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, wm.wm5.program_computes_depth = key->computes_depth; wm.wm5.program_uses_killpixel = key->uses_kill; - if (key->is_glsl) + if (key->is_glsl) { wm.wm5.enable_8_pix = 1; - else + if (key->prog_offset_16) + wm.wm5.enable_16_pix = 1; + } else wm.wm5.enable_16_pix = 1; wm.wm5.max_threads = brw->wm_max_threads - 1; @@ -256,6 +273,13 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, brw->wm.prog_bo, wm.thread0.grf_reg_count << 1, I915_GEM_DOMAIN_INSTRUCTION, 0); + if (key->prog_offset_16) { + drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, wm9), + brw->wm.prog_bo, ((wm.wm9.grf_reg_count_2 << 1) + + key->prog_offset_16), + I915_GEM_DOMAIN_INSTRUCTION, 0); + } + /* Emit scratch space 
relocation */ if (key->total_scratch != 0) { drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, thread2), -- 2.30.2