From: Eric Anholt
Date: Sat, 12 Mar 2011 03:19:01 +0000 (-0800)
Subject: i965/fs: Add initial support for 16-wide dispatch on gen6.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=662f1b48bd1a02907bb42ecda889a3aa52a5755d;p=mesa.git

i965/fs: Add initial support for 16-wide dispatch on gen6.

At this point it doesn't do uniforms, which have to be laid out the same
between 8 and 16.  Other than that, it supports everything but flow
control, which was the thing that forced us to choose 8-wide for general
GLSL support.

Reviewed-by: Kenneth Graunke
---

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 1daa49abfb3..6bf8a1c83c7 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -204,13 +204,16 @@ struct brw_wm_prog_data {
    GLuint urb_read_length;
 
    GLuint first_curbe_grf;
+   GLuint first_curbe_grf_16;
    GLuint total_grf;
+   GLuint total_grf_16;
    GLuint total_scratch;
 
    GLuint nr_params;       /**< number of float params/constants */
    GLuint nr_pull_params;
    GLboolean error;
    int dispatch_width;
+   uint32_t prog_offset_16;
 
    /* Pointer to tracked values (only valid once
     * _mesa_load_state_parameters has been called at runtime).
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index bb71463bebc..8785957b6e6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -194,6 +194,32 @@ fs_visitor::fail(const char *format, ...)
    }
 }
 
+void
+fs_visitor::push_force_uncompressed()
+{
+   force_uncompressed_stack++;
+}
+
+void
+fs_visitor::pop_force_uncompressed()
+{
+   force_uncompressed_stack--;
+   assert(force_uncompressed_stack >= 0);
+}
+
+void
+fs_visitor::push_force_sechalf()
+{
+   force_sechalf_stack++;
+}
+
+void
+fs_visitor::pop_force_sechalf()
+{
+   force_sechalf_stack--;
+   assert(force_sechalf_stack >= 0);
+}
+
 /**
  * Returns how many MRFs an FS opcode will write over.
  *
@@ -1738,6 +1764,10 @@ fs_visitor::visit(ir_if *ir)
 {
    fs_inst *inst;
 
+   if (c->dispatch_width == 16) {
+      fail("Can't support (non-uniform) control flow on 16-wide\n");
+   }
+
    /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
@@ -1778,6 +1808,10 @@ fs_visitor::visit(ir_loop *ir)
 {
    fs_reg counter = reg_undef;
 
+   if (c->dispatch_width == 16) {
+      fail("Can't support (non-uniform) control flow on 16-wide\n");
+   }
+
    if (ir->counter) {
       this->base_ir = ir->counter;
       ir->counter->accept(this);
@@ -1881,6 +1915,11 @@ fs_visitor::emit(fs_inst inst)
    fs_inst *list_inst = new(mem_ctx) fs_inst;
    *list_inst = inst;
 
+   if (force_uncompressed_stack > 0)
+      list_inst->force_uncompressed = true;
+   else if (force_sechalf_stack > 0)
+      list_inst->force_sechalf = true;
+
    list_inst->annotation = this->current_annotation;
    list_inst->ir = this->base_ir;
 
@@ -2006,6 +2045,7 @@ fs_visitor::emit_fb_writes()
    this->current_annotation = "FB write header";
    GLboolean header_present = GL_TRUE;
    int nr = 0;
+   int reg_width = c->dispatch_width / 8;
 
    if (intel->gen >= 6 &&
       !this->kill_emitted &&
@@ -2019,31 +2059,44 @@ fs_visitor::emit_fb_writes()
    }
 
    if (c->aa_dest_stencil_reg) {
+      push_force_uncompressed();
      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
           fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
+      pop_force_uncompressed();
    }
 
    /* Reserve space for color.  It'll be filled in per MRT below.
     */
    int color_mrf = nr;
-   nr += 4;
+   nr += 4 * reg_width;
 
    if (c->source_depth_to_render_target) {
+      if (intel->gen == 6 && c->dispatch_width == 16) {
+         /* For outputting oDepth on gen6, SIMD8 writes have to be
+          * used.  This would require 8-wide moves of each half to
+          * message regs, kind of like pre-gen5 SIMD16 FB writes.
+          * Just bail on doing so for now.
+          */
+         fail("Missing support for simd16 depth writes on gen6\n");
+      }
+
      if (c->computes_depth) {
         /* Hand over gl_FragDepth. */
         assert(this->frag_depth);
         fs_reg depth = *(variable_storage(this->frag_depth));
 
-        emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
+        emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
      } else {
         /* Pass through the payload depth. */
-        emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
+        emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
             fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
      }
+      nr += reg_width;
    }
 
    if (c->dest_depth_reg) {
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
           fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
+      nr += reg_width;
    }
 
    fs_reg color = reg_undef;
@@ -2060,7 +2113,7 @@ fs_visitor::emit_fb_writes()
                                 target);
      if (this->frag_color || this->frag_data) {
         for (int i = 0; i < 4; i++) {
-           emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
+           emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i * reg_width), color);
            color.reg_offset++;
         }
      }
@@ -2144,7 +2197,7 @@ fs_visitor::generate_fb_write(fs_inst *inst)
    brw_pop_insn_state(p);
 
    brw_fb_WRITE(p,
-               8, /* dispatch_width */
+               c->dispatch_width,
                inst->base_mrf,
                implied_header,
                inst->target,
@@ -2608,8 +2661,12 @@ fs_visitor::setup_paramvalues_refs()
 void
 fs_visitor::assign_curb_setup()
 {
-   c->prog_data.first_curbe_grf = c->nr_payload_regs;
    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
+   if (c->dispatch_width == 8) {
+      c->prog_data.first_curbe_grf = c->nr_payload_regs;
+   } else {
+      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
+   }
 
    /* Map the offsets in the UNIFORM file to fixed HW regs. */
    foreach_iter(exec_list_iterator, iter, this->instructions) {
@@ -2618,7 +2675,7 @@ fs_visitor::assign_curb_setup()
      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
-           struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
+           struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);
 
@@ -2670,7 +2727,7 @@ fs_visitor::calculate_urb_setup()
 void
 fs_visitor::assign_urb_setup()
 {
-   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
+   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
 
    /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
@@ -3516,7 +3573,7 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
 void
 fs_visitor::generate_code()
 {
-   int last_native_inst = 0;
+   int last_native_inst = p->nr_insn;
    const char *last_annotation_string = NULL;
    ir_instruction *last_annotation_ir = NULL;
 
@@ -3532,8 +3589,8 @@ fs_visitor::generate_code()
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-      printf("Native code for fragment shader %d:\n",
-             ctx->Shader.CurrentFragmentProgram->Name);
+      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
+             ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width);
    }
 
    foreach_iter(exec_list_iterator, iter, this->instructions) {
@@ -3566,6 +3623,14 @@ fs_visitor::generate_code()
      brw_set_predicate_inverse(p, inst->predicate_inverse);
      brw_set_saturate(p, inst->saturate);
 
+      if (inst->force_uncompressed || c->dispatch_width == 8) {
+         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      } else if (inst->force_sechalf) {
+         brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      } else {
+         brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+      }
+
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
@@ -3804,108 +3869,149 @@ fs_visitor::generate_code()
    }
 }
 
-GLboolean
-brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
+bool
+fs_visitor::run()
 {
-   struct intel_context *intel = &brw->intel;
-   struct gl_context *ctx = &intel->ctx;
-   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
+   uint32_t prog_offset_16 = 0;
 
-   if (!prog)
-      return GL_FALSE;
+   brw_wm_payload_setup(brw, c);
 
-   struct brw_shader *shader =
-     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
-   if (!shader)
-      return GL_FALSE;
+   if (c->dispatch_width == 16) {
+      if (c->prog_data.curb_read_length) {
+         /* Haven't hooked in support for uniforms through the 16-wide
+          * version yet.
+          */
+         return GL_FALSE;
+      }
 
-   /* We always use 8-wide mode, at least for now.  For one, flow
-    * control only works in 8-wide.  Also, when we're fragment shader
-    * bound, we're almost always under register pressure as well, so
-    * 8-wide would save us from the performance cliff of spilling
-    * regs.
-    */
-   c->dispatch_width = 8;
+      /* align to 64 byte boundary. */
+      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
+         brw_NOP(p);
+      }
 
-   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
-      _mesa_print_ir(shader->ir, NULL);
-      printf("\n");
-   }
+      /* Save off the start of this 16-wide program in case we succeed. */
+      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
 
-   /* Now the main event: Visit the shader IR and generate our FS IR for it.
-    */
-   fs_visitor v(c, shader);
+      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+   }
 
    if (0) {
-      v.emit_dummy_fs();
+      emit_dummy_fs();
    } else {
-      v.calculate_urb_setup();
+      calculate_urb_setup();
      if (intel->gen < 6)
-        v.emit_interpolation_setup_gen4();
+        emit_interpolation_setup_gen4();
      else
-        v.emit_interpolation_setup_gen6();
+        emit_interpolation_setup_gen6();
 
      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
      */
      foreach_iter(exec_list_iterator, iter, *shader->ir) {
        ir_instruction *ir = (ir_instruction *)iter.get();
-        v.base_ir = ir;
-        ir->accept(&v);
+        base_ir = ir;
+        ir->accept(this);
      }
 
-      v.emit_fb_writes();
+      emit_fb_writes();
 
-      v.split_virtual_grfs();
+      split_virtual_grfs();
 
-      v.setup_paramvalues_refs();
-      v.setup_pull_constants();
+      setup_paramvalues_refs();
+      setup_pull_constants();
 
      bool progress;
      do {
        progress = false;
 
-        progress = v.remove_duplicate_mrf_writes() || progress;
+        progress = remove_duplicate_mrf_writes() || progress;
 
-        progress = v.propagate_constants() || progress;
-        progress = v.register_coalesce() || progress;
-        progress = v.compute_to_mrf() || progress;
-        progress = v.dead_code_eliminate() || progress;
+        progress = propagate_constants() || progress;
+        progress = register_coalesce() || progress;
+        progress = compute_to_mrf() || progress;
+        progress = dead_code_eliminate() || progress;
      } while (progress);
 
-      v.schedule_instructions();
+      schedule_instructions();
 
-      v.assign_curb_setup();
-      v.assign_urb_setup();
+      assign_curb_setup();
+      assign_urb_setup();
 
      if (0) {
        /* Debug of register spilling: Go spill everything. */
-        int virtual_grf_count = v.virtual_grf_next;
+        int virtual_grf_count = virtual_grf_next;
        for (int i = 1; i < virtual_grf_count; i++) {
-           v.spill_reg(i);
+           spill_reg(i);
        }
      }
 
      if (0)
-        v.assign_regs_trivial();
+        assign_regs_trivial();
      else {
-        while (!v.assign_regs()) {
-           if (v.failed)
+        while (!assign_regs()) {
+           if (failed)
              break;
        }
      }
    }
 
+   assert(force_uncompressed_stack == 0);
+   assert(force_sechalf_stack == 0);
 
-   if (!v.failed)
-      v.generate_code();
-
-   assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */
+   if (!failed)
+      generate_code();
 
-   if (v.failed)
+   if (failed)
      return GL_FALSE;
 
-   c->prog_data.total_grf = v.grf_used;
+   if (c->dispatch_width == 8) {
+      c->prog_data.total_grf = grf_used;
+   } else {
+      c->prog_data.total_grf_16 = grf_used;
+      c->prog_data.prog_offset_16 = prog_offset_16;
+   }
+
+   return !failed;
+}
 
-   return GL_TRUE;
+bool
+brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
+   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
+
+   if (!prog)
+      return false;
+
+   struct brw_shader *shader =
+     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
+   if (!shader)
+      return false;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
+      _mesa_print_ir(shader->ir, NULL);
+      printf("\n");
+   }
+
+   /* Now the main event: Visit the shader IR and generate our FS IR for it.
+    */
+   c->dispatch_width = 8;
+
+   fs_visitor v(c, shader);
+   if (!v.run()) {
+      /* FINISHME: Cleanly fail, test at link time, etc.
+       */
+      assert(!"not reached");
+      return false;
+   }
+
+   if (intel->gen >= 6) {
+      c->dispatch_width = 16;
+      fs_visitor v2(c, shader);
+      v2.run();
+   }
+
+   c->prog_data.dispatch_width = 8;
+
+   return true;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index fd83fcb3829..b158992071e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -343,6 +343,8 @@ public:
    bool eot;
    bool header_present;
    bool shadow_compare;
+   bool force_uncompressed;
+   bool force_sechalf;
    uint32_t offset; /* spill/unspill offset */
 
    /** @{
@@ -405,6 +407,8 @@ public:
 
      this->live_intervals_valid = false;
      this->kill_emitted = false;
+      this->force_uncompressed_stack = 0;
+      this->force_sechalf_stack = 0;
    }
 
    ~fs_visitor()
@@ -461,6 +465,7 @@ public:
      return emit(fs_inst(opcode, dst, src0, src1, src2));
    }
 
+   bool run();
    void setup_paramvalues_refs();
    void assign_curb_setup();
    void calculate_urb_setup();
@@ -481,6 +486,11 @@ public:
    void schedule_instructions();
    void fail(const char *msg, ...);
 
+   void push_force_uncompressed();
+   void pop_force_uncompressed();
+   void push_force_sechalf();
+   void pop_force_sechalf();
+
    void generate_code();
    void generate_fb_write(fs_inst *inst);
    void generate_pixel_xy(struct brw_reg dst, bool is_x);
@@ -568,6 +578,9 @@ public:
    fs_reg reg_null_cmp;
 
    int grf_used;
+
+   int force_uncompressed_stack;
+   int force_sechalf_stack;
 };
 
 GLboolean brw_do_channel_expressions(struct exec_list *instructions);
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index c4b2157db55..4564fb6b1ad 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -120,7 +120,7 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
    brw_wm_emit(c);
 }
 
-static void
+void
 brw_wm_payload_setup(struct brw_context *brw,
                      struct brw_wm_compile *c)
 {
@@ -225,18 +225,13 @@ static void do_wm_prog( struct brw_context *brw,
 
    brw_init_compile(brw, &c->func);
 
-   brw_wm_payload_setup(brw, c);
-
    if (!brw_wm_fs_emit(brw, c)) {
-      /*
-       * Shader which use GLSL features such as flow control are handled
-       * differently from "simple" shaders.
-       */
+      /* Fallback for fixed function and ARB_fp shaders. */
      c->dispatch_width = 16;
      brw_wm_payload_setup(brw, c);
      brw_wm_non_glsl_emit(brw, c);
+      c->prog_data.dispatch_width = 16;
    }
-   c->prog_data.dispatch_width = c->dispatch_width;
 
    /* Scratch space is used for register spilling */
    if (c->last_scratch) {
@@ -467,7 +462,7 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
    struct brw_wm_prog_key key;
    struct brw_fragment_program *fp = (struct brw_fragment_program *)
      brw->fragment_program;
-   
+
    brw_wm_populate_key(brw, &key);
 
    /* Make an early check for the key.
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 5d1e4045928..8e5a9cdb86c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -314,7 +314,7 @@ void brw_wm_print_program( struct brw_wm_compile *c,
 void brw_wm_lookup_iz(struct intel_context *intel,
                      struct brw_wm_compile *c);
 
-GLboolean brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
+bool brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
 
 /* brw_wm_emit.c */
 void emit_alu1(struct brw_compile *p,
@@ -474,5 +474,7 @@ struct gl_shader_program *brw_new_shader_program(struct gl_context *ctx, GLuint
 bool brw_color_buffer_write_enabled(struct brw_context *brw);
 bool brw_render_target_supported(gl_format format);
 
+void brw_wm_payload_setup(struct brw_context *brw,
+                          struct brw_wm_compile *c);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 8215cb15a9c..d4fca788cb9 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -143,14 +143,19 @@ upload_wm_state(struct brw_context *brw)
    dw2 |= (ALIGN(brw->wm.sampler_count, 4) / 4) << GEN6_WM_SAMPLER_COUNT_SHIFT;
    dw4 |= (brw->wm.prog_data->first_curbe_grf <<
           GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
+   dw4 |= (brw->wm.prog_data->first_curbe_grf_16 <<
+          GEN6_WM_DISPATCH_START_GRF_SHIFT_2);
 
    dw5 |= (brw->wm_max_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
 
    /* CACHE_NEW_WM_PROG */
-   if (brw->wm.prog_data->dispatch_width == 8)
+   if (brw->wm.prog_data->dispatch_width == 8) {
      dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
-   else
+      if (brw->wm.prog_data->prog_offset_16)
+        dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
+   } else {
      dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
+   }
 
    /* _NEW_LINE */
    if (ctx->Line.StippleFlag)
@@ -194,7 +199,12 @@ upload_wm_state(struct brw_context *brw)
    OUT_BATCH(dw5);
    OUT_BATCH(dw6);
    OUT_BATCH(0); /* kernel 1 pointer */
-   OUT_BATCH(0); /* kernel 2 pointer */
+   if (brw->wm.prog_data->prog_offset_16) {
+      OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+                brw->wm.prog_data->prog_offset_16);
+   } else {
+      OUT_BATCH(0); /* kernel 2 pointer */
+   }
    ADVANCE_BATCH();
 }
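
For readers skimming the patch, the following is an illustrative standalone C sketch (not part of the commit) of the kernel-selection logic that upload_wm_state() in gen6_wm_state.c now applies: the 8-wide GLSL kernel stays enabled, and the 16-wide kernel is additionally enabled and pointed at prog_offset_16 when that second compile succeeded; the non-GLSL fallback path enables only the 16-wide kernel. The struct and bit values below are simplified stand-ins for struct brw_wm_prog_data and the GEN6_WM_*_DISPATCH_ENABLE defines, not the real Mesa definitions.

/* Standalone sketch, not Mesa code: models how prog_offset_16 drives the
 * dispatch-enable bits and the "kernel 2 pointer" programmed by
 * upload_wm_state().  Types and bit positions are placeholders.
 */
#include <stdint.h>
#include <stdio.h>

struct wm_prog_data {
   int dispatch_width;        /* width of the primary kernel: 8 (GLSL) or 16 */
   uint32_t prog_offset_16;   /* byte offset of the 16-wide kernel, 0 if none */
};

#define WM_8_DISPATCH_ENABLE  (1u << 0)   /* placeholder bit values */
#define WM_16_DISPATCH_ENABLE (1u << 1)

/* Returns the dispatch-enable bits and writes the offset that would be
 * programmed as the second kernel pointer (the slot gen6 uses for the
 * 16-wide program).
 */
static uint32_t choose_dispatch(const struct wm_prog_data *prog,
                                uint32_t *kernel2_offset)
{
   uint32_t bits = 0;

   if (prog->dispatch_width == 8) {
      bits |= WM_8_DISPATCH_ENABLE;
      if (prog->prog_offset_16) {
         /* The 16-wide compile succeeded: enable both kernels and point
          * kernel 2 at the 16-wide program within the same program BO.
          */
         bits |= WM_16_DISPATCH_ENABLE;
         *kernel2_offset = prog->prog_offset_16;
      } else {
         *kernel2_offset = 0;
      }
   } else {
      /* Non-GLSL fallback path: only a 16-wide kernel was built. */
      bits |= WM_16_DISPATCH_ENABLE;
      *kernel2_offset = 0;
   }
   return bits;
}

int main(void)
{
   /* Example: 8-wide main kernel, 16-wide kernel starting at byte 640. */
   struct wm_prog_data prog = { 8, 640 };
   uint32_t kernel2 = 0;
   uint32_t bits = choose_dispatch(&prog, &kernel2);

   printf("dispatch enables 0x%x, kernel 2 at offset %u\n",
          (unsigned)bits, (unsigned)kernel2);
   return 0;
}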