From ea681a0d64ecde3a2e729fe3b71d3f3fe4cedff0 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Fri, 9 Nov 2012 01:05:47 -0800 Subject: [PATCH] i965/fs: Split final assembly code generation out of fs_visitor. Compiling shaders requires several main steps: 1. Generating FS IR from either GLSL IR or Mesa IR 2. Optimizing the IR 3. Register allocation 4. Generating assembly code This patch splits out step 4 into a separate class named "fs_generator". There are several reasons for doing so: 1. Future hardware has a different instruction encoding. Splitting this out will allow us to replace fs_generator (which relies heavily on the brw_eu_emit.c code and struct brw_instruction) with a new code generator that writes the new format. 2. It reduces the size of the fs_visitor monolith. (Arguably, a lot more should be split out, but that's left for "future work.") 3. Separate namespaces allow us to make helper functions for generating instructions in both classes: ADD() can exist in fs_visitor and create IR, while ADD() in fs_generator can create brw_instructions. (Patches for this are upcoming.) Furthermore, this patch changes the order of operations slightly. Rather than doing steps 1-4 for SIMD8, then 1-4 for SIMD16, we now: - Do steps 1-3 for SIMD8, then repeat 1-3 for SIMD16 - Generate final assembly code for both modes together This is because the frontend work can be done independently, but final assembly generation needs to pack both into a single program store to feed the GPU. 
Reviewed-by: Eric Anholt Reviewed-by: Paul Berry --- src/mesa/drivers/dri/i965/brw_fs.cpp | 31 ++---- src/mesa/drivers/dri/i965/brw_fs.h | 110 +++++++++++++++------- src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 93 ++++++++++++++---- 3 files changed, 156 insertions(+), 78 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index e2873ce9756..a5708291771 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2025,7 +2025,6 @@ fs_visitor::setup_payload_gen6() bool fs_visitor::run() { - uint32_t prog_offset_16 = 0; uint32_t orig_nr_params = c->prog_data.nr_params; if (intel->gen >= 6) @@ -2033,24 +2032,6 @@ fs_visitor::run() else setup_payload_gen4(); - if (dispatch_width == 16) { - /* We have to do a compaction pass now, or the one at the end of - * execution will squash down where our prog_offset start needs - * to be. - */ - brw_compact_instructions(p); - - /* align to 64 byte boundary. */ - while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) { - brw_NOP(p); - } - - /* Save off the start of this 16-wide program in case we succeed. 
*/ - prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction); - - brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); - } - if (0) { emit_dummy_fs(); } else { @@ -2129,13 +2110,10 @@ fs_visitor::run() if (failed) return false; - generate_code(); - if (dispatch_width == 8) { c->prog_data.reg_blocks = brw_register_blocks(grf_used); } else { c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used); - c->prog_data.prog_offset_16 = prog_offset_16; /* Make sure we didn't try to sneak in an extra uniform */ assert(orig_nr_params == c->prog_data.nr_params); @@ -2192,12 +2170,15 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, return NULL; } + exec_list *simd16_instructions = NULL; + fs_visitor v2(brw, c, prog, fp, 16); if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) { - fs_visitor v2(brw, c, prog, fp, 16); v2.import_uniforms(&v); if (!v2.run()) { perf_debug("16-wide shader failed to compile, falling back to " "8-wide at a 10-20%% performance cost: %s", v2.fail_msg); + } else { + simd16_instructions = &v2.instructions; } } @@ -2214,7 +2195,9 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, } } - return brw_get_program(&c->func, final_assembly_size); + fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE); + return g.generate_assembly(&v.instructions, simd16_instructions, + final_assembly_size); } bool diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 8225e66eaa8..489b9700523 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -180,6 +180,11 @@ public: /** @} */ }; +/** + * The fragment shader front-end. + * + * Translates either GLSL IR or Mesa IR (for ARB_fragment_program) into FS IR. 
+ */ class fs_visitor : public backend_visitor { public: @@ -293,40 +298,6 @@ public: void push_force_sechalf(); void pop_force_sechalf(); - void generate_code(); - void generate_fb_write(fs_inst *inst); - void generate_pixel_xy(struct brw_reg dst, bool is_x); - void generate_linterp(fs_inst *inst, struct brw_reg dst, - struct brw_reg *src); - void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src); - void generate_math1_gen7(fs_inst *inst, - struct brw_reg dst, - struct brw_reg src); - void generate_math2_gen7(fs_inst *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1); - void generate_math1_gen6(fs_inst *inst, - struct brw_reg dst, - struct brw_reg src); - void generate_math2_gen6(fs_inst *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1); - void generate_math_gen4(fs_inst *inst, - struct brw_reg dst, - struct brw_reg src); - void generate_discard(fs_inst *inst); - void generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src); - void generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src, - bool negate_value); - void generate_spill(fs_inst *inst, struct brw_reg src); - void generate_unspill(fs_inst *inst, struct brw_reg dst); - void generate_pull_constant_load(fs_inst *inst, struct brw_reg dst, - struct brw_reg index, - struct brw_reg offset); - void generate_mov_dispatch_to_flags(); - void emit_dummy_fs(); fs_reg *emit_fragcoord_interpolation(ir_variable *ir); fs_inst *emit_linterp(const fs_reg &attr, const fs_reg &interp, @@ -456,6 +427,77 @@ public: int force_sechalf_stack; }; +/** + * The fragment shader code generator. + * + * Translates FS IR to actual i965 assembly code. 
+ */ +class fs_generator +{ +public: + fs_generator(struct brw_context *brw, + struct brw_wm_compile *c, + struct gl_shader_program *prog, + struct gl_fragment_program *fp, + bool dual_source_output); + ~fs_generator(); + + const unsigned *generate_assembly(exec_list *simd8_instructions, + exec_list *simd16_instructions, + unsigned *assembly_size); + +private: + void generate_code(exec_list *instructions); + void generate_fb_write(fs_inst *inst); + void generate_pixel_xy(struct brw_reg dst, bool is_x); + void generate_linterp(fs_inst *inst, struct brw_reg dst, + struct brw_reg *src); + void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src); + void generate_math1_gen7(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src); + void generate_math2_gen7(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1); + void generate_math1_gen6(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src); + void generate_math2_gen6(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1); + void generate_math_gen4(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src); + void generate_discard(fs_inst *inst); + void generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src); + void generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src, + bool negate_value); + void generate_spill(fs_inst *inst, struct brw_reg src); + void generate_unspill(fs_inst *inst, struct brw_reg dst); + void generate_pull_constant_load(fs_inst *inst, struct brw_reg dst, + struct brw_reg index, + struct brw_reg offset); + void generate_mov_dispatch_to_flags(); + + struct brw_context *brw; + struct intel_context *intel; + struct gl_context *ctx; + + struct brw_compile *p; + struct brw_wm_compile *c; + + struct gl_shader_program *prog; + struct gl_shader *shader; + const struct gl_fragment_program *fp; + + unsigned dispatch_width; /**< 8 or 16 */ + + bool dual_source_output; + void *mem_ctx; +}; + bool 
brw_do_channel_expressions(struct exec_list *instructions); bool brw_do_vector_splitting(struct exec_list *instructions); bool brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog); diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp index f04d90db1f7..8e03255356b 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp @@ -37,8 +37,29 @@ extern "C" { #include "brw_cfg.h" #include "glsl/ir_print_visitor.h" +fs_generator::fs_generator(struct brw_context *brw, + struct brw_wm_compile *c, + struct gl_shader_program *prog, + struct gl_fragment_program *fp, + bool dual_source_output) + + : brw(brw), c(c), prog(prog), fp(fp), dual_source_output(dual_source_output) +{ + p = &c->func; + intel = &brw->intel; + ctx = &intel->ctx; + + shader = prog ? prog->_LinkedShaders[MESA_SHADER_FRAGMENT] : NULL; + + mem_ctx = c; +} + +fs_generator::~fs_generator() +{ +} + void -fs_visitor::generate_fb_write(fs_inst *inst) +fs_generator::generate_fb_write(fs_inst *inst) { bool eot = inst->eot; struct brw_reg implied_header; @@ -91,7 +112,7 @@ fs_visitor::generate_fb_write(fs_inst *inst) implied_header = brw_null_reg(); } - if (this->dual_src_output.file != BAD_FILE) + if (this->dual_source_output) msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; else if (dispatch_width == 16) msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; @@ -119,7 +140,7 @@ fs_visitor::generate_fb_write(fs_inst *inst) * interpolation. 
*/ void -fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x) +fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x) { struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); struct brw_reg src; @@ -147,7 +168,7 @@ fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x) } void -fs_visitor::generate_linterp(fs_inst *inst, +fs_generator::generate_linterp(fs_inst *inst, struct brw_reg dst, struct brw_reg *src) { struct brw_reg delta_x = src[0]; @@ -165,7 +186,7 @@ fs_visitor::generate_linterp(fs_inst *inst, } void -fs_visitor::generate_math1_gen7(fs_inst *inst, +fs_generator::generate_math1_gen7(fs_inst *inst, struct brw_reg dst, struct brw_reg src0) { @@ -178,7 +199,7 @@ fs_visitor::generate_math1_gen7(fs_inst *inst, } void -fs_visitor::generate_math2_gen7(fs_inst *inst, +fs_generator::generate_math2_gen7(fs_inst *inst, struct brw_reg dst, struct brw_reg src0, struct brw_reg src1) @@ -188,7 +209,7 @@ fs_visitor::generate_math2_gen7(fs_inst *inst, } void -fs_visitor::generate_math1_gen6(fs_inst *inst, +fs_generator::generate_math1_gen6(fs_inst *inst, struct brw_reg dst, struct brw_reg src0) { @@ -215,7 +236,7 @@ fs_visitor::generate_math1_gen6(fs_inst *inst, } void -fs_visitor::generate_math2_gen6(fs_inst *inst, +fs_generator::generate_math2_gen6(fs_inst *inst, struct brw_reg dst, struct brw_reg src0, struct brw_reg src1) @@ -235,7 +256,7 @@ fs_visitor::generate_math2_gen6(fs_inst *inst, } void -fs_visitor::generate_math_gen4(fs_inst *inst, +fs_generator::generate_math_gen4(fs_inst *inst, struct brw_reg dst, struct brw_reg src) { @@ -263,7 +284,7 @@ fs_visitor::generate_math_gen4(fs_inst *inst, } void -fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) +fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) { int msg_type = -1; int rlen = 4; @@ -447,7 +468,7 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) * ((ss0.tl - ss0.bl)x4 (ss1.tl - 
ss1.bl)x4) */ void -fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) +fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) { struct brw_reg src0 = brw_reg(src.file, src.nr, 1, BRW_REGISTER_TYPE_F, @@ -469,7 +490,7 @@ fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) * left. */ void -fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src, +fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src, bool negate_value) { struct brw_reg src0 = brw_reg(src.file, src.nr, 0, @@ -491,7 +512,7 @@ fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src, } void -fs_visitor::generate_discard(fs_inst *inst) +fs_generator::generate_discard(fs_inst *inst) { struct brw_reg f0 = brw_flag_reg(); @@ -543,7 +564,7 @@ fs_visitor::generate_discard(fs_inst *inst) } void -fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) +fs_generator::generate_spill(fs_inst *inst, struct brw_reg src) { assert(inst->mlen != 0); @@ -555,7 +576,7 @@ fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) } void -fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) +fs_generator::generate_unspill(fs_inst *inst, struct brw_reg dst) { assert(inst->mlen != 0); @@ -585,7 +606,7 @@ fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) } void -fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst, +fs_generator::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst, struct brw_reg index, struct brw_reg offset) { @@ -632,7 +653,7 @@ fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst, * Used only on Gen6 and above. 
*/ void -fs_visitor::generate_mov_dispatch_to_flags() +fs_generator::generate_mov_dispatch_to_flags() { struct brw_reg f0 = brw_flag_reg(); struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); @@ -722,7 +743,7 @@ brw_reg_from_fs_reg(fs_reg *reg) } void -fs_visitor::generate_code() +fs_generator::generate_code(exec_list *instructions) { int last_native_insn_offset = p->next_insn_offset; const char *last_annotation_string = NULL; @@ -740,9 +761,9 @@ fs_visitor::generate_code() cfg_t *cfg = NULL; if (unlikely(INTEL_DEBUG & DEBUG_WM)) - cfg = new(mem_ctx) cfg_t(this); + cfg = new(mem_ctx) cfg_t(mem_ctx, instructions); - foreach_list(node, &this->instructions) { + foreach_list(node, instructions) { fs_inst *inst = (fs_inst *)node; struct brw_reg src[3], dst; @@ -1054,3 +1075,35 @@ fs_visitor::generate_code() brw_dump_compile(p, stdout, 0, p->next_insn_offset); } } + +const unsigned * +fs_generator::generate_assembly(exec_list *simd8_instructions, + exec_list *simd16_instructions, + unsigned *assembly_size) +{ + dispatch_width = 8; + generate_code(simd8_instructions); + + if (simd16_instructions) { + /* We have to do a compaction pass now, or the one at the end of + * execution will squash down where our prog_offset start needs + * to be. + */ + brw_compact_instructions(p); + + /* align to 64 byte boundary. */ + while ((p->nr_insn * sizeof(struct brw_instruction)) % 64) { + brw_NOP(p); + } + + /* Save off the start of this 16-wide program */ + c->prog_data.prog_offset_16 = p->nr_insn * sizeof(struct brw_instruction); + + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + + dispatch_width = 16; + generate_code(simd16_instructions); + } + + return brw_get_program(p, assembly_size); +} -- 2.30.2