From: Matt Turner Date: Tue, 14 Apr 2015 20:17:38 +0000 (-0700) Subject: i965/fs: Combine pixel center calculation into one inst. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=529064f6;p=mesa.git i965/fs: Combine pixel center calculation into one inst. The X and Y values come interleaved in g1 (.4-.11 inclusive), so we can calculate them together with a single add(32) instruction on some platforms like Broadwell and newer or in SIMD8 elsewhere. Note that I also moved the PIXEL_X/PIXEL_Y virtual opcodes from before LINTERP to after it. That's because the writes_accumulator_implicitly() function in backend_instruction tests for <= LINTERP for determining whether the instruction indeed writes the accumulator implicitly. The old FS_OPCODE_PIXEL_X/Y emitted ADD instructions, which did, but the new opcodes just emit MOVs, which don't. It doesn't matter, since we don't use these opcodes on Gen4/5 anymore, but in the case that we do... On Broadwell: total instructions in shared programs: 7192355 -> 7186224 (-0.09%) instructions in affected programs: 1190700 -> 1184569 (-0.51%) helped: 6131 On Haswell: total instructions in shared programs: 6155979 -> 6152800 (-0.05%) instructions in affected programs: 652362 -> 649183 (-0.49%) helped: 3179 Reviewed-by: Jason Ekstrand --- diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 5962b000d89..bd3218a1d3e 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -925,6 +925,8 @@ enum opcode { FS_OPCODE_DDY_FINE, FS_OPCODE_CINTERP, FS_OPCODE_LINTERP, + FS_OPCODE_PIXEL_X, + FS_OPCODE_PIXEL_Y, FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, FS_OPCODE_VARYING_PULL_CONSTANT_LOAD, diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 495564058e0..8d34d8a78d9 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -1940,6 +1940,16 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) case FS_OPCODE_LINTERP: generate_linterp(inst, dst, src); break; + case FS_OPCODE_PIXEL_X: + assert(src[0].type == BRW_REGISTER_TYPE_UW); + src[0].subnr = 0 * type_sz(src[0].type); + brw_MOV(p, dst, stride(src[0], 8, 4, 1)); + break; + case FS_OPCODE_PIXEL_Y: + assert(src[0].type == BRW_REGISTER_TYPE_UW); + src[0].subnr = 4 * type_sz(src[0].type); + brw_MOV(p, dst, stride(src[0], 8, 4, 1)); + break; case SHADER_OPCODE_TEX: case FS_OPCODE_TXB: case SHADER_OPCODE_TXD: diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 7fdd4e566fa..c66ec3ee3b0 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -3478,27 +3478,58 @@ fs_visitor::emit_interpolation_setup_gen6() { struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); - /* If the pixel centers end up used, the setup is the same as for gen4. */ this->current_annotation = "compute pixel centers"; - fs_reg int_pixel_x = vgrf(glsl_type::uint_type); - fs_reg int_pixel_y = vgrf(glsl_type::uint_type); - int_pixel_x.type = BRW_REGISTER_TYPE_UW; - int_pixel_y.type = BRW_REGISTER_TYPE_UW; - emit(ADD(int_pixel_x, - fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), - fs_reg(brw_imm_v(0x10101010)))); - emit(ADD(int_pixel_y, - fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), - fs_reg(brw_imm_v(0x11001100)))); - - /* As of gen6, we can no longer mix float and int sources. We have - * to turn the integer pixel centers into floats for their actual - * use. - */ - this->pixel_x = vgrf(glsl_type::float_type); - this->pixel_y = vgrf(glsl_type::float_type); - emit(MOV(this->pixel_x, int_pixel_x)); - emit(MOV(this->pixel_y, int_pixel_y)); + if (brw->gen >= 8 || dispatch_width == 8) { + /* The "Register Region Restrictions" page says for BDW (and newer, + * presumably): + * + * "When destination spans two registers, the source may be one or + * two registers. The destination elements must be evenly split + * between the two registers." + * + * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to + * compute our pixel centers. + */ + fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8), + BRW_REGISTER_TYPE_UW, dispatch_width * 2); + emit(ADD(int_pixel_xy, + fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)), + fs_reg(brw_imm_v(0x11001010)))) + ->force_writemask_all = true; + + this->pixel_x = vgrf(glsl_type::float_type); + this->pixel_y = vgrf(glsl_type::float_type); + emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy); + emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy); + } else { + /* The "Register Region Restrictions" page says for SNB, IVB, HSW: + * + * "When destination spans two registers, the source MUST span two + * registers." + * + * Since the GRF source of the ADD will only read a single register, we + * must do two separate ADDs in SIMD16. + */ + fs_reg int_pixel_x = vgrf(glsl_type::uint_type); + fs_reg int_pixel_y = vgrf(glsl_type::uint_type); + int_pixel_x.type = BRW_REGISTER_TYPE_UW; + int_pixel_y.type = BRW_REGISTER_TYPE_UW; + emit(ADD(int_pixel_x, + fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), + fs_reg(brw_imm_v(0x10101010)))); + emit(ADD(int_pixel_y, + fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), + fs_reg(brw_imm_v(0x11001100)))); + + /* As of gen6, we can no longer mix float and int sources. We have + * to turn the integer pixel centers into floats for their actual + * use. + */ + this->pixel_x = vgrf(glsl_type::float_type); + this->pixel_y = vgrf(glsl_type::float_type); + emit(MOV(this->pixel_x, int_pixel_x)); + emit(MOV(this->pixel_y, int_pixel_y)); + } this->current_annotation = "compute pos.w"; this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));