From 5af0604d528733af9113a6f8711c39796ce0ae40 Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Mon, 6 Apr 2015 17:44:40 -0700 Subject: [PATCH] i965/fs: Calculate delta_x and delta_y together. This lets SIMD16 programs on G45 and Gen5 use the PLN instruction. On Ironlake: total instructions in shared programs: 5634757 -> 5518055 (-2.07%) instructions in affected programs: 1745837 -> 1629135 (-6.68%) helped: 11439 HURT: 4 Reviewed-by: Jason Ekstrand --- src/mesa/drivers/dri/i965/brw_fs.cpp | 46 ++++++----------- src/mesa/drivers/dri/i965/brw_fs.h | 3 +- .../drivers/dri/i965/brw_fs_generator.cpp | 25 +++++++-- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 13 +++-- .../drivers/dri/i965/brw_fs_reg_allocate.cpp | 8 +-- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 51 +++++++++---------- src/mesa/drivers/dri/i965/brw_reg.h | 7 +++ 7 files changed, 79 insertions(+), 74 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 5cdc19caab0..cf1c385f098 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1265,8 +1265,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer, emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)))); } else { emit(FS_OPCODE_LINTERP, wpos, - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], - this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], + this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], interp_reg(VARYING_SLOT_POS, 2)); } wpos = offset(wpos, 1); @@ -1308,8 +1307,7 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp, barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; } return emit(FS_OPCODE_LINTERP, attr, - this->delta_x[barycoord_mode], - this->delta_y[barycoord_mode], interp); + this->delta_xy[barycoord_mode], interp); } void @@ -1859,8 +1857,8 @@ fs_visitor::assign_urb_setup() */ foreach_block_and_inst(block, fs_inst, inst, cfg) { if (inst->opcode == FS_OPCODE_LINTERP) { - assert(inst->src[2].file == HW_REG); - inst->src[2].fixed_hw_reg.nr += urb_start; + assert(inst->src[1].file == HW_REG); + inst->src[1].fixed_hw_reg.nr += urb_start; } if (inst->opcode == FS_OPCODE_CINTERP) { @@ -2114,25 +2112,16 @@ fs_visitor::compact_virtual_grfs() } } - /* Patch all the references to delta_x/delta_y, since they're used in - * register allocation. If they're unused, switch them to BAD_FILE so - * we don't think some random VGRF is delta_x/delta_y. + /* Patch all the references to delta_xy, since they're used in register + * allocation. If they're unused, switch them to BAD_FILE so we don't + * think some random VGRF is delta_xy. */ - for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) { - if (delta_x[i].file == GRF) { - if (remap_table[delta_x[i].reg] != -1) { - delta_x[i].reg = remap_table[delta_x[i].reg]; + for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { + if (delta_xy[i].file == GRF) { + if (remap_table[delta_xy[i].reg] != -1) { + delta_xy[i].reg = remap_table[delta_xy[i].reg]; } else { - delta_x[i].file = BAD_FILE; - } - } - } - for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) { - if (delta_y[i].file == GRF) { - if (remap_table[delta_y[i].reg] != -1) { - delta_y[i].reg = remap_table[delta_y[i].reg]; - } else { - delta_y[i].file = BAD_FILE; + delta_xy[i].file = BAD_FILE; } } } @@ -2685,14 +2674,9 @@ fs_visitor::opt_register_renaming() if (progress) { invalidate_live_intervals(); - for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) { - if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) { - delta_x[i].reg = remap[delta_x[i].reg]; - } - } - for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) { - if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) { - delta_y[i].reg = remap[delta_y[i].reg]; + for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { + if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) { + delta_xy[i].reg = remap[delta_xy[i].reg]; } } } diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index d625d91e60e..24ca43ccdbe 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -514,8 +514,7 @@ public: fs_reg pixel_y; fs_reg wpos_w; fs_reg pixel_w; - fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; - fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; + fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; fs_reg shader_start_time; fs_reg userplane[MAX_CLIP_PLANES]; diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 353f35adda0..495564058e0 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -391,12 +391,31 @@ void fs_generator::generate_linterp(fs_inst *inst, struct brw_reg dst, struct brw_reg *src) { + /* PLN reads: + * / in SIMD16 \ + * ----------------------------------- + * | src1+0 | src1+1 | src1+2 | src1+3 | + * |-----------------------------------| + * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)| + * ----------------------------------- + * + * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys: + * + * ----------------------------------- + * | src1+0 | src1+1 | src1+2 | src1+3 | + * |-----------------------------------| + * |(x0, x1)|(y0, y1)| | | in SIMD8 + * |-----------------------------------| + * |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16 + * ----------------------------------- + * + * See also: emit_interpolation_setup_gen4(). + */ struct brw_reg delta_x = src[0]; - struct brw_reg delta_y = src[1]; - struct brw_reg interp = src[2]; + struct brw_reg delta_y = offset(src[0], dispatch_width / 8); + struct brw_reg interp = src[1]; if (brw->has_pln && - delta_y.nr == delta_x.nr + 1 && (brw->gen >= 7 || (delta_x.nr & 1) == 0)) { brw_PLN(p, dst, interp, delta_x); } else { diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 3972581ef12..e1687edb2b1 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -1482,8 +1482,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) */ no16("interpolate_at_* not yet supported in SIMD16 mode."); - fs_reg dst_x = vgrf(2); - fs_reg dst_y = offset(dst_x, 1); + fs_reg dst_xy = vgrf(2); /* For most messages, we need one reg of ignored data; the hardware * requires mlen==1 even when there is no payload. in the per-slot @@ -1495,7 +1494,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) switch (instr->intrinsic) { case nir_intrinsic_interp_var_at_centroid: - inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u)); + inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u)); break; case nir_intrinsic_interp_var_at_sample: { @@ -1503,7 +1502,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); assert(const_sample); unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0; - inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, + inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, fs_reg(msg_data)); break; } @@ -1515,7 +1514,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf; unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf; - inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src, + inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src, fs_reg(off_x | (off_y << 4))); } else { src = vgrf(glsl_type::ivec2_type); @@ -1548,7 +1547,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) } mlen = 2; - inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src, + inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src, fs_reg(0u)); } break; @@ -1567,7 +1566,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) fs_reg src = interp_reg(instr->variables[0]->var->data.location, j); src.type = dest.type; - emit(FS_OPCODE_LINTERP, dest, dst_x, dst_y, src); + emit(FS_OPCODE_LINTERP, dest, dst_xy, src); dest = offset(dest, 1); } break; diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 2a4054a29b7..47f5a42b187 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -244,7 +244,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width) } assert(reg == ra_reg_count); - /* Add a special class for aligned pairs, which we'll put delta_x/y + /* Add a special class for aligned pairs, which we'll put delta_xy * in on Gen <= 6 so that we can do PLN. */ if (devinfo->has_pln && reg_width == 1 && devinfo->gen <= 6) { @@ -558,14 +558,14 @@ fs_visitor::assign_regs(bool allow_spilling) * second operand of a PLN instruction needs to be an * even-numbered register, so we have a special register class * wm_aligned_pairs_class to handle this case. pre-GEN6 always - * uses this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the + * uses this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the * second operand of a PLN instruction (since it doesn't support * any other interpolation modes). So all we need to do is find * that register and set it to the appropriate class. */ if (screen->wm_reg_sets[rsi].aligned_pairs_class >= 0 && - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF && - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) { + this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF && + this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) { c = screen->wm_reg_sets[rsi].aligned_pairs_class; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 98c6988f6ad..7fdd4e566fa 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -593,8 +593,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) /* 1. collect interpolation factors */ - fs_reg dst_x = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1)); - fs_reg dst_y = offset(dst_x, 1); + fs_reg dst_xy = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1)); /* for most messages, we need one reg of ignored data; the hardware requires mlen==1 * even when there is no payload. in the per-slot offset case, we'll replace this with @@ -606,7 +605,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) switch (ir->operation) { case ir_unop_interpolate_at_centroid: - inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u)); + inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u)); break; case ir_binop_interpolate_at_sample: { @@ -614,7 +613,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) assert(sample_num || !"nonconstant sample number should have been lowered."); unsigned msg_data = sample_num->value.i[0] << 4; - inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data)); + inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, fs_reg(msg_data)); break; } @@ -623,7 +622,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) if (const_offset) { unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) | (pack_pixel_offset(const_offset->value.f[1]) << 4); - inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src, + inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src, fs_reg(msg_data)); } else { /* pack the operands: hw wants offsets as 4 bit signed ints */ @@ -656,7 +655,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) } mlen = 2 * reg_width; - inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src, + inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src, fs_reg(0u)); } break; @@ -678,8 +677,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) for (int i = 0; i < ir->type->vector_elements; i++) { int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i; - emit(FS_OPCODE_LINTERP, res, - dst_x, dst_y, + emit(FS_OPCODE_LINTERP, res, dst_xy, fs_reg(interp_reg(var->data.location, ch))); res = offset(res, 1); } @@ -3443,31 +3441,31 @@ fs_visitor::emit_interpolation_setup_gen4() fs_reg(brw_imm_v(0x11001100)))); this->current_annotation = "compute pixel deltas from v0"; - if (brw->has_pln) { - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = - vgrf(glsl_type::vec2_type); - this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = - offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1); + + this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = + vgrf(glsl_type::vec2_type); + const fs_reg &delta_xy = this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC]; + const fs_reg xstart(negate(brw_vec1_grf(1, 0))); + const fs_reg ystart(negate(brw_vec1_grf(1, 1))); + + if (brw->has_pln && dispatch_width == 16) { + emit(ADD(half(offset(delta_xy, 0), 0), half(this->pixel_x, 0), xstart)); + emit(ADD(half(offset(delta_xy, 0), 1), half(this->pixel_y, 0), ystart)); + emit(ADD(half(offset(delta_xy, 1), 0), half(this->pixel_x, 1), xstart)) + ->force_sechalf = true; + emit(ADD(half(offset(delta_xy, 1), 1), half(this->pixel_y, 1), ystart)) + ->force_sechalf = true; } else { - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = - vgrf(glsl_type::float_type); - this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = - vgrf(glsl_type::float_type); + emit(ADD(offset(delta_xy, 0), this->pixel_x, xstart)); + emit(ADD(offset(delta_xy, 1), this->pixel_y, ystart)); } - emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], - this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))))); - emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], - this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))))); this->current_annotation = "compute pos.w and 1/pos.w"; /* Compute wpos.w. It's always in our setup, since it's needed to * interpolate the other attributes. */ this->wpos_w = vgrf(glsl_type::float_type); - emit(FS_OPCODE_LINTERP, wpos_w, - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], - this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], - interp_reg(VARYING_SLOT_POS, 3)); + emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(VARYING_SLOT_POS, 3)); /* Compute the pixel 1/W value from wpos.w. */ this->pixel_w = vgrf(glsl_type::float_type); emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w); @@ -3509,8 +3507,7 @@ fs_visitor::emit_interpolation_setup_gen6() for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) { uint8_t reg = payload.barycentric_coord_reg[i]; - this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0)); - this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0)); + this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0)); } this->current_annotation = NULL; diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h index 3a50e864aad..1b2bb107f07 100644 --- a/src/mesa/drivers/dri/i965/brw_reg.h +++ b/src/mesa/drivers/dri/i965/brw_reg.h @@ -704,6 +704,13 @@ brw_vec8_grf(unsigned nr, unsigned subnr) return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); } +/** Construct float[16] general-purpose register */ +static inline struct brw_reg +brw_vec16_grf(unsigned nr, unsigned subnr) +{ + return brw_vec16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + static inline struct brw_reg brw_uw8_grf(unsigned nr, unsigned subnr) -- 2.30.2