From 1eef0b73aa323d94d5a080cd1efa81ccacdbd0d2 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Tue, 12 Jul 2016 03:57:25 -0700 Subject: [PATCH] i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke Reviewed-by: Chris Forbes Acked-by: Jason Ekstrand --- src/mesa/drivers/dri/i965/brw_compiler.c | 1 + src/mesa/drivers/dri/i965/brw_fs.cpp | 175 +++------- src/mesa/drivers/dri/i965/brw_fs.h | 9 +- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 410 ++++++++++++----------- src/mesa/drivers/dri/i965/brw_nir.c | 16 +- 5 files changed, 270 insertions(+), 341 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c index a4855a09137..afb70a139ae 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.c +++ b/src/mesa/drivers/dri/i965/brw_compiler.c @@ -40,6 +40,7 @@ .lower_fdiv = true, \ .lower_flrp64 = true, \ .native_integers = true, \ + .use_interpolated_input_intrinsics = true, \ .vertex_id_zero_based = true static const struct nir_shader_compiler_options scalar_nir_options = { diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index fc91bbcfa46..4aae5742278 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1070,21 +1070,27 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg wpos) bld.MOV(wpos, this->wpos_w); } -static enum brw_barycentric_mode -barycentric_mode(enum glsl_interp_mode mode, - bool is_centroid, bool is_sample) +enum brw_barycentric_mode +brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op) { - unsigned bary; - /* Barycentric modes don't make sense for flat inputs. */ assert(mode != INTERP_MODE_FLAT); - if (is_sample) { - bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE; - } else if (is_centroid) { - bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID; - } else { + unsigned bary; + switch (op) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_at_offset: bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL; + break; + case nir_intrinsic_load_barycentric_centroid: + bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID; + break; + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE; + break; + default: + unreachable("invalid intrinsic"); } if (mode == INTERP_MODE_NOPERSPECTIVE) @@ -1104,107 +1110,6 @@ centroid_to_pixel(enum brw_barycentric_mode bary) return (enum brw_barycentric_mode) ((unsigned) bary - 1); } -void -fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name, - const glsl_type *type, - glsl_interp_mode interpolation_mode, - int *location, bool mod_centroid, - bool mod_sample) -{ - assert(stage == MESA_SHADER_FRAGMENT); - brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; - - if (type->is_array() || type->is_matrix()) { - const glsl_type *elem_type = glsl_get_array_element(type); - const unsigned length = glsl_get_length(type); - - for (unsigned i = 0; i < length; i++) { - emit_general_interpolation(attr, name, elem_type, interpolation_mode, - location, mod_centroid, mod_sample); - } - } else if (type->is_record()) { - for (unsigned i = 0; i < type->length; i++) { - const glsl_type *field_type = type->fields.structure[i].type; - emit_general_interpolation(attr, name, field_type, interpolation_mode, - location, mod_centroid, mod_sample); - } - } else { - assert(type->is_scalar() || type->is_vector()); - - if (prog_data->urb_setup[*location] == -1) { - /* If there's no incoming setup data for this slot, don't - * emit interpolation for it. - */ - *attr = offset(*attr, bld, type->vector_elements); - (*location)++; - return; - } - - attr->type = brw_type_for_base_type(type->get_scalar_type()); - - if (interpolation_mode == INTERP_MODE_FLAT) { - /* Constant interpolation (flat shading) case. The SF has - * handed us defined values in only the constant offset - * field of the setup reg. - */ - unsigned vector_elements = type->vector_elements; - - /* Data starts at suboffet 3 in 32-bit units (12 bytes), so it is not - * 64-bit aligned and the current implementation fails to read the - * data properly. Instead, when there is a double input varying, - * read it as vector of floats with twice the number of components. - */ - if (attr->type == BRW_REGISTER_TYPE_DF) { - vector_elements *= 2; - attr->type = BRW_REGISTER_TYPE_F; - } - for (unsigned int i = 0; i < vector_elements; i++) { - struct brw_reg interp = interp_reg(*location, i); - interp = suboffset(interp, 3); - interp.type = attr->type; - bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp)); - *attr = offset(*attr, bld, 1); - } - } else { - /* Smooth/noperspective interpolation case. */ - enum brw_barycentric_mode bary = - barycentric_mode(interpolation_mode, mod_centroid, mod_sample); - - for (unsigned int i = 0; i < type->vector_elements; i++) { - fs_reg interp(interp_reg(*location, i)); - if (devinfo->needs_unlit_centroid_workaround && mod_centroid) { - /* Get the pixel/sample mask into f0 so that we know - * which pixels are lit. Then, for each channel that is - * unlit, replace the centroid data with non-centroid - * data. - */ - bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); - - fs_inst *inst; - inst = bld.emit(FS_OPCODE_LINTERP, *attr, - delta_xy[centroid_to_pixel(bary)], interp); - inst->predicate = BRW_PREDICATE_NORMAL; - inst->predicate_inverse = true; - inst->no_dd_clear = true; - - inst = bld.emit(FS_OPCODE_LINTERP, *attr, - delta_xy[bary], interp); - inst->predicate = BRW_PREDICATE_NORMAL; - inst->predicate_inverse = false; - inst->no_dd_check = true; - } else { - bld.emit(FS_OPCODE_LINTERP, *attr, delta_xy[bary], interp); - } - if (devinfo->gen < 6 && interpolation_mode == INTERP_MODE_SMOOTH) { - bld.MUL(*attr, *attr, this->pixel_w); - } - *attr = offset(*attr, bld, 1); - } - } - (*location)++; - } -} - fs_reg * fs_visitor::emit_frontfacing_interpolation() { @@ -6330,6 +6235,10 @@ fs_visitor::run_cs() /** * Return a bitfield where bit n is set if barycentric interpolation mode n * (see enum brw_barycentric_mode) is needed by the fragment shader. + * + * We examine the load_barycentric intrinsics rather than looking at input + * variables so that we catch interpolateAtCentroid() messages too, which + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up. */ static unsigned brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo, @@ -6337,29 +6246,37 @@ brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo, { unsigned barycentric_interp_modes = 0; - nir_foreach_variable(var, &shader->inputs) { - /* Ignore WPOS; it doesn't require interpolation. */ - if (var->data.location == VARYING_SLOT_POS) + nir_foreach_function(f, shader) { + if (!f->impl) continue; - /* Flat inputs don't need barycentric modes. */ - if (var->data.interpolation == INTERP_MODE_FLAT) - continue; + nir_foreach_block(block, f->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; - /* Determine the set (or sets) of barycentric coordinates needed to - * interpolate this variable. Note that when - * brw->needs_unlit_centroid_workaround is set, centroid interpolation - * uses PIXEL interpolation for unlit pixels and CENTROID interpolation - * for lit pixels, so we need both sets of barycentric coordinates. - */ - enum brw_barycentric_mode bary_mode = - barycentric_mode((glsl_interp_mode) var->data.interpolation, - var->data.centroid, var->data.sample); + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_interpolated_input) + continue; + + /* Ignore WPOS; it doesn't require interpolation. */ + if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS) + continue; - barycentric_interp_modes |= 1 << bary_mode; + intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr); + enum glsl_interp_mode interp = (enum glsl_interp_mode) + nir_intrinsic_interp_mode(intrin); + nir_intrinsic_op bary_op = intrin->intrinsic; + enum brw_barycentric_mode bary = + brw_barycentric_mode(interp, bary_op); - if (var->data.centroid && devinfo->needs_unlit_centroid_workaround) - barycentric_interp_modes |= 1 << centroid_to_pixel(bary_mode); + barycentric_interp_modes |= 1 << bary; + + if (devinfo->needs_unlit_centroid_workaround && + bary_op == nir_intrinsic_load_barycentric_centroid) + barycentric_interp_modes |= 1 << centroid_to_pixel(bary); + } + } } return barycentric_interp_modes; diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 7998f514155..574475f071a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -174,11 +174,6 @@ public: fs_reg *emit_samplepos_setup(); fs_reg *emit_sampleid_setup(); fs_reg *emit_samplemaskin_setup(); - void emit_general_interpolation(fs_reg *attr, const char *name, - const glsl_type *type, - glsl_interp_mode interpolation_mode, - int *location, bool mod_centroid, - bool mod_sample); fs_reg *emit_vs_system_value(int location); void emit_interpolation_setup_gen4(); void emit_interpolation_setup_gen6(); @@ -195,7 +190,6 @@ public: bool opt_zero_samples(); void emit_nir_code(); - void nir_setup_inputs(); void nir_setup_single_output_varying(fs_reg *reg, const glsl_type *type, unsigned *location); void nir_setup_outputs(); @@ -511,3 +505,6 @@ void shuffle_64bit_data_for_32bit_write(const brw::fs_builder &bld, uint32_t components); fs_reg setup_imm_df(const brw::fs_builder &bld, double v); + +enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode mode, + nir_intrinsic_op op); diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 44c4bdc63ab..22cba8c32db 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -36,7 +36,6 @@ fs_visitor::emit_nir_code() /* emit the arrays used for inputs and outputs - load/store intrinsics will * be converted to reads/writes of these arrays */ - nir_setup_inputs(); nir_setup_outputs(); nir_setup_uniforms(); nir_emit_system_values(); @@ -49,38 +48,6 @@ fs_visitor::emit_nir_code() } } -void -fs_visitor::nir_setup_inputs() -{ - if (stage != MESA_SHADER_FRAGMENT) - return; - - nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs); - - nir_foreach_variable(var, &nir->inputs) { - fs_reg input = offset(nir_inputs, bld, var->data.driver_location); - - fs_reg reg; - if (var->data.location == VARYING_SLOT_POS) { - emit_fragcoord_interpolation(input); - } else if (var->data.location == VARYING_SLOT_LAYER) { - struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER, 1), 3); - reg.type = BRW_REGISTER_TYPE_D; - bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg); - } else if (var->data.location == VARYING_SLOT_VIEWPORT) { - struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_VIEWPORT, 2), 3); - reg.type = BRW_REGISTER_TYPE_D; - bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg); - } else { - int location = var->data.location; - emit_general_interpolation(&input, var->name, var->type, - (glsl_interp_mode) var->data.interpolation, - &location, var->data.centroid, - var->data.sample); - } - } -} - void fs_visitor::nir_setup_single_output_varying(fs_reg *reg, const glsl_type *type, @@ -3141,7 +3108,6 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { assert(stage == MESA_SHADER_FRAGMENT); - const struct brw_wm_prog_key *wm_key = (const struct brw_wm_prog_key *) key; fs_reg dest; if (nir_intrinsic_infos[instr->intrinsic].has_dest) @@ -3198,189 +3164,245 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, break; } - case nir_intrinsic_interp_var_at_centroid: - case nir_intrinsic_interp_var_at_sample: - case nir_intrinsic_interp_var_at_offset: { - /* Handle ARB_gpu_shader5 interpolation intrinsics - * - * It's worth a quick word of explanation as to why we handle the full - * variable-based interpolation intrinsic rather than a lowered version - * with like we do for other inputs. We have to do that because the way - * we set up inputs doesn't allow us to use the already setup inputs for - * interpolation. At the beginning of the shader, we go through all of - * the input variables and do the initial interpolation and put it in - * the nir_inputs array based on its location as determined in - * nir_lower_io. If the input isn't used, dead code cleans up and - * everything works fine. However, when we get to the ARB_gpu_shader5 - * interpolation intrinsics, we need to reinterpolate the input - * differently. If we used an intrinsic that just had an index it would - * only give us the offset into the nir_inputs array. However, this is - * useless because that value is post-interpolation and we need - * pre-interpolation. In order to get the actual location of the bits - * we get from the vertex fetching hardware, we need the variable. - */ - fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2); - const glsl_interp_mode interpolation = - (glsl_interp_mode) instr->variables[0]->var->data.interpolation; + case nir_intrinsic_load_input: { + /* load_input is only used for flat inputs */ + unsigned base = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + unsigned num_components = instr->num_components; + enum brw_reg_type type = dest.type; - switch (instr->intrinsic) { - case nir_intrinsic_interp_var_at_centroid: - emit_pixel_interpolater_send(bld, - FS_OPCODE_INTERPOLATE_AT_CENTROID, - dst_xy, - fs_reg(), /* src */ - brw_imm_ud(0u), - interpolation); - break; + /* Special case fields in the VUE header */ + if (base == VARYING_SLOT_LAYER) + component = 1; + else if (base == VARYING_SLOT_VIEWPORT) + component = 2; - case nir_intrinsic_interp_var_at_sample: { - if (!wm_key->multisample_fbo) { - /* From the ARB_gpu_shader5 specification: - * "If multisample buffers are not available, the input varying - * will be evaluated at the center of the pixel." - */ - emit_pixel_interpolater_send(bld, - FS_OPCODE_INTERPOLATE_AT_CENTROID, - dst_xy, - fs_reg(), /* src */ - brw_imm_ud(0u), - interpolation); - break; - } + if (nir_dest_bit_size(instr->dest) == 64) { + /* const_index is in 32-bit type size units that could not be aligned + * with DF. We need to read the double vector as if it was a float + * vector of twice the number of components to fetch the right data. + */ + type = BRW_REGISTER_TYPE_F; + num_components *= 2; + } - nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); + for (unsigned int i = 0; i < num_components; i++) { + struct brw_reg interp = interp_reg(base, component + i); + interp = suboffset(interp, 3); + bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i), + retype(fs_reg(interp), type)); + } - if (const_sample) { - unsigned msg_data = const_sample->i32[0] << 4; + if (nir_dest_bit_size(instr->dest) == 64) { + shuffle_32bit_load_result_to_64bit_data(bld, + dest, + retype(dest, type), + instr->num_components); + } + break; + } + + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + /* Do nothing - load_interpolated_input handling will handle it later. */ + break; + case nir_intrinsic_load_barycentric_at_sample: { + const glsl_interp_mode interpolation = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); + + nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); + + if (const_sample) { + unsigned msg_data = const_sample->i32[0] << 4; + + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dest, + fs_reg(), /* src */ + brw_imm_ud(msg_data), + interpolation); + } else { + const fs_reg sample_src = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_UD); + + if (nir_src_is_dynamically_uniform(instr->src[0])) { + const fs_reg sample_id = bld.emit_uniformize(sample_src); + const fs_reg msg_data = vgrf(glsl_type::uint_type); + bld.exec_all().group(1, 0) + .SHL(msg_data, sample_id, brw_imm_ud(4u)); emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - dst_xy, + dest, fs_reg(), /* src */ - brw_imm_ud(msg_data), + msg_data, interpolation); } else { - const fs_reg sample_src = retype(get_nir_src(instr->src[0]), - BRW_REGISTER_TYPE_UD); - - if (nir_src_is_dynamically_uniform(instr->src[0])) { - const fs_reg sample_id = bld.emit_uniformize(sample_src); - const fs_reg msg_data = vgrf(glsl_type::uint_type); - bld.exec_all().group(1, 0) - .SHL(msg_data, sample_id, brw_imm_ud(4u)); + /* Make a loop that sends a message to the pixel interpolater + * for the sample number in each live channel. If there are + * multiple channels with the same sample number then these + * will be handled simultaneously with a single interation of + * the loop. + */ + bld.emit(BRW_OPCODE_DO); + + /* Get the next live sample number into sample_id_reg */ + const fs_reg sample_id = bld.emit_uniformize(sample_src); + + /* Set the flag register so that we can perform the send + * message on all channels that have the same sample number + */ + bld.CMP(bld.null_reg_ud(), + sample_src, sample_id, + BRW_CONDITIONAL_EQ); + const fs_reg msg_data = vgrf(glsl_type::uint_type); + bld.exec_all().group(1, 0) + .SHL(msg_data, sample_id, brw_imm_ud(4u)); + fs_inst *inst = emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - dst_xy, + dest, fs_reg(), /* src */ msg_data, interpolation); - } else { - /* Make a loop that sends a message to the pixel interpolater - * for the sample number in each live channel. If there are - * multiple channels with the same sample number then these - * will be handled simultaneously with a single interation of - * the loop. - */ - bld.emit(BRW_OPCODE_DO); - - /* Get the next live sample number into sample_id_reg */ - const fs_reg sample_id = bld.emit_uniformize(sample_src); + set_predicate(BRW_PREDICATE_NORMAL, inst); - /* Set the flag register so that we can perform the send - * message on all channels that have the same sample number - */ - bld.CMP(bld.null_reg_ud(), - sample_src, sample_id, - BRW_CONDITIONAL_EQ); - const fs_reg msg_data = vgrf(glsl_type::uint_type); - bld.exec_all().group(1, 0) - .SHL(msg_data, sample_id, brw_imm_ud(4u)); - fs_inst *inst = - emit_pixel_interpolater_send(bld, - FS_OPCODE_INTERPOLATE_AT_SAMPLE, - dst_xy, - fs_reg(), /* src */ - msg_data, - interpolation); - set_predicate(BRW_PREDICATE_NORMAL, inst); - - /* Continue the loop if there are any live channels left */ - set_predicate_inv(BRW_PREDICATE_NORMAL, - true, /* inverse */ - bld.emit(BRW_OPCODE_WHILE)); - } + /* Continue the loop if there are any live channels left */ + set_predicate_inv(BRW_PREDICATE_NORMAL, + true, /* inverse */ + bld.emit(BRW_OPCODE_WHILE)); } - - break; } + break; + } - case nir_intrinsic_interp_var_at_offset: { - nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + case nir_intrinsic_load_barycentric_at_offset: { + const glsl_interp_mode interpolation = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); - if (const_offset) { - unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf; - unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf; + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); - emit_pixel_interpolater_send(bld, - FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, - dst_xy, - fs_reg(), /* src */ - brw_imm_ud(off_x | (off_y << 4)), - interpolation); - } else { - fs_reg src = vgrf(glsl_type::ivec2_type); - fs_reg offset_src = retype(get_nir_src(instr->src[0]), - BRW_REGISTER_TYPE_F); - for (int i = 0; i < 2; i++) { - fs_reg temp = vgrf(glsl_type::float_type); - bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f)); - fs_reg itemp = vgrf(glsl_type::int_type); - /* float to int */ - bld.MOV(itemp, temp); - - /* Clamp the upper end of the range to +7/16. - * ARB_gpu_shader5 requires that we support a maximum offset - * of +0.5, which isn't representable in a S0.4 value -- if - * we didn't clamp it, we'd end up with -8/16, which is the - * opposite of what the shader author wanted. - * - * This is legal due to ARB_gpu_shader5's quantization - * rules: - * - * "Not all values of may be supported; x and y - * offsets may be rounded to fixed-point values with the - * number of fraction bits given by the - * implementation-dependent constant - * FRAGMENT_INTERPOLATION_OFFSET_BITS" - */ - set_condmod(BRW_CONDITIONAL_L, - bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7))); - } + if (const_offset) { + unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf; + unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf; - const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; - emit_pixel_interpolater_send(bld, - opcode, - dst_xy, - src, - brw_imm_ud(0u), - interpolation); + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, + dest, + fs_reg(), /* src */ + brw_imm_ud(off_x | (off_y << 4)), + interpolation); + } else { + fs_reg src = vgrf(glsl_type::ivec2_type); + fs_reg offset_src = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_F); + for (int i = 0; i < 2; i++) { + fs_reg temp = vgrf(glsl_type::float_type); + bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f)); + fs_reg itemp = vgrf(glsl_type::int_type); + /* float to int */ + bld.MOV(itemp, temp); + + /* Clamp the upper end of the range to +7/16. + * ARB_gpu_shader5 requires that we support a maximum offset + * of +0.5, which isn't representable in a S0.4 value -- if + * we didn't clamp it, we'd end up with -8/16, which is the + * opposite of what the shader author wanted. + * + * This is legal due to ARB_gpu_shader5's quantization + * rules: + * + * "Not all values of may be supported; x and y + * offsets may be rounded to fixed-point values with the + * number of fraction bits given by the + * implementation-dependent constant + * FRAGMENT_INTERPOLATION_OFFSET_BITS" + */ + set_condmod(BRW_CONDITIONAL_L, + bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7))); } + + const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; + emit_pixel_interpolater_send(bld, + opcode, + dest, + src, + brw_imm_ud(0u), + interpolation); + } + break; + } + + case nir_intrinsic_load_interpolated_input: { + if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) { + emit_fragcoord_interpolation(dest); break; } - default: - unreachable("Invalid intrinsic"); + assert(instr->src[0].ssa && + instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic); + nir_intrinsic_instr *bary_intrinsic = + nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); + nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; + enum glsl_interp_mode interp_mode = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); + fs_reg dst_xy; + + if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || + bary_intrin == nir_intrinsic_load_barycentric_at_sample) { + /* Use the result of the PI message */ + dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); + } else { + /* Use the delta_xy values computed from the payload */ + enum brw_barycentric_mode bary = + brw_barycentric_mode(interp_mode, bary_intrin); + + dst_xy = this->delta_xy[bary]; } - for (unsigned j = 0; j < instr->num_components; j++) { - fs_reg src = interp_reg(instr->variables[0]->var->data.location, j); - src.type = dest.type; + for (unsigned int i = 0; i < instr->num_components; i++) { + fs_reg interp = + fs_reg(interp_reg(nir_intrinsic_base(instr), + nir_intrinsic_component(instr) + i)); + interp.type = BRW_REGISTER_TYPE_F; + dest.type = BRW_REGISTER_TYPE_F; - bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src); - dest = offset(dest, bld, 1); + if (devinfo->needs_unlit_centroid_workaround && + bary_intrin == nir_intrinsic_load_barycentric_centroid) { + + /* Get the pixel/sample mask into f0 so that we know which + * pixels are lit. Then, for each channel that is unlit, + * replace the centroid data with non-centroid data. + */ + bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); + + fs_reg dest_i = offset(dest, bld, i); + fs_reg dst_xy_pixel = + delta_xy[brw_barycentric_mode(interp_mode, + nir_intrinsic_load_barycentric_pixel)]; + + fs_inst *inst; + inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy_pixel, interp); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = true; + inst->no_dd_clear = true; + + inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy, interp); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = false; + inst->no_dd_check = true; + } else if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) { + fs_reg tmp = vgrf(glsl_type::float_type); + bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); + bld.MUL(offset(dest, bld, i), tmp, this->pixel_w); + } else { + bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); + } } break; } + default: nir_emit_intrinsic(bld, instr); break; @@ -3947,26 +3969,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr } case nir_intrinsic_load_input: { - fs_reg src; + fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type); unsigned num_components = instr->num_components; enum brw_reg_type type = dest.type; - if (stage == MESA_SHADER_VERTEX) { - src = fs_reg(ATTR, instr->const_index[0], dest.type); - } else { - assert(type_sz(type) >= 4); - if (type == BRW_REGISTER_TYPE_DF) { - /* const_index is in 32-bit type size units that could not be aligned - * with DF. We need to read the double vector as if it was a float - * vector of twice the number of components to fetch the right data. - */ - dest = retype(dest, BRW_REGISTER_TYPE_F); - num_components *= 2; - } - src = offset(retype(nir_inputs, dest.type), bld, - instr->const_index[0]); - } - nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); assert(const_offset && "Indirect input loads not allowed"); src = offset(src, bld, const_offset->u32[0]); diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 6c3e1d184e4..fe7653137ea 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -30,7 +30,8 @@ static bool is_input(nir_intrinsic_instr *intrin) { return intrin->intrinsic == nir_intrinsic_load_input || - intrin->intrinsic == nir_intrinsic_load_per_vertex_input; + intrin->intrinsic == nir_intrinsic_load_per_vertex_input || + intrin->intrinsic == nir_intrinsic_load_interpolated_input; } static bool @@ -282,9 +283,16 @@ brw_nir_lower_tes_inputs(nir_shader *nir, const struct brw_vue_map *vue_map) void brw_nir_lower_fs_inputs(nir_shader *nir) { - nir_assign_var_locations(&nir->inputs, &nir->num_inputs, VARYING_SLOT_VAR0, - type_size_scalar); - nir_lower_io(nir, nir_var_shader_in, type_size_scalar); + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + var->data.driver_location = var->data.location; + } + + nir_lower_io(nir, nir_var_shader_in, type_size_vec4); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + add_const_offset_to_base(nir, nir_var_shader_in); } void -- 2.30.2