From 8e7d066682e8c0ca59fe5d550cd5bbd2564d1776 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 11 Apr 2019 14:12:58 -0500 Subject: [PATCH] intel/fs: Actually implement the load_barycentric intrinsics If they never get used, dead code should clean them up. Also, we rework the at_offset and at_sample intrinsics so they return a proper vec2 instead of returning things in PLN layout. Fortunately, copy-prop is pretty good at cleaning this up and it doesn't result in any actual extra MOVs. Reviewed-by: Matt Turner --- src/intel/compiler/brw_fs.cpp | 33 ++++++++++++-- src/intel/compiler/brw_fs_nir.cpp | 72 +++++++++++++++++++++++++++---- 2 files changed, 93 insertions(+), 12 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 4151ed7485e..36d8191ee46 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -7743,6 +7743,27 @@ fs_visitor::run_cs(unsigned min_dispatch_width) return !failed; } +static bool +is_used_in_not_interp_frag_coord(nir_ssa_def *def) +{ + nir_foreach_use(src, def) { + if (src->parent_instr->type != nir_instr_type_intrinsic) + return true; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src->parent_instr); + if (intrin->intrinsic != nir_intrinsic_load_interpolated_input) + return true; + + if (nir_intrinsic_base(intrin) != VARYING_SLOT_POS) + return true; + } + + nir_foreach_if_use(src, def) + return true; + + return false; +} + /** * Return a bitfield where bit n is set if barycentric interpolation mode n * (see enum brw_barycentric_mode) is needed by the fragment shader. @@ -7767,14 +7788,20 @@ brw_compute_barycentric_interp_modes(const struct gen_device_info *devinfo, continue; nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - if (intrin->intrinsic != nir_intrinsic_load_interpolated_input) + switch (intrin->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + break; + default: continue; + } /* Ignore WPOS; it doesn't require interpolation. */ - if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS) + assert(intrin->dest.is_ssa); + if (!is_used_in_not_interp_frag_coord(&intrin->dest.ssa)) continue; - intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr); enum glsl_interp_mode interp = (enum glsl_interp_mode) nir_intrinsic_interp_mode(intrin); nir_intrinsic_op bary_op = intrin->intrinsic; diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 125bdc72032..a6729dd8ecb 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -3434,6 +3434,44 @@ alloc_frag_output(fs_visitor *v, unsigned location) unreachable("Invalid location"); } +/* Annoyingly, we get the barycentrics into the shader in a layout that's + * optimized for PLN but it doesn't work nearly as well as one would like for + * manual interpolation. + */ +static void +shuffle_from_pln_layout(const fs_builder &bld, fs_reg dest, fs_reg pln_data) +{ + dest.type = BRW_REGISTER_TYPE_F; + pln_data.type = BRW_REGISTER_TYPE_F; + const fs_reg dest_u = offset(dest, bld, 0); + const fs_reg dest_v = offset(dest, bld, 1); + + for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) { + const fs_builder gbld = bld.group(8, g); + gbld.MOV(horiz_offset(dest_u, g * 8), + byte_offset(pln_data, (g * 2 + 0) * REG_SIZE)); + gbld.MOV(horiz_offset(dest_v, g * 8), + byte_offset(pln_data, (g * 2 + 1) * REG_SIZE)); + } +} + +static void +shuffle_to_pln_layout(const fs_builder &bld, fs_reg pln_data, fs_reg src) +{ + pln_data.type = BRW_REGISTER_TYPE_F; + src.type = BRW_REGISTER_TYPE_F; + const fs_reg src_u = offset(src, bld, 0); + const fs_reg src_v = offset(src, bld, 1); + + for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) { + const fs_builder gbld = bld.group(8, g); + gbld.MOV(byte_offset(pln_data, (g * 2 + 0) * REG_SIZE), + horiz_offset(src_u, g * 8)); + gbld.MOV(byte_offset(pln_data, (g * 2 + 1) * REG_SIZE), + horiz_offset(src_v, g * 8)); + } +} + void fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) @@ -3615,20 +3653,28 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, case nir_intrinsic_load_barycentric_pixel: case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - /* Do nothing - load_interpolated_input handling will handle it later. */ + case nir_intrinsic_load_barycentric_sample: { + /* Use the delta_xy values computed from the payload */ + const glsl_interp_mode interp_mode = + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); + enum brw_barycentric_mode bary = + brw_barycentric_mode(interp_mode, instr->intrinsic); + + shuffle_from_pln_layout(bld, dest, this->delta_xy[bary]); break; + } case nir_intrinsic_load_barycentric_at_sample: { const glsl_interp_mode interpolation = (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2); if (nir_src_is_const(instr->src[0])) { unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4; emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - dest, + tmp, fs_reg(), /* src */ brw_imm_ud(msg_data), interpolation); @@ -3643,7 +3689,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, .SHL(msg_data, sample_id, brw_imm_ud(4u)); emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - dest, + tmp, fs_reg(), /* src */ msg_data, interpolation); @@ -3671,7 +3717,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, fs_inst *inst = emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - dest, + tmp, fs_reg(), /* src */ component(msg_data, 0), interpolation); @@ -3683,6 +3729,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, bld.emit(BRW_OPCODE_WHILE)); } } + shuffle_from_pln_layout(bld, dest, tmp); break; } @@ -3692,6 +3739,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2); if (const_offset) { assert(nir_src_bit_size(instr->src[0]) == 32); unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf; @@ -3699,7 +3747,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, - dest, + tmp, fs_reg(), /* src */ brw_imm_ud(off_x | (off_y << 4)), interpolation); @@ -3736,11 +3784,12 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; emit_pixel_interpolater_send(bld, opcode, - dest, + tmp, src, brw_imm_ud(0u), interpolation); } + shuffle_from_pln_layout(bld, dest, tmp); break; } @@ -3761,8 +3810,13 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || bary_intrin == nir_intrinsic_load_barycentric_at_sample) { - /* Use the result of the PI message */ - dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); + /* Use the result of the PI message. Because the load_barycentric + * intrinsics return a regular vec2 and we need it in PLN layout, we + * have to do a translation. Fortunately, copy-prop cleans this up + * reliably. + */ + dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2); + shuffle_to_pln_layout(bld, dst_xy, get_nir_src(instr->src[0])); } else { /* Use the delta_xy values computed from the payload */ enum brw_barycentric_mode bary = -- 2.30.2