From: Eric Anholt Date: Fri, 22 Feb 2019 22:26:26 +0000 (-0800) Subject: v3d: Move the stores for fixed function VS output reads into NIR. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=2780a99ff80cf84f887e8a1dca0079271f90f947;p=mesa.git v3d: Move the stores for fixed function VS output reads into NIR. This lets us emit the VPM_WRITEs directly from nir_intrinsic_store_output() (useful once NIR scheduling is in place so that we can reduce register pressure), and lets future NIR scheduling schedule the math to generate them. Even in the meantime, it looks like this lets NIR DCE some more code and make better decisions. total instructions in shared programs: 6429246 -> 6412976 (-0.25%) total threads in shared programs: 153924 -> 153934 (<.01%) total loops in shared programs: 486 -> 483 (-0.62%) total uniforms in shared programs: 2385436 -> 2388195 (0.12%) Acked-by: Ian Romanick (nir) --- diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index f5729ffa238..8faca1502bf 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -1284,151 +1284,19 @@ emit_frag_end(struct v3d_compile *c) } static void -vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t *vpm_index) +vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { if (c->devinfo->ver >= 40) { - vir_STVPMV(c, vir_uniform_ui(c, *vpm_index), val); - *vpm_index = *vpm_index + 1; + vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val); } else { + /* XXX: v3d33_vir_vpm_write_setup(c); */ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); } - - c->num_vpm_writes++; -} - -static void -emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w, - uint32_t *vpm_index) -{ - for (int i = 0; i < 2; i++) { - struct qreg coord = c->outputs[c->output_position_index + i]; - coord = vir_FMUL(c, coord, - vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, - 0)); - coord = vir_FMUL(c, coord, rcp_w); - 
vir_VPM_WRITE(c, vir_FTOIN(c, coord), vpm_index); - } - -} - -static void -emit_zs_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index) -{ - struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0); - struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0); - - struct qreg z = c->outputs[c->output_position_index + 2]; - z = vir_FMUL(c, z, zscale); - z = vir_FMUL(c, z, rcp_w); - z = vir_FADD(c, z, zoffset); - vir_VPM_WRITE(c, z, vpm_index); -} - -static void -emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index) -{ - vir_VPM_WRITE(c, rcp_w, vpm_index); -} - -static void -emit_point_size_write(struct v3d_compile *c, uint32_t *vpm_index) -{ - struct qreg point_size; - - if (c->output_point_size_index != -1) - point_size = c->outputs[c->output_point_size_index]; - else - point_size = vir_uniform_f(c, 1.0); - - /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835, - * BCM21553). - */ - point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125)); - - vir_VPM_WRITE(c, point_size, vpm_index); -} - -static void -emit_vpm_write_setup(struct v3d_compile *c) -{ - if (c->devinfo->ver >= 40) - return; - - v3d33_vir_vpm_write_setup(c); -} - -/** - * Sets up c->outputs[c->output_position_index] for the vertex shader - * epilogue, if an output vertex position wasn't specified in the user's - * shader. This may be the case for transform feedback with rasterizer - * discard enabled. 
- */ -static void -setup_default_position(struct v3d_compile *c) -{ - if (c->output_position_index != -1) - return; - - c->output_position_index = c->outputs_array_size; - for (int i = 0; i < 4; i++) { - add_output(c, - c->output_position_index + i, - VARYING_SLOT_POS, i); - } } static void emit_vert_end(struct v3d_compile *c) { - setup_default_position(c); - - uint32_t vpm_index = 0; - struct qreg rcp_w = vir_RECIP(c, - c->outputs[c->output_position_index + 3]); - - emit_vpm_write_setup(c); - - if (c->vs_key->is_coord) { - for (int i = 0; i < 4; i++) - vir_VPM_WRITE(c, c->outputs[c->output_position_index + i], - &vpm_index); - emit_scaled_viewport_write(c, rcp_w, &vpm_index); - if (c->vs_key->per_vertex_point_size) { - emit_point_size_write(c, &vpm_index); - /* emit_rcp_wc_write(c, rcp_w); */ - } - /* XXX: Z-only rendering */ - if (0) - emit_zs_write(c, rcp_w, &vpm_index); - } else { - emit_scaled_viewport_write(c, rcp_w, &vpm_index); - emit_zs_write(c, rcp_w, &vpm_index); - emit_rcp_wc_write(c, rcp_w, &vpm_index); - if (c->vs_key->per_vertex_point_size) - emit_point_size_write(c, &vpm_index); - } - - for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { - struct v3d_varying_slot input = c->vs_key->fs_inputs[i]; - int j; - - for (j = 0; j < c->num_outputs; j++) { - struct v3d_varying_slot output = c->output_slots[j]; - - if (!memcmp(&input, &output, sizeof(input))) { - vir_VPM_WRITE(c, c->outputs[j], - &vpm_index); - break; - } - } - /* Emit padding if we didn't find a declared VS output for - * this FS input. - */ - if (j == c->num_outputs) - vir_VPM_WRITE(c, vir_uniform_f(c, 0.0), - &vpm_index); - } - /* GFXH-1684: VPM writes need to be complete by the end of the shader. 
*/ if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) @@ -1619,6 +1487,9 @@ ntq_setup_fs_inputs(struct v3d_compile *c) static void ntq_setup_outputs(struct v3d_compile *c) { + if (c->s->info.stage != MESA_SHADER_FRAGMENT) + return; + nir_foreach_variable(var, &c->s->outputs) { unsigned array_len = MAX2(glsl_get_length(var->type), 1); unsigned loc = var->data.driver_location * 4; @@ -1632,37 +1503,26 @@ ntq_setup_outputs(struct v3d_compile *c) var->data.location_frac + i); } - if (c->s->info.stage == MESA_SHADER_FRAGMENT) { - switch (var->data.location) { - case FRAG_RESULT_COLOR: - c->output_color_var[0] = var; - c->output_color_var[1] = var; - c->output_color_var[2] = var; - c->output_color_var[3] = var; - break; - case FRAG_RESULT_DATA0: - case FRAG_RESULT_DATA1: - case FRAG_RESULT_DATA2: - case FRAG_RESULT_DATA3: - c->output_color_var[var->data.location - - FRAG_RESULT_DATA0] = var; - break; - case FRAG_RESULT_DEPTH: - c->output_position_index = loc; - break; - case FRAG_RESULT_SAMPLE_MASK: - c->output_sample_mask_index = loc; - break; - } - } else { - switch (var->data.location) { - case VARYING_SLOT_POS: - c->output_position_index = loc; - break; - case VARYING_SLOT_PSIZ: - c->output_point_size_index = loc; - break; - } + switch (var->data.location) { + case FRAG_RESULT_COLOR: + c->output_color_var[0] = var; + c->output_color_var[1] = var; + c->output_color_var[2] = var; + c->output_color_var[3] = var; + break; + case FRAG_RESULT_DATA0: + case FRAG_RESULT_DATA1: + case FRAG_RESULT_DATA2: + case FRAG_RESULT_DATA3: + c->output_color_var[var->data.location - + FRAG_RESULT_DATA0] = var; + break; + case FRAG_RESULT_DEPTH: + c->output_position_index = loc; + break; + case FRAG_RESULT_SAMPLE_MASK: + c->output_sample_mask_index = loc; + break; } } } @@ -1842,6 +1702,26 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) } break; + case nir_intrinsic_load_viewport_x_scale: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, 
QUNIFORM_VIEWPORT_X_SCALE, 0)); + break; + + case nir_intrinsic_load_viewport_y_scale: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); + break; + + case nir_intrinsic_load_viewport_z_scale: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); + break; + + case nir_intrinsic_load_viewport_z_offset: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); + break; + case nir_intrinsic_load_alpha_ref_float: ntq_store_dest(c, &instr->dest, 0, vir_uniform(c, QUNIFORM_ALPHA_REF, 0)); @@ -1919,16 +1799,23 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_store_output: - offset = ((nir_intrinsic_base(instr) + - nir_src_as_uint(instr->src[1])) * 4 + - nir_intrinsic_component(instr)); + if (c->s->info.stage == MESA_SHADER_FRAGMENT) { + offset = ((nir_intrinsic_base(instr) + + nir_src_as_uint(instr->src[1])) * 4 + + nir_intrinsic_component(instr)); + for (int i = 0; i < instr->num_components; i++) { + c->outputs[offset + i] = + vir_MOV(c, + ntq_get_src(c, + instr->src[0], i)); + } + } else { + assert(instr->num_components == 1); - for (int i = 0; i < instr->num_components; i++) { - c->outputs[offset + i] = - vir_MOV(c, ntq_get_src(c, instr->src[0], i)); + vir_VPM_WRITE(c, + ntq_get_src(c, instr->src[0], 0), + nir_intrinsic_base(instr)); } - c->num_outputs = MAX2(c->num_outputs, - offset + instr->num_components); break; case nir_intrinsic_image_deref_size: diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 11d4cc3b7b1..c8b995b3b4f 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -563,7 +563,7 @@ struct v3d_compile { int local_invocation_index_bits; uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4]; - uint32_t num_vpm_writes; + uint32_t vpm_output_size; /* Size in bytes of registers that have been spilled. 
This is how much * space needs to be available in the spill BO per thread per QPU. @@ -607,10 +607,8 @@ struct v3d_compile { enum quniform_contents *uniform_contents; uint32_t uniform_array_size; uint32_t num_uniforms; - uint32_t num_outputs; uint32_t output_position_index; nir_variable *output_color_var[4]; - uint32_t output_point_size_index; uint32_t output_sample_mask_index; struct qreg undef; diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c index b65a82b7f7a..2a68efb7b6b 100644 --- a/src/broadcom/compiler/v3d_nir_lower_io.c +++ b/src/broadcom/compiler/v3d_nir_lower_io.c @@ -28,11 +28,47 @@ * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io * intrinsics into something amenable to the V3D architecture. * - * After moving more and more logic to NIR, all that's left here is fixing up - * addressing on uniform loads. FS input and VS output scalarization is - * handled by nir_lower_io_to_scalar(). + * Most of the work is turning the VS's store_output intrinsics from working + * on a base representing the gallium-level vec4 driver_location to an offset + * within the VPM, and emitting the header that's read by the fixed function + * hardware between the VS and FS. + * + * We also adjust the offsets on uniform loads to be in bytes, since that's + * what we need for indirect addressing with general TMU access. 
*/ +struct v3d_nir_lower_io_state { + int pos_vpm_offset; + int vp_vpm_offset; + int zs_vpm_offset; + int rcp_wc_vpm_offset; + int psiz_vpm_offset; + int varyings_vpm_offset; + + BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + + nir_ssa_def *pos[4]; +}; + +static void +v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *chan) +{ + nir_intrinsic_instr *intr = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output); + nir_ssa_dest_init(&intr->instr, &intr->dest, + 1, intr->dest.ssa.bit_size, NULL); + intr->num_components = 1; + + intr->src[0] = nir_src_for_ssa(chan); + intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0)); + + nir_intrinsic_set_base(intr, base); + nir_intrinsic_set_write_mask(intr, 0x1); + nir_intrinsic_set_component(intr, 0); + + nir_builder_instr_insert(b, &intr->instr); +} + /* Convert the uniform offset to bytes. If it happens to be a constant, * constant-folding will clean up the shift for us. */ @@ -50,9 +86,90 @@ v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b, nir_imm_int(b, 4)))); } +static int +v3d_varying_slot_vpm_offset(struct v3d_compile *c, nir_variable *var, int chan) +{ + int component = var->data.location_frac + chan; + + for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { + struct v3d_varying_slot slot = c->vs_key->fs_inputs[i]; + + if (v3d_slot_get_slot(slot) == var->data.location && + v3d_slot_get_component(slot) == component) { + return i; + } + } + + return -1; +} + +/* Lowers a store_output(gallium driver location) to a series of store_outputs + * with a driver_location equal to the offset in the VPM. 
+ */ +static void +v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, + nir_intrinsic_instr *intr, + struct v3d_nir_lower_io_state *state) +{ + b->cursor = nir_before_instr(&intr->instr); + + int start_comp = nir_intrinsic_component(intr); + nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0], + intr->num_components); + + nir_variable *var = NULL; + nir_foreach_variable(scan_var, &c->s->outputs) { + if (scan_var->data.driver_location != nir_intrinsic_base(intr) || + start_comp < scan_var->data.location_frac || + start_comp >= scan_var->data.location_frac + + glsl_get_components(scan_var->type)) { + continue; + } + var = scan_var; + } + + /* Save off the components of the position for the setup of VPM inputs + * read by fixed function HW. + */ + if (var->data.location == VARYING_SLOT_POS) { + for (int i = 0; i < intr->num_components; i++) { + state->pos[start_comp + i] = nir_channel(b, src, i); + } + } + + /* Just store psiz to the position in the FF header right now. */ + if (var->data.location == VARYING_SLOT_PSIZ && + state->psiz_vpm_offset != -1) { + v3d_nir_store_output(b, state->psiz_vpm_offset, src); + } + + /* Scalarize outputs if it hasn't happened already, since we want to + * schedule each VPM write individually. We can skip any output + * components not read by the FS. 
+ */ + for (int i = 0; i < intr->num_components; i++) { + int vpm_offset = + v3d_varying_slot_vpm_offset(c, var, + i + + start_comp - + var->data.location_frac); + + if (vpm_offset == -1) + continue; + + BITSET_SET(state->varyings_stored, vpm_offset); + + v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset, + nir_channel(b, src, i)); + } + + nir_instr_remove(&intr->instr); +} + static void v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, - struct nir_instr *instr) + struct nir_instr *instr, + struct v3d_nir_lower_io_state *state) { if (instr->type != nir_instr_type_intrinsic) return; @@ -63,33 +180,171 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, v3d_nir_lower_uniform(c, b, intr); break; + case nir_intrinsic_store_output: + if (c->s->info.stage == MESA_SHADER_VERTEX) + v3d_nir_lower_vpm_output(c, b, intr, state); + break; + default: break; } } -static bool -v3d_nir_lower_io_impl(struct v3d_compile *c, nir_function_impl *impl) +/* Remap the output var's .driver_location. This is purely for + * nir_print_shader() so that store_output can map back to a variable name. + */ +static void +v3d_nir_lower_io_update_output_var_base(struct v3d_compile *c, + struct v3d_nir_lower_io_state *state) +{ + nir_foreach_variable_safe(var, &c->s->outputs) { + if (var->data.location == VARYING_SLOT_POS && + state->pos_vpm_offset != -1) { + var->data.driver_location = state->pos_vpm_offset; + continue; + } + + if (var->data.location == VARYING_SLOT_PSIZ && + state->psiz_vpm_offset != -1) { + var->data.driver_location = state->psiz_vpm_offset; + continue; + } + + int vpm_offset = v3d_varying_slot_vpm_offset(c, var, 0); + if (vpm_offset != -1) { + var->data.driver_location = + state->varyings_vpm_offset + vpm_offset; + } else { + /* If we couldn't find a mapping for the var, delete + * it so that its old .driver_location doesn't confuse + * nir_print_shader(). 
+ */ + exec_node_remove(&var->node); + } + } +} + +static void +v3d_nir_setup_vpm_layout(struct v3d_compile *c, + struct v3d_nir_lower_io_state *state) +{ + uint32_t vpm_offset = 0; + + if (c->vs_key->is_coord) { + state->pos_vpm_offset = vpm_offset; + vpm_offset += 4; + } else { + state->pos_vpm_offset = -1; + } + + state->vp_vpm_offset = vpm_offset; + vpm_offset += 2; + + if (!c->vs_key->is_coord) { + state->zs_vpm_offset = vpm_offset++; + state->rcp_wc_vpm_offset = vpm_offset++; + } else { + state->zs_vpm_offset = -1; + state->rcp_wc_vpm_offset = -1; + } + + if (c->vs_key->per_vertex_point_size) + state->psiz_vpm_offset = vpm_offset++; + else + state->psiz_vpm_offset = -1; + + state->varyings_vpm_offset = vpm_offset; + + c->vpm_output_size = vpm_offset + c->vs_key->num_fs_inputs; +} + +static void +v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, + struct v3d_nir_lower_io_state *state) { - nir_builder b; - nir_builder_init(&b, impl); + for (int i = 0; i < 4; i++) { + if (!state->pos[i]) + state->pos[i] = nir_ssa_undef(b, 1, 32); + } + + nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]); + + if (state->pos_vpm_offset != -1) { + for (int i = 0; i < 4; i++) { + v3d_nir_store_output(b, state->pos_vpm_offset + i, + state->pos[i]); + } + } - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) - v3d_nir_lower_io_instr(c, &b, instr); + for (int i = 0; i < 2; i++) { + nir_ssa_def *pos; + nir_ssa_def *scale; + pos = state->pos[i]; + if (i == 0) + scale = nir_load_viewport_x_scale(b); + else + scale = nir_load_viewport_y_scale(b); + pos = nir_fmul(b, pos, scale); + pos = nir_fmul(b, pos, rcp_wc); + pos = nir_f2i32(b, nir_fround_even(b, pos)); + v3d_nir_store_output(b, state->vp_vpm_offset + i, + pos); } - nir_metadata_preserve(impl, nir_metadata_block_index | - nir_metadata_dominance); + if (state->zs_vpm_offset != -1) { + nir_ssa_def *z = state->pos[2]; + z = nir_fmul(b, z, nir_load_viewport_z_scale(b)); + z = nir_fmul(b, z, rcp_wc); + 
z = nir_fadd(b, z, nir_load_viewport_z_offset(b)); + v3d_nir_store_output(b, state->zs_vpm_offset, z); + } + + if (state->rcp_wc_vpm_offset != -1) + v3d_nir_store_output(b, state->rcp_wc_vpm_offset, rcp_wc); - return true; + /* Store 0 to varyings requested by the FS but not stored in the VS. + * This should be undefined behavior, but glsl-routing seems to rely + * on it. + */ + for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { + if (!BITSET_TEST(state->varyings_stored, i)) { + v3d_nir_store_output(b, state->varyings_vpm_offset + i, + nir_imm_int(b, 0)); + } + } } void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c) { + struct v3d_nir_lower_io_state state = { 0 }; + + /* Set up the layout of the VPM outputs. */ + if (s->info.stage == MESA_SHADER_VERTEX) + v3d_nir_setup_vpm_layout(c, &state); + nir_foreach_function(function, s) { - if (function->impl) - v3d_nir_lower_io_impl(c, function->impl); + if (function->impl) { + nir_builder b; + nir_builder_init(&b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) + v3d_nir_lower_io_instr(c, &b, instr, + &state); + } + + nir_block *last = nir_impl_last_block(function->impl); + b.cursor = nir_after_block(last); + if (s->info.stage == MESA_SHADER_VERTEX) + v3d_nir_emit_ff_vpm_outputs(c, &b, &state); + + nir_metadata_preserve(function->impl, + nir_metadata_block_index | + nir_metadata_dominance); + } } + + if (s->info.stage == MESA_SHADER_VERTEX) + v3d_nir_lower_io_update_output_var_base(c, &state); } diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 2f670fc9024..559d449c437 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -566,7 +566,6 @@ vir_compile_init(const struct v3d_compiler *compiler, vir_set_emit_block(c, vir_new_block(c)); c->output_position_index = -1; - c->output_point_size_index = -1; c->output_sample_mask_index = -1; c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer, @@ -695,7 +694,7 @@ 
v3d_vs_set_prog_data(struct v3d_compile *c, * channel). */ prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8; - prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8; + prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8; /* Set us up for shared input/output segments. This is apparently * necessary for our VCM setup to avoid varying corruption. diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 9e765951301..1d388c64fc9 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -534,6 +534,15 @@ system_value("subgroup_id", 1) system_value("local_group_size", 3) system_value("global_invocation_id", 3) system_value("work_dim", 1) +# Driver-specific viewport scale/offset parameters. +# +# VC4 and V3D need to emit a scaled version of the position in the vertex +# shaders for binning, and having system values lets us move the math for that +# into NIR. +system_value("viewport_x_scale", 1) +system_value("viewport_y_scale", 1) +system_value("viewport_z_scale", 1) +system_value("viewport_z_offset", 1) # Blend constant color values. Float values are clamped.# system_value("blend_const_color_r_float", 1)