From 56ed835bffb0e9cd6770a788b6605b84bd54683c Mon Sep 17 00:00:00 2001
From: "Kristian H. Kristensen"
Date: Tue, 22 Oct 2019 17:30:48 -0700
Subject: [PATCH] freedreno/ir3: Extend geometry lowering pass to handle
 tessellation

The VS and TCS pass varyings the same way the VS and GS do. The TCS
then writes the entire patch to a system memory BO, and the TES
eventually reads back from that BO once the TE starts generating
vertices. The TES outputs vertices the same way the VS and GS do,
except when there is a GS as well, in which case the TES passes
varyings to the GS the same way the VS would.

In addition, the TCS needs a little bit of control flow massaging so
that it only runs for valid invocations, and a couple of new
instructions to synchronize with the TE.

Signed-off-by: Kristian H. Kristensen
Acked-by: Eric Anholt
Reviewed-by: Rob Clark
---
 src/freedreno/ir3/ir3_nir.c            |  12 +-
 src/freedreno/ir3/ir3_nir.h            |   5 +-
 src/freedreno/ir3/ir3_nir_lower_tess.c | 511 ++++++++++++++++++++++++-
 3 files changed, 520 insertions(+), 8 deletions(-)

diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index ab092ff1eda..fc90fbe3868 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -189,10 +189,18 @@ ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
 		.lower_tg4_offsets = true,
 	};
 
-	if (key && key->has_gs) {
+	if (key && (key->has_gs || key->tessellation)) {
 		switch (shader->type) {
 		case MESA_SHADER_VERTEX:
-			NIR_PASS_V(s, ir3_nir_lower_vs_to_explicit_io, shader);
+			NIR_PASS_V(s, ir3_nir_lower_to_explicit_io, shader, key->tessellation);
+			break;
+		case MESA_SHADER_TESS_CTRL:
+			NIR_PASS_V(s, ir3_nir_lower_tess_ctrl, shader, key->tessellation);
+			break;
+		case MESA_SHADER_TESS_EVAL:
+			NIR_PASS_V(s, ir3_nir_lower_tess_eval, key->tessellation);
+			if (key->has_gs)
+				NIR_PASS_V(s, ir3_nir_lower_to_explicit_io, shader, key->tessellation);
 			break;
 		case MESA_SHADER_GEOMETRY:
 			NIR_PASS_V(s, ir3_nir_lower_gs, shader);
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index dc693b3f556..a42c8822b4a 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -44,7 +44,10 @@
 int ir3_nir_coord_offset(nir_ssa_def *ssa);
 bool ir3_nir_lower_tex_prefetch(nir_shader *shader);
 
-void ir3_nir_lower_vs_to_explicit_io(nir_shader *shader, struct ir3_shader *s);
+void ir3_nir_lower_to_explicit_io(nir_shader *shader,
+		struct ir3_shader *s, unsigned topology);
+void ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader *s, unsigned topology);
+void ir3_nir_lower_tess_eval(nir_shader *shader, unsigned topology);
 void ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s);
 const nir_shader_compiler_options *
 ir3_get_compiler_options(struct ir3_compiler *compiler);
diff --git a/src/freedreno/ir3/ir3_nir_lower_tess.c b/src/freedreno/ir3/ir3_nir_lower_tess.c
index acbb02d4108..27fc24c1a0d 100644
--- a/src/freedreno/ir3/ir3_nir_lower_tess.c
+++ b/src/freedreno/ir3/ir3_nir_lower_tess.c
@@ -26,6 +26,8 @@
 #include "compiler/nir/nir_builder.h"
 
 struct state {
+	uint32_t topology;
+
 	struct primitive_map {
 		unsigned loc[32];
 		unsigned size[32];
@@ -40,6 +42,9 @@ struct state {
 	nir_variable *vertex_flags_out;
 
 	nir_variable *output_vars[32];
+
+	nir_ssa_def *outer_levels[4];
+	nir_ssa_def *inner_levels[2];
 };
 
 static nir_ssa_def *
@@ -89,13 +94,18 @@ build_local_offset(nir_builder *b, struct state *state,
 	nir_ssa_def *attr_offset;
 	nir_ssa_def *vertex_stride;
 
-	if (b->shader->info.stage == MESA_SHADER_VERTEX) {
+	switch (b->shader->info.stage) {
+	case MESA_SHADER_VERTEX:
+	case MESA_SHADER_TESS_EVAL:
 		vertex_stride = nir_imm_int(b, state->map.stride * 4);
 		attr_offset = nir_imm_int(b, state->map.loc[base] * 4);
-	} else if (b->shader->info.stage == MESA_SHADER_GEOMETRY) {
+		break;
+	case MESA_SHADER_TESS_CTRL:
+	case MESA_SHADER_GEOMETRY:
 		vertex_stride = nir_load_vs_vertex_stride_ir3(b);
 		attr_offset = nir_load_primitive_location_ir3(b, base);
-	} else {
+		break;
+	default:
 		unreachable("bad shader stage");
 	}
 
@@ -212,7 +222,7 @@ local_thread_id(nir_builder *b)
 }
 
 void
-ir3_nir_lower_vs_to_explicit_io(nir_shader *shader, struct ir3_shader *s)
+ir3_nir_lower_to_explicit_io(nir_shader *shader, struct ir3_shader *s, unsigned topology)
 {
 	struct state state = { };
 
@@ -226,7 +236,10 @@ ir3_nir_lower_vs_to_explicit_io(nir_shader *shader, struct ir3_shader *s)
 	nir_builder_init(&b, impl);
 	b.cursor = nir_before_cf_list(&impl->body);
 
-	state.header = nir_load_gs_header_ir3(&b);
+	if (s->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
+		state.header = nir_load_tcs_header_ir3(&b);
+	else
+		state.header = nir_load_gs_header_ir3(&b);
 
 	nir_foreach_block_safe(block, impl)
 		lower_vs_block(block, &b, &state);
@@ -237,6 +250,494 @@ ir3_nir_lower_vs_to_explicit_io(nir_shader *shader, struct ir3_shader *s)
 	s->output_size = state.map.stride;
 }
 
+static nir_ssa_def *
+build_per_vertex_offset(nir_builder *b, struct state *state,
+		nir_ssa_def *vertex, nir_ssa_def *offset, nir_variable *var)
+{
+	nir_ssa_def *primitive_id = nir_load_primitive_id(b);
+	nir_ssa_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
+	nir_ssa_def *patch_offset = nir_imul24(b, primitive_id, patch_stride);
+	nir_ssa_def *attr_offset;
+	int loc = var->data.driver_location;
+
+	switch (b->shader->info.stage) {
+	case MESA_SHADER_TESS_CTRL:
+		attr_offset = nir_imm_int(b, state->map.loc[loc]);
+		break;
+	case MESA_SHADER_TESS_EVAL:
+		attr_offset = nir_load_primitive_location_ir3(b, loc);
+		break;
+	default:
+		unreachable("bad shader stage");
+	}
+
+	nir_ssa_def *attr_stride = nir_imm_int(b, state->map.size[loc]);
+	nir_ssa_def *vertex_offset = nir_imul24(b, vertex, attr_stride);
+
+	return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset),
+			nir_iadd(b, vertex_offset, nir_ishl(b, offset, nir_imm_int(b, 2))));
+}
+
+static nir_ssa_def *
+build_patch_offset(nir_builder *b, struct state *state, nir_ssa_def *offset, nir_variable *var)
+{
+	debug_assert(var && var->data.patch);
+
+	return build_per_vertex_offset(b, state, nir_imm_int(b, 0), offset, var);
+}
+
+static nir_ssa_def *
+build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state)
+{
+	uint32_t inner_levels, outer_levels;
+	switch (state->topology) {
+	case IR3_TESS_TRIANGLES:
+		inner_levels = 1;
+		outer_levels = 3;
+		break;
+	case IR3_TESS_QUADS:
+		inner_levels = 2;
+		outer_levels = 4;
+		break;
+	case IR3_TESS_ISOLINES:
+		inner_levels = 0;
+		outer_levels = 2;
+		break;
+	default:
+		unreachable("bad");
+	}
+
+	const uint32_t patch_stride = 1 + inner_levels + outer_levels;
+
+	nir_ssa_def *primitive_id = nir_load_primitive_id(b);
+
+	nir_ssa_def *patch_offset = nir_imul24(b, primitive_id, nir_imm_int(b, patch_stride));
+
+	uint32_t offset;
+	switch (slot) {
+	case VARYING_SLOT_TESS_LEVEL_OUTER:
+		/* There's some kind of header dword; tess levels start at index 1. */
+		offset = 1;
+		break;
+	case VARYING_SLOT_TESS_LEVEL_INNER:
+		offset = 1 + outer_levels;
+		break;
+	default:
+		unreachable("bad");
+	}
+
+	return nir_iadd(b, patch_offset, nir_imm_int(b, offset));
+}
+
+static void
+lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
+{
+	nir_foreach_instr_safe(instr, block) {
+		if (instr->type != nir_instr_type_intrinsic)
+			continue;
+
+		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+		switch (intr->intrinsic) {
+		case nir_intrinsic_load_invocation_id:
+			b->cursor = nir_before_instr(&intr->instr);
+
+			nir_ssa_def *invocation_id = build_invocation_id(b, state);
+			nir_ssa_def_rewrite_uses(&intr->dest.ssa,
+					nir_src_for_ssa(invocation_id));
+			nir_instr_remove(&intr->instr);
+			break;
+
+		case nir_intrinsic_barrier:
+			/* Hull shaders dispatch 32 wide so an entire patch will always
+			 * fit in a single warp and execute in lock-step. Consequently,
+			 * we don't need to do anything for TCS barriers, so just remove
+			 * the intrinsic. Otherwise we'd emit an actual barrier
+			 * instruction, which would deadlock.
+			 */
+			nir_instr_remove(&intr->instr);
+			break;
+
+		case nir_intrinsic_load_per_vertex_output: {
+			// src[] = { vertex, offset }.
+
+			b->cursor = nir_before_instr(&intr->instr);
+
+			nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+			nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+			nir_ssa_def *offset = build_per_vertex_offset(b, state,
+					intr->src[0].ssa, intr->src[1].ssa, var);
+
+			replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
+			break;
+		}
+
+		case nir_intrinsic_store_per_vertex_output: {
+			// src[] = { value, vertex, offset }.
+
+			b->cursor = nir_before_instr(&intr->instr);
+
+			nir_ssa_def *value = intr->src[0].ssa;
+			nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+			nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+			nir_ssa_def *offset = build_per_vertex_offset(b, state,
+					intr->src[1].ssa, intr->src[2].ssa, var);
+
+			nir_intrinsic_instr *store =
+				replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value, address,
+						nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr))));
+
+			nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(intr));
+
+			break;
+		}
+
+		case nir_intrinsic_load_per_vertex_input: {
+			// src[] = { vertex, offset }.
+
+			b->cursor = nir_before_instr(&intr->instr);
+
+			nir_ssa_def *offset = build_local_offset(b, state,
+					intr->src[0].ssa, // this is typically gl_InvocationID
+					nir_intrinsic_base(intr),
+					intr->src[1].ssa);
+
+			replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL, NULL);
+			break;
+		}
+
+		case nir_intrinsic_load_tess_level_inner:
+		case nir_intrinsic_load_tess_level_outer: {
+			b->cursor = nir_before_instr(&intr->instr);
+
+			gl_varying_slot slot;
+			if (intr->intrinsic == nir_intrinsic_load_tess_level_inner)
+				slot = VARYING_SLOT_TESS_LEVEL_INNER;
+			else
+				slot = VARYING_SLOT_TESS_LEVEL_OUTER;
+
+			nir_ssa_def *address = nir_load_tess_factor_base_ir3(b);
+			nir_ssa_def *offset = build_tessfactor_base(b, slot, state);
+
+			replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
+			break;
+		}
+
+		case nir_intrinsic_load_output: {
+			// src[] = { offset }.
+
+			nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+
+			b->cursor = nir_before_instr(&intr->instr);
+
+			nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+			nir_ssa_def *offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+
+			replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
+			break;
+		}
+
+		case nir_intrinsic_store_output: {
+			// src[] = { value, offset }.
+
+			/* write patch output to bo */
+
+			nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+
+			nir_ssa_def **levels = NULL;
+			if (var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
+				levels = state->outer_levels;
+			else if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER)
+				levels = state->inner_levels;
+
+			b->cursor = nir_before_instr(&intr->instr);
+
+			if (levels) {
+				for (int i = 0; i < 4; i++)
+					if (nir_intrinsic_write_mask(intr) & (1 << i))
+						levels[i] = nir_channel(b, intr->src[0].ssa, i);
+				nir_instr_remove(&intr->instr);
+			} else {
+				nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+				nir_ssa_def *offset = build_patch_offset(b, state, intr->src[1].ssa, var);
+
+				debug_assert(nir_intrinsic_component(intr) == 0);
+
+				nir_intrinsic_instr *store =
+					replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
+							intr->src[0].ssa, address, offset);
+
+				nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(intr));
+			}
+			break;
+		}
+
+		default:
+			break;
+		}
+	}
+}
+
+static void
+emit_tess_epilogue(nir_builder *b, struct state *state)
+{
+	nir_ssa_def *tessfactor_address = nir_load_tess_factor_base_ir3(b);
+	nir_ssa_def *levels[2];
+
+	/* Emit the epilogue that actually writes out the tessellation levels
+	 * to the BOs.
+	 */
+	switch (state->topology) {
+	case IR3_TESS_TRIANGLES:
+		levels[0] = nir_vec4(b, state->outer_levels[0], state->outer_levels[1],
+				state->outer_levels[2], state->inner_levels[0]);
+		levels[1] = NULL;
+		break;
+	case IR3_TESS_QUADS:
+		levels[0] = nir_vec4(b, state->outer_levels[0], state->outer_levels[1],
+				state->outer_levels[2], state->outer_levels[3]);
+		levels[1] = nir_vec2(b, state->inner_levels[0], state->inner_levels[1]);
+		break;
+	case IR3_TESS_ISOLINES:
+		levels[0] = nir_vec2(b, state->outer_levels[0], state->outer_levels[1]);
+		levels[1] = NULL;
+		break;
+	default:
+		unreachable("nope");
+	}
+
+	nir_ssa_def *offset = build_tessfactor_base(b, VARYING_SLOT_TESS_LEVEL_OUTER, state);
+
+	nir_intrinsic_instr *store =
+		nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3);
+
+	store->src[0] = nir_src_for_ssa(levels[0]);
+	store->src[1] = nir_src_for_ssa(tessfactor_address);
+	store->src[2] = nir_src_for_ssa(offset);
+	nir_builder_instr_insert(b, &store->instr);
+	store->num_components = levels[0]->num_components;
+	nir_intrinsic_set_write_mask(store, (1 << levels[0]->num_components) - 1);
+
+	if (levels[1]) {
+		store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3);
+		offset = nir_iadd(b, offset, nir_imm_int(b, levels[0]->num_components));
+
+		store->src[0] = nir_src_for_ssa(levels[1]);
+		store->src[1] = nir_src_for_ssa(tessfactor_address);
+		store->src[2] = nir_src_for_ssa(offset);
+		nir_builder_instr_insert(b, &store->instr);
+		store->num_components = levels[1]->num_components;
+		nir_intrinsic_set_write_mask(store, (1 << levels[1]->num_components) - 1);
+	}
+
+	/* Finally, insert the endpatch instruction, maybe signalling the tess
+	 * engine that another primitive is ready?
+	 */
+ */ + + nir_intrinsic_instr *end_patch = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_end_patch_ir3); + nir_builder_instr_insert(b, &end_patch->instr); +} + +void +ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader *s, unsigned topology) +{ + struct state state = { .topology = topology }; + + if (shader_debug_enabled(shader->info.stage)) { + fprintf(stderr, "NIR (before tess lowering) for %s shader:\n", + _mesa_shader_stage_to_string(shader->info.stage)); + nir_print_shader(shader, stderr); + } + + build_primitive_map(shader, &state.map, &shader->outputs); + memcpy(s->output_loc, state.map.loc, sizeof(s->output_loc)); + s->output_size = state.map.stride; + + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + assert(impl); + + nir_builder b; + nir_builder_init(&b, impl); + b.cursor = nir_before_cf_list(&impl->body); + + state.header = nir_load_tcs_header_ir3(&b); + + nir_foreach_block_safe(block, impl) + lower_tess_ctrl_block(block, &b, &state); + + /* Now move the body of the TCS into a conditional: + * + * if (gl_InvocationID < num_vertices) + * // body + * + */ + + nir_cf_list body; + nir_cf_extract(&body, nir_before_cf_list(&impl->body), + nir_after_cf_list(&impl->body)); + + b.cursor = nir_after_cf_list(&impl->body); + + /* Re-emit the header, since the old one got moved into the if branch */ + state.header = nir_load_tcs_header_ir3(&b); + nir_ssa_def *iid = build_invocation_id(&b, &state); + + const uint32_t nvertices = shader->info.tess.tcs_vertices_out; + nir_ssa_def *cond = nir_ult(&b, iid, nir_imm_int(&b, nvertices)); + + nir_if *nif = nir_push_if(&b, cond); + + nir_cf_reinsert(&body, b.cursor); + + b.cursor = nir_after_cf_list(&nif->then_list); + + /* Insert conditional exit for threads invocation id != 0 */ + nir_ssa_def *iid0_cond = nir_ieq(&b, iid, nir_imm_int(&b, 0)); + nir_intrinsic_instr *cond_end = + nir_intrinsic_instr_create(shader, nir_intrinsic_cond_end_ir3); + cond_end->src[0] = nir_src_for_ssa(iid0_cond); + nir_builder_instr_insert(&b, &cond_end->instr); + + emit_tess_epilouge(&b, &state); + + nir_pop_if(&b, nif); + + nir_metadata_preserve(impl, 0); +} + + +static void +lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state) +{ + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_tess_coord: { + b->cursor = nir_after_instr(&intr->instr); + nir_ssa_def *x = nir_channel(b, &intr->dest.ssa, 0); + nir_ssa_def *y = nir_channel(b, &intr->dest.ssa, 1); + nir_ssa_def *z; + + if (state->topology == IR3_TESS_TRIANGLES) + z = nir_fsub(b, nir_fsub(b, nir_imm_float(b, 1.0f), y), x); + else + z = nir_imm_float(b, 0.0f); + + nir_ssa_def *coord = nir_vec3(b, x, y, z); + + nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, + nir_src_for_ssa(coord), + b->cursor.instr); + break; + } + + case nir_intrinsic_load_per_vertex_input: { + // src[] = { vertex, offset }. 
+
+			b->cursor = nir_before_instr(&intr->instr);
+
+			nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+			nir_variable *var = get_var(&b->shader->inputs, nir_intrinsic_base(intr));
+			nir_ssa_def *offset = build_per_vertex_offset(b, state,
+					intr->src[0].ssa, intr->src[1].ssa, var);
+
+			replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
+			break;
+		}
+
+		case nir_intrinsic_load_tess_level_inner:
+		case nir_intrinsic_load_tess_level_outer: {
+			b->cursor = nir_before_instr(&intr->instr);
+
+			gl_varying_slot slot;
+			if (intr->intrinsic == nir_intrinsic_load_tess_level_inner)
+				slot = VARYING_SLOT_TESS_LEVEL_INNER;
+			else
+				slot = VARYING_SLOT_TESS_LEVEL_OUTER;
+
+			nir_ssa_def *address = nir_load_tess_factor_base_ir3(b);
+			nir_ssa_def *offset = build_tessfactor_base(b, slot, state);
+
+			/* Loading across a vec4 (16b) memory boundary is problematic
+			 * if we don't use components from the second vec4. The tess
+			 * levels aren't guaranteed to be vec4 aligned and we don't
+			 * know which levels are actually used, so we load each
+			 * component individually.
+			 */
+			nir_ssa_def *levels[4];
+			for (unsigned i = 0; i < intr->num_components; i++) {
+				nir_intrinsic_instr *new_intr =
+					nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_global_ir3);
+
+				new_intr->src[0] = nir_src_for_ssa(address);
+				new_intr->src[1] = nir_src_for_ssa(nir_iadd(b, offset, nir_imm_int(b, i)));
+				new_intr->num_components = 1;
+				nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, 1, 32, NULL);
+				nir_builder_instr_insert(b, &new_intr->instr);
+				levels[i] = &new_intr->dest.ssa;
+			}
+
+			nir_ssa_def *v = nir_vec(b, levels, intr->num_components);
+
+			nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(v));
+
+			nir_instr_remove(&intr->instr);
+			break;
+		}
+
+		case nir_intrinsic_load_input: {
+			// src[] = { offset }.
+
+			nir_variable *var = get_var(&b->shader->inputs, nir_intrinsic_base(intr));
+
+			debug_assert(var->data.patch);
+
+			b->cursor = nir_before_instr(&intr->instr);
+
+			nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+			nir_ssa_def *offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+
+			replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
+			break;
+		}
+
+		default:
+			break;
+		}
+	}
+}
+
+void
+ir3_nir_lower_tess_eval(nir_shader *shader, unsigned topology)
+{
+	struct state state = { .topology = topology };
+
+	if (shader_debug_enabled(shader->info.stage)) {
+		fprintf(stderr, "NIR (before tess lowering) for %s shader:\n",
+				_mesa_shader_stage_to_string(shader->info.stage));
+		nir_print_shader(shader, stderr);
+	}
+
+	/* Build map of inputs so we have the sizes. */
+	build_primitive_map(shader, &state.map, &shader->inputs);
+
+	nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+	assert(impl);
+
+	nir_builder b;
+	nir_builder_init(&b, impl);
+
+	nir_foreach_block_safe(block, impl)
+		lower_tess_eval_block(block, &b, &state);
+
+	nir_metadata_preserve(impl, 0);
+}
+
 static void
 lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
 {
-- 
2.30.2
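
P.S. For anyone sanity-checking the offset math in review: the pass assumes
the tess factor BO holds 1 + outer_levels + inner_levels dwords per patch (a
header dword, then the outer levels, then the inner levels), and the tess
param BO is sliced per patch, then per attribute, then per vertex. Below is a
minimal standalone C sketch of that same arithmetic; the helper names are
made up for illustration and plain integers stand in for the NIR SSA values,
with all offsets in dwords as in the pass.

#include <stdint.h>

/* Mirrors build_tessfactor_base(): dword offset of a patch's first outer
 * (or inner) tess level in the tess factor BO. Per-topology level counts
 * match the pass: triangles outer=3/inner=1, quads 4/2, isolines 2/0. */
static uint32_t
tessfactor_offset(uint32_t primitive_id, uint32_t outer_levels,
		  uint32_t inner_levels, int inner)
{
	uint32_t patch_stride = 1 + inner_levels + outer_levels;
	uint32_t base = primitive_id * patch_stride + 1; /* skip header dword */
	return inner ? base + outer_levels : base;
}

/* Mirrors build_per_vertex_offset(): dword offset of a vertex's attribute
 * in the tess param BO. The NIR indirect offset is in vec4 slots, hence
 * the * 4 (the pass uses ishl by 2). */
static uint32_t
param_offset(uint32_t primitive_id, uint32_t patch_stride,
	     uint32_t attr_loc, uint32_t attr_stride,
	     uint32_t vertex, uint32_t offset_vec4)
{
	return primitive_id * patch_stride + attr_loc +
	       vertex * attr_stride + offset_vec4 * 4;
}

For example, with triangles (outer = 3, inner = 1) the per-patch stride is 5
dwords, so patch N's outer levels land at dwords 5N+1..5N+3 and its inner
level at 5N+4, which are exactly the slots emit_tess_epilogue() fills with
its single vec4 store.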