X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Ffreedreno%2Fir3%2Fir3_nir_lower_tess.c;h=44b2921dcb7b97742ebc322442444f1968bed970;hb=2e1df6a17ff82c4a456caa8be4bfae1fac009b6a;hp=056b009ef752f38ec1dfdea06cb3898a58770b77;hpb=e40b11bbcb02dde1a8f989ca6545e22414c6f4ce;p=mesa.git

diff --git a/src/freedreno/ir3/ir3_nir_lower_tess.c b/src/freedreno/ir3/ir3_nir_lower_tess.c
index 056b009ef75..44b2921dcb7 100644
--- a/src/freedreno/ir3/ir3_nir_lower_tess.c
+++ b/src/freedreno/ir3/ir3_nir_lower_tess.c
@@ -38,13 +38,14 @@ struct state {
    nir_variable *vertex_count_var;
    nir_variable *emitted_vertex_var;
-   nir_variable *vertex_flags_var;
    nir_variable *vertex_flags_out;
 
-   nir_variable *output_vars[32];
+   struct exec_list old_outputs;
+   struct exec_list new_outputs;
+   struct exec_list emit_outputs;
 
-   nir_ssa_def *outer_levels[4];
-   nir_ssa_def *inner_levels[2];
+   /* tess ctrl shader on a650 gets the local primitive id at different bits: */
+   unsigned local_primitive_id_start;
 };
 
 static nir_ssa_def *
@@ -69,13 +70,13 @@ build_vertex_id(nir_builder *b, struct state *state)
 static nir_ssa_def *
 build_local_primitive_id(nir_builder *b, struct state *state)
 {
-   return bitfield_extract(b, state->header, 0, 63);
+   return bitfield_extract(b, state->header, state->local_primitive_id_start, 63);
 }
 
 static nir_variable *
-get_var(struct exec_list *list, int driver_location)
+get_var(nir_shader *shader, nir_variable_mode mode, int driver_location)
 {
-   nir_foreach_variable(v, list) {
+   nir_foreach_variable_with_modes (v, shader, mode) {
       if (v->data.driver_location == driver_location) {
         return v;
      }
@@ -84,6 +85,13 @@ get_var(struct exec_list *list, int driver_location)
    return NULL;
 }
 
+static bool
+is_tess_levels(nir_variable *var)
+{
+   return (var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER ||
+         var->data.location == VARYING_SLOT_TESS_LEVEL_INNER);
+}
+
 static nir_ssa_def *
 build_local_offset(nir_builder *b, struct state *state,
       nir_ssa_def *vertex, uint32_t base, nir_ssa_def *offset)
@@ -145,9 +153,9 @@ replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
 }
 
 static void
-build_primitive_map(nir_shader *shader, struct primitive_map *map, struct exec_list *list)
+build_primitive_map(nir_shader *shader, nir_variable_mode mode, struct primitive_map *map)
 {
-   nir_foreach_variable(var, list) {
+   nir_foreach_variable_with_modes (var, shader, mode) {
       switch (var->data.location) {
       case VARYING_SLOT_TESS_LEVEL_OUTER:
       case VARYING_SLOT_TESS_LEVEL_INNER:
@@ -165,7 +173,7 @@ build_primitive_map(nir_shader *shader, struct primitive_map *map, struct exec_l
    for (uint32_t i = 0; i < ARRAY_SIZE(map->size); i++) {
       if (map->size[i] == 0)
          continue;
-      nir_variable *var = get_var(list, i);
+      nir_variable *var = get_var(shader, mode, i);
 
       map->loc[i] = loc;
       loc += map->size[i];
@@ -179,9 +187,9 @@ build_primitive_map(nir_shader *shader, struct primitive_map *map, struct exec_l
 }
 
 static void
-lower_vs_block(nir_block *block, nir_builder *b, struct state *state)
+lower_block_to_explicit_output(nir_block *block, nir_builder *b, struct state *state)
 {
-   nir_foreach_instr_safe(instr, block) {
+   nir_foreach_instr_safe (instr, block) {
       if (instr->type != nir_instr_type_intrinsic)
          continue;
 
@@ -191,6 +199,13 @@ lower_vs_block(nir_block *block, nir_builder *b, struct state *state)
       case nir_intrinsic_store_output: {
         // src[] = { value, offset }.
 
+        /* nir_lower_io_to_temporaries replaces all access to output
+         * variables with temp variables and then emits a nir_copy_var at
+         * the end of the shader. Thus, we should always get a full wrmask
+         * here.
+         */
+        assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
         b->cursor = nir_instr_remove(&intr->instr);
 
         nir_ssa_def *vertex_id = build_vertex_id(b, state);
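
The power-of-two assertion above encodes "the writemask is dense": for an n-component store the full mask is the low n bits, so mask + 1 is a single set bit. A minimal standalone sketch of the same check (hypothetical helper, standing in for the util_is_power_of_two_nonzero() used here):

   #include <stdbool.h>
   #include <stdint.h>

   /* True iff wrmask covers components 0..n-1 with no holes: a full mask
    * (1u << n) - 1 becomes 1u << n after the +1, a power of two. A sparse
    * mask such as 0b0101 becomes 0b0110 and fails. */
   static bool
   writemask_is_full(uint32_t wrmask)
   {
      uint32_t x = wrmask + 1;
      return x != 0 && (x & (x - 1)) == 0;
   }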
@@ -199,10 +214,8 @@ lower_vs_block(nir_block *block, nir_builder *b, struct state *state)
 
         nir_intrinsic_instr *store =
            nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_shared_ir3);
-        nir_intrinsic_set_write_mask(store, MASK(intr->num_components));
         store->src[0] = nir_src_for_ssa(intr->src[0].ssa);
         store->src[1] = nir_src_for_ssa(offset);
-        store->num_components = intr->num_components;
 
         nir_builder_instr_insert(b, &store->instr);
@@ -222,12 +235,13 @@ local_thread_id(nir_builder *b)
 }
 
 void
-ir3_nir_lower_to_explicit_io(nir_shader *shader, struct ir3_shader *s, unsigned topology)
+ir3_nir_lower_to_explicit_output(nir_shader *shader, struct ir3_shader_variant *v,
+      unsigned topology)
 {
    struct state state = { };
 
-   build_primitive_map(shader, &state.map, &shader->outputs);
-   memcpy(s->output_loc, state.map.loc, sizeof(s->output_loc));
+   build_primitive_map(shader, nir_var_shader_out, &state.map);
+   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
 
    nir_function_impl *impl = nir_shader_get_entrypoint(shader);
    assert(impl);
@@ -236,20 +250,88 @@ ir3_nir_lower_to_explicit_io(nir_shader *shader, struct ir3_shader *s, unsigned
    nir_builder_init(&b, impl);
    b.cursor = nir_before_cf_list(&impl->body);
 
-   if (s->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
+   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
       state.header = nir_load_tcs_header_ir3(&b);
    else
       state.header = nir_load_gs_header_ir3(&b);
 
-   nir_foreach_block_safe(block, impl)
-      lower_vs_block(block, &b, &state);
+   nir_foreach_block_safe (block, impl)
+      lower_block_to_explicit_output(block, &b, &state);
 
    nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
 
-   s->output_size = state.map.stride;
+   v->output_size = state.map.stride;
+}
+
+
+static void
+lower_block_to_explicit_input(nir_block *block, nir_builder *b, struct state *state)
+{
+   nir_foreach_instr_safe (instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_per_vertex_input: {
+        // src[] = { vertex, offset }.
+
+        b->cursor = nir_before_instr(&intr->instr);
+
+        nir_ssa_def *offset = build_local_offset(b, state,
+              intr->src[0].ssa, // this is typically gl_InvocationID
+              nir_intrinsic_base(intr),
+              intr->src[1].ssa);
+
+        replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL, NULL);
+        break;
+      }
+
+      case nir_intrinsic_load_invocation_id: {
+        b->cursor = nir_before_instr(&intr->instr);
+
+        nir_ssa_def *iid = build_invocation_id(b, state);
+        nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(iid));
+        nir_instr_remove(&intr->instr);
+        break;
+      }
+
+      default:
+        break;
+      }
+   }
+}
+
+void
+ir3_nir_lower_to_explicit_input(nir_shader *shader, struct ir3_compiler *compiler)
+{
+   struct state state = { };
+
+   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
+    * HS uses a different primitive id, which starts at bit 16 in the header
+    */
+   if (shader->info.stage == MESA_SHADER_TESS_CTRL && compiler->tess_use_shared)
+      state.local_primitive_id_start = 16;
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+   assert(impl);
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   b.cursor = nir_before_cf_list(&impl->body);
+
+   if (shader->info.stage == MESA_SHADER_GEOMETRY)
+      state.header = nir_load_gs_header_ir3(&b);
+   else
+      state.header = nir_load_tcs_header_ir3(&b);
+
+   nir_foreach_block_safe (block, impl)
+      lower_block_to_explicit_input(block, &b, &state);
 }
 
+
 static nir_ssa_def *
 build_per_vertex_offset(nir_builder *b, struct state *state,
       nir_ssa_def *vertex, nir_ssa_def *offset, nir_variable *var)
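
For reference, the header decode that build_local_primitive_id() performs is a plain shift-and-mask; a scalar sketch (hypothetical helper; the 63 mask and the bit-16 start are taken from the code in these hunks):

   #include <stdint.h>

   /* Mirrors bitfield_extract(b, header, start, 63): the local primitive
    * id sits at header[start+5:start]. start is 0 normally, or 16 for a
    * TCS when VS and HS are linked with stl/ldl (tess_use_shared). */
   static uint32_t
   local_primitive_id(uint32_t header, unsigned start)
   {
      return (header >> start) & 63;
   }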
@@ -286,26 +368,32 @@ build_patch_offset(nir_builder *b, struct state *state, nir_ssa_def *offset, nir
    return build_per_vertex_offset(b, state, nir_imm_int(b, 0), offset, var);
 }
 
-static nir_ssa_def *
-build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state)
+static void
+tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
 {
-   uint32_t inner_levels, outer_levels;
    switch (state->topology) {
    case IR3_TESS_TRIANGLES:
-      inner_levels = 1;
-      outer_levels = 3;
+      *inner = 1;
+      *outer = 3;
       break;
    case IR3_TESS_QUADS:
-      inner_levels = 2;
-      outer_levels = 4;
+      *inner = 2;
+      *outer = 4;
       break;
    case IR3_TESS_ISOLINES:
-      inner_levels = 0;
-      outer_levels = 2;
+      *inner = 0;
+      *outer = 2;
       break;
    default:
      unreachable("bad");
    }
+}
+
+static nir_ssa_def *
+build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state)
+{
+   uint32_t inner_levels, outer_levels;
+   tess_level_components(state, &inner_levels, &outer_levels);
 
    const uint32_t patch_stride = 1 + inner_levels + outer_levels;
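
The per-patch footprint in the tess factor BO follows directly from these counts; a sketch of the resulting strides (plain C; IR3_TESS_* are the enums used throughout this file, and the purpose of the extra leading slot is not spelled out here):

   /* Per-patch slot count in the tess factor BO: one extra slot plus the
    * inner and outer level components of the active topology. */
   static uint32_t
   tessfactor_patch_stride(unsigned topology)
   {
      switch (topology) {
      case IR3_TESS_TRIANGLES: return 1 + 1 + 3; /* 5 */
      case IR3_TESS_QUADS:     return 1 + 2 + 4; /* 7 */
      case IR3_TESS_ISOLINES:  return 1 + 0 + 2; /* 3 */
      default:                 return 0;
      }
   }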
@@ -332,22 +420,13 @@ build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state)
 static void
 lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 {
-   nir_foreach_instr_safe(instr, block) {
+   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;
 
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
      switch (intr->intrinsic) {
-      case nir_intrinsic_load_invocation_id:
-        b->cursor = nir_before_instr(&intr->instr);
-
-        nir_ssa_def *invocation_id = build_invocation_id(b, state);
-        nir_ssa_def_rewrite_uses(&intr->dest.ssa,
-              nir_src_for_ssa(invocation_id));
-        nir_instr_remove(&intr->instr);
-        break;
-
       case nir_intrinsic_control_barrier:
       case nir_intrinsic_memory_barrier_tcs_patch:
         /* Hull shaders dispatch 32 wide so an entire patch will always
@@ -365,7 +444,7 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 
        b->cursor = nir_before_instr(&intr->instr);
 
        nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+       nir_variable *var = get_var(b->shader, nir_var_shader_out, nir_intrinsic_base(intr));
        nir_ssa_def *offset = build_per_vertex_offset(b, state,
              intr->src[0].ssa, intr->src[1].ssa, var);
 
@@ -378,97 +457,94 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 
        b->cursor = nir_before_instr(&intr->instr);
 
+       /* sparse writemask not supported */
+       assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
        nir_ssa_def *value = intr->src[0].ssa;
        nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+       nir_variable *var = get_var(b->shader, nir_var_shader_out, nir_intrinsic_base(intr));
        nir_ssa_def *offset = build_per_vertex_offset(b, state,
              intr->src[1].ssa, intr->src[2].ssa, var);
 
-       nir_intrinsic_instr *store =
-          replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value, address,
-                nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr))));
-
-       nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(intr));
+       replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value, address,
+             nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr))));
        break;
      }
 
-     case nir_intrinsic_load_per_vertex_input: {
-       // src[] = { vertex, offset }.
-
-       b->cursor = nir_before_instr(&intr->instr);
-
-       nir_ssa_def *offset = build_local_offset(b, state,
-             intr->src[0].ssa, // this is typically gl_InvocationID
-             nir_intrinsic_base(intr),
-             intr->src[1].ssa);
+     case nir_intrinsic_load_output: {
+       // src[] = { offset }.
 
-       replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL, NULL);
-       break;
-     }
+       nir_variable *var = get_var(b->shader, nir_var_shader_out, nir_intrinsic_base(intr));
 
-     case nir_intrinsic_load_tess_level_inner:
-     case nir_intrinsic_load_tess_level_outer: {
        b->cursor = nir_before_instr(&intr->instr);
 
-       gl_varying_slot slot;
-       if (intr->intrinsic == nir_intrinsic_load_tess_level_inner)
-          slot = VARYING_SLOT_TESS_LEVEL_INNER;
-       else
-          slot = VARYING_SLOT_TESS_LEVEL_OUTER;
+       nir_ssa_def *address, *offset;
 
-       nir_ssa_def *address = nir_load_tess_factor_base_ir3(b);
-       nir_ssa_def *offset = build_tessfactor_base(b, slot, state);
+       /* note if vectorization of the tess level loads ever happens:
+        * "ldg" across 16-byte boundaries can behave incorrectly if results
+        * are never used. most likely some issue with (sy) not properly
+        * syncing with values coming from a second memory transaction.
+        */
+       if (is_tess_levels(var)) {
+          assert(intr->dest.ssa.num_components == 1);
+          address = nir_load_tess_factor_base_ir3(b);
+          offset = build_tessfactor_base(b, var->data.location, state);
+       } else {
+          address = nir_load_tess_param_base_ir3(b);
+          offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+       }
 
        replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
        break;
      }
 
-     case nir_intrinsic_load_output: {
-       // src[] = { offset }.
+     case nir_intrinsic_store_output: {
+       // src[] = { value, offset }.
+
+       /* write patch output to bo */
 
-       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+       nir_variable *var = get_var(b->shader, nir_var_shader_out, nir_intrinsic_base(intr));
 
        b->cursor = nir_before_instr(&intr->instr);
 
-       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-       nir_ssa_def *offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+       /* sparse writemask not supported */
+       assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
 
-       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
-       break;
-     }
+       if (is_tess_levels(var)) {
+          /* tess levels are defined as float[4] and float[2], but the
+           * tess factor BO has smaller sizes for tris/isolines, so we
+           * have to discard any writes beyond the number of components
+           * for inner/outer levels */
+          uint32_t inner_levels, outer_levels, levels;
+          tess_level_components(state, &inner_levels, &outer_levels);
 
-     case nir_intrinsic_store_output: {
-       // src[] = { value, offset }.
+          if (var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
+             levels = outer_levels;
+          else
+             levels = inner_levels;
 
-       /* write patch output to bo */
+          assert(intr->src[0].ssa->num_components == 1);
 
-       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+          nir_ssa_def *offset =
+             nir_iadd_imm(b, intr->src[1].ssa, nir_intrinsic_component(intr));
 
-       nir_ssa_def **levels = NULL;
-       if (var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
-          levels = state->outer_levels;
-       else if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER)
-          levels = state->inner_levels;
+          nir_if *nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
 
-       b->cursor = nir_before_instr(&intr->instr);
+          replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
+                intr->src[0].ssa,
+                nir_load_tess_factor_base_ir3(b),
+                nir_iadd(b, offset, build_tessfactor_base(b, var->data.location, state)));
 
-       if (levels) {
-          for (int i = 0; i < 4; i++)
-             if (nir_intrinsic_write_mask(intr) & (1 << i))
-                levels[i] = nir_channel(b, intr->src[0].ssa, i);
-          nir_instr_remove(&intr->instr);
+          nir_pop_if(b, nif);
        } else {
          nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
          nir_ssa_def *offset = build_patch_offset(b, state, intr->src[1].ssa, var);
 
          debug_assert(nir_intrinsic_component(intr) == 0);
 
-         nir_intrinsic_instr *store =
-            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
-                  intr->src[0].ssa, address, offset);
-
-         nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(intr));
+         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
+               intr->src[0].ssa, address, offset);
        }
        break;
      }
 
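
In scalar terms, the nir_push_if/nir_pop_if pair built above amounts to a bounds check on the component index; a sketch (plain C stand-in for the generated NIR, names invented):

   /* Clamped tess-factor write: gl_TessLevelOuter is declared float[4]
    * and gl_TessLevelInner float[2], but e.g. triangles only have 3 outer
    * and 1 inner component in the BO, so out-of-range writes are dropped. */
   static void
   store_tess_level(float *factor_bo, uint32_t base, uint32_t component,
                    uint32_t levels, float value)
   {
      if (component < levels)
         factor_bo[base + component] = value;
   }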
- */ - switch (state->topology) { - case IR3_TESS_TRIANGLES: - levels[0] = nir_vec4(b, state->outer_levels[0], state->outer_levels[1], - state->outer_levels[2], state->inner_levels[0]); - levels[1] = NULL; - break; - case IR3_TESS_QUADS: - levels[0] = nir_vec4(b, state->outer_levels[0], state->outer_levels[1], - state->outer_levels[2], state->outer_levels[3]); - levels[1] = nir_vec2(b, state->inner_levels[0], state->inner_levels[1]); - break; - case IR3_TESS_ISOLINES: - levels[0] = nir_vec2(b, state->outer_levels[0], state->outer_levels[1]); - levels[1] = NULL; - break; - default: - unreachable("nope"); - } - - nir_ssa_def *offset = build_tessfactor_base(b, VARYING_SLOT_TESS_LEVEL_OUTER, state); - - nir_intrinsic_instr *store = - nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3); - - store->src[0] = nir_src_for_ssa(levels[0]); - store->src[1] = nir_src_for_ssa(tessfactor_address); - store->src[2] = nir_src_for_ssa(offset); - nir_builder_instr_insert(b, &store->instr); - store->num_components = levels[0]->num_components; - nir_intrinsic_set_write_mask(store, (1 << levels[0]->num_components) - 1); - - if (levels[1]) { - store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3); - offset = nir_iadd(b, offset, nir_imm_int(b, levels[0]->num_components)); - - store->src[0] = nir_src_for_ssa(levels[1]); - store->src[1] = nir_src_for_ssa(tessfactor_address); - store->src[2] = nir_src_for_ssa(offset); - nir_builder_instr_insert(b, &store->instr); - store->num_components = levels[1]->num_components; - nir_intrinsic_set_write_mask(store, (1 << levels[1]->num_components) - 1); - } - - /* Finally, Insert endpatch instruction, maybe signalling the tess engine - * that another primitive is ready? + /* Insert endpatch instruction: + * + * TODO we should re-work this to use normal flow control. 
    */
 
    nir_intrinsic_instr *end_patch =
@@ -541,7 +569,8 @@ emit_tess_epilouge(nir_builder *b, struct state *state)
 }
 
 void
-ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader *s, unsigned topology)
+ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
+      unsigned topology)
 {
    struct state state = { .topology = topology };
 
@@ -551,9 +580,9 @@ ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader *s, unsigned topol
       nir_print_shader(shader, stderr);
    }
 
-   build_primitive_map(shader, &state.map, &shader->outputs);
-   memcpy(s->output_loc, state.map.loc, sizeof(s->output_loc));
-   s->output_size = state.map.stride;
+   build_primitive_map(shader, nir_var_shader_out, &state.map);
+   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
+   v->output_size = state.map.stride;
 
    nir_function_impl *impl = nir_shader_get_entrypoint(shader);
    assert(impl);
@@ -564,7 +593,7 @@ ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader *s, unsigned topol
 
    state.header = nir_load_tcs_header_ir3(&b);
 
-   nir_foreach_block_safe(block, impl)
+   nir_foreach_block_safe (block, impl)
      lower_tess_ctrl_block(block, &b, &state);
 
    /* Now move the body of the TCS into a conditional:
@@ -611,7 +640,7 @@ ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader *s, unsigned topol
 static void
 lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
 {
-   nir_foreach_instr_safe(instr, block) {
+   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;
 
@@ -643,7 +672,7 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
        b->cursor = nir_before_instr(&intr->instr);
 
        nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-       nir_variable *var = get_var(&b->shader->inputs, nir_intrinsic_base(intr));
+       nir_variable *var = get_var(b->shader, nir_var_shader_in, nir_intrinsic_base(intr));
        nir_ssa_def *offset = build_per_vertex_offset(b, state,
              intr->src[0].ssa, intr->src[1].ssa, var);
 
@@ -651,57 +680,32 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
        break;
      }
 
-     case nir_intrinsic_load_tess_level_inner:
-     case nir_intrinsic_load_tess_level_outer: {
-       b->cursor = nir_before_instr(&intr->instr);
-
-       gl_varying_slot slot;
-       if (intr->intrinsic == nir_intrinsic_load_tess_level_inner)
-          slot = VARYING_SLOT_TESS_LEVEL_INNER;
-       else
-          slot = VARYING_SLOT_TESS_LEVEL_OUTER;
-
-       nir_ssa_def *address = nir_load_tess_factor_base_ir3(b);
-       nir_ssa_def *offset = build_tessfactor_base(b, slot, state);
-
-       /* Loading across a vec4 (16b) memory boundary is problematic
-        * if we don't use components from the second vec4. The tess
-        * levels aren't guaranteed to be vec4 aligned and we don't
-        * know which levels are actually used, so we load each
-        * component individually.
-        */
-       nir_ssa_def *levels[4];
-       for (unsigned i = 0; i < intr->num_components; i++) {
-          nir_intrinsic_instr *new_intr =
-             nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_global_ir3);
-
-          new_intr->src[0] = nir_src_for_ssa(address);
-          new_intr->src[1] = nir_src_for_ssa(nir_iadd(b, offset, nir_imm_int(b, i)));
-          new_intr->num_components = 1;
-          nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, 1, 32, NULL);
-          nir_builder_instr_insert(b, &new_intr->instr);
-          levels[i] = &new_intr->dest.ssa;
-       }
-
-       nir_ssa_def *v = nir_vec(b, levels, intr->num_components);
-
-       nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(v));
-
-       nir_instr_remove(&intr->instr);
-       break;
-     }
-
     case nir_intrinsic_load_input: {
       // src[] = { offset }.
-       nir_variable *var = get_var(&b->shader->inputs, nir_intrinsic_base(intr));
+       nir_variable *var = get_var(b->shader, nir_var_shader_in, nir_intrinsic_base(intr));
 
        debug_assert(var->data.patch);
 
        b->cursor = nir_before_instr(&intr->instr);
 
-       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-       nir_ssa_def *offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+       nir_ssa_def *address, *offset;
+
+       /* note if vectorization of the tess level loads ever happens:
+        * "ldg" across 16-byte boundaries can behave incorrectly if results
+        * are never used. most likely some issue with (sy) not properly
+        * syncing with values coming from a second memory transaction.
+        */
+       if (is_tess_levels(var)) {
+          assert(intr->dest.ssa.num_components == 1);
+          address = nir_load_tess_factor_base_ir3(b);
+          offset = build_tessfactor_base(b, var->data.location, state);
+       } else {
+          address = nir_load_tess_param_base_ir3(b);
+          offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+       }
+
+       offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr)));
 
        replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
        break;
@@ -725,7 +729,7 @@ ir3_nir_lower_tess_eval(nir_shader *shader, unsigned topology)
    }
 
    /* Build map of inputs so we have the sizes. */
-   build_primitive_map(shader, &state.map, &shader->inputs);
+   build_primitive_map(shader, nir_var_shader_in, &state.map);
 
    nir_function_impl *impl = nir_shader_get_entrypoint(shader);
    assert(impl);
@@ -733,7 +737,7 @@ ir3_nir_lower_tess_eval(nir_shader *shader, unsigned topology)
    nir_builder b;
    nir_builder_init(&b, impl);
 
-   nir_foreach_block_safe(block, impl)
+   nir_foreach_block_safe (block, impl)
      lower_tess_eval_block(block, &b, &state);
 
    nir_metadata_preserve(impl, 0);
@@ -742,47 +746,31 @@ ir3_nir_lower_tess_eval(nir_shader *shader, unsigned topology)
 static void
 lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
 {
-   nir_intrinsic_instr *outputs[32] = {};
-
-   nir_foreach_instr_safe(instr, block) {
+   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;
 
     nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
     switch (intr->intrinsic) {
-     case nir_intrinsic_store_output: {
-       // src[] = { value, offset }.
-
-       uint32_t loc = nir_intrinsic_base(intr);
-       outputs[loc] = intr;
-       break;
-     }
-
     case nir_intrinsic_end_primitive: {
       b->cursor = nir_before_instr(&intr->instr);
-      nir_store_var(b, state->vertex_flags_var, nir_imm_int(b, 4), 0x1);
+      nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
       nir_instr_remove(&intr->instr);
       break;
     }
 
     case nir_intrinsic_emit_vertex: {
-      /* Load the vertex count */
       b->cursor = nir_before_instr(&intr->instr);
+
       nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);
 
       nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));
 
-      for (uint32_t i = 0; i < ARRAY_SIZE(outputs); i++) {
-         if (outputs[i]) {
-            nir_store_var(b, state->output_vars[i],
-                  outputs[i]->src[0].ssa,
-                  (1 << outputs[i]->num_components) - 1);
-
-            nir_instr_remove(&outputs[i]->instr);
-         }
-         outputs[i] = NULL;
+      foreach_two_lists(dest_node, &state->emit_outputs, src_node, &state->old_outputs) {
+         nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
+         nir_variable *src = exec_node_data(nir_variable, src_node, node);
+         nir_copy_var(b, dest, src);
       }
 
       nir_instr_remove(&intr->instr);
@@ -790,39 +778,13 @@ lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
       nir_store_var(b, state->emitted_vertex_var,
             nir_iadd(b, nir_load_var(b, state->emitted_vertex_var), nir_imm_int(b, 1)), 0x1);
 
-      nir_store_var(b, state->vertex_flags_out,
-            nir_load_var(b, state->vertex_flags_var), 0x1);
-
       nir_pop_if(b, NULL);
 
       /* Increment the vertex count by 1 */
       nir_store_var(b, state->vertex_count_var,
             nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
-      nir_store_var(b, state->vertex_flags_var, nir_imm_int(b, 0), 0x1);
-
-      break;
-    }
-
-    case nir_intrinsic_load_per_vertex_input: {
-      // src[] = { vertex, offset }.
-
-      b->cursor = nir_before_instr(&intr->instr);
-
-      nir_ssa_def *offset = build_local_offset(b, state,
-            intr->src[0].ssa, // this is typically gl_InvocationID
-            nir_intrinsic_base(intr),
-            intr->src[1].ssa);
-
-      replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL, NULL);
-      break;
-    }
+      nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);
 
-    case nir_intrinsic_load_invocation_id: {
-      b->cursor = nir_before_instr(&intr->instr);
-
-      nir_ssa_def *iid = build_invocation_id(b, state);
-      nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(iid));
-      nir_instr_remove(&intr->instr);
      break;
     }
 
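
The control flow generated around each EmitVertex() is easier to read in scalar form; roughly (a sketch of the lowered shape, not literal NIR):

   /* Only the invocation whose turn it is copies its outputs and counts
    * the emit; every invocation advances vertex_count. */
   if (vertex_count == local_thread_id()) {
      /* nir_copy_var() of each old_outputs entry into emit_outputs */
      emitted_vertex += 1;
   }
   vertex_count += 1;
   vertex_flags_out = 0;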
@@ -832,50 +794,8 @@ lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
    }
 }
 
-static void
-emit_store_outputs(nir_builder *b, struct state *state)
-{
-   /* This also stores the internally added vertex_flags output.
-    */
-
-   for (uint32_t i = 0; i < ARRAY_SIZE(state->output_vars); i++) {
-      if (!state->output_vars[i])
-         continue;
-
-      nir_intrinsic_instr *store =
-         nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
-
-      nir_intrinsic_set_base(store, i);
-      store->src[0] = nir_src_for_ssa(nir_load_var(b, state->output_vars[i]));
-      store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
-      store->num_components = store->src[0].ssa->num_components;
-
-      nir_builder_instr_insert(b, &store->instr);
-   }
-}
-
-static void
-clean_up_split_vars(nir_shader *shader, struct exec_list *list)
-{
-   uint32_t components[32] = {};
-
-   nir_foreach_variable(var, list) {
-      uint32_t mask =
-         ((1 << glsl_get_components(glsl_without_array(var->type))) - 1) << var->data.location_frac;
-      components[var->data.driver_location] |= mask;
-   }
-
-   nir_foreach_variable_safe(var, list) {
-      uint32_t mask =
-         ((1 << glsl_get_components(glsl_without_array(var->type))) - 1) << var->data.location_frac;
-      bool subset =
-         (components[var->data.driver_location] | mask) != mask;
-      if (subset)
-         exec_node_remove(&var->node);
-   }
-}
-
 void
-ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s)
+ir3_nir_lower_gs(nir_shader *shader)
 {
    struct state state = { };
 
@@ -884,21 +804,17 @@ ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s)
       nir_print_shader(shader, stderr);
    }
 
-   clean_up_split_vars(shader, &shader->inputs);
-   clean_up_split_vars(shader, &shader->outputs);
-
-   build_primitive_map(shader, &state.map, &shader->inputs);
-
-   uint32_t loc = 0;
-   nir_foreach_variable(var, &shader->outputs) {
-      uint32_t end = var->data.driver_location + glsl_count_attribute_slots(var->type, false);
-      loc = MAX2(loc, end);
-   }
+   build_primitive_map(shader, nir_var_shader_in, &state.map);
 
+   /* Create an output var for vertex_flags. This will be shadowed below,
+    * the same way regular outputs get shadowed, and this variable will
+    * become a temporary.
+    */
    state.vertex_flags_out = nir_variable_create(shader, nir_var_shader_out,
         glsl_uint_type(), "vertex_flags");
-   state.vertex_flags_out->data.driver_location = loc;
+   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
    state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
+   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;
 
    nir_function_impl *impl = nir_shader_get_entrypoint(shader);
    assert(impl);
@@ -909,27 +825,55 @@ ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s)
 
    state.header = nir_load_gs_header_ir3(&b);
 
-   nir_foreach_variable(var, &shader->outputs) {
-      state.output_vars[var->data.driver_location] =
-         nir_local_variable_create(impl, var->type,
-               ralloc_asprintf(var, "%s:gs-temp", var->name));
+   /* Generate two sets of shadow vars for the output variables. The first
+    * set replaces the real outputs and the second set (emit_outputs) we'll
+    * assign in the emit_vertex conditionals. Then at the end of the shader
+    * we copy the emit_outputs to the real outputs, so that we get
+    * store_output in uniform control flow.
+    */
+   exec_list_make_empty(&state.old_outputs);
+   nir_foreach_shader_out_variable_safe(var, shader) {
+      exec_node_remove(&var->node);
+      exec_list_push_tail(&state.old_outputs, &var->node);
+   }
+   exec_list_make_empty(&state.new_outputs);
+   exec_list_make_empty(&state.emit_outputs);
+   nir_foreach_variable_in_list(var, &state.old_outputs) {
+      /* Create a new output var by cloning the original output var and
+       * stealing the name.
+       */
+      nir_variable *output = nir_variable_clone(var, shader);
+      exec_list_push_tail(&state.new_outputs, &output->node);
+
+      /* Rewrite the original output to be a shadow variable. */
+      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
+      var->data.mode = nir_var_shader_temp;
+
+      /* Clone the shadow variable to create the emit shadow variable that
+       * we'll assign in the emit conditionals.
+       */
+      nir_variable *emit_output = nir_variable_clone(var, shader);
+      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
+      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
    }
 
+   /* During the shader we'll keep track of which vertex we're currently
+    * emitting for the EmitVertex test and how many vertices we emitted so we
+    * know to discard if we didn't emit any. In most simple shaders, this can
+    * all be statically determined and gets optimized away.
+    */
    state.vertex_count_var =
      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
    state.emitted_vertex_var =
      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");
-   state.vertex_flags_var =
-     nir_local_variable_create(impl, glsl_uint_type(), "vertex_flags");
-   state.vertex_flags_out = state.output_vars[state.vertex_flags_out->data.driver_location];
 
-   /* initialize to 0 */
+   /* Initialize to 0. */
    b.cursor = nir_before_cf_list(&impl->body);
    nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
    nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
-   nir_store_var(&b, state.vertex_flags_var, nir_imm_int(&b, 4), 0x1);
+   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
 
-   nir_foreach_block_safe(block, impl)
+   nir_foreach_block_safe (block, impl)
      lower_gs_block(block, &b, &state);
 
    set_foreach(impl->end_block->predecessors, block_entry) {
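
To summarize the renaming that hunk performs for one user output foo (the @-suffixed names follow the ralloc_asprintf() calls above):

   /* foo           -- new_outputs: the real shader_out; written once, at
    *                  the end of the shader, from foo@emit-temp
    * foo@gs-temp   -- old_outputs: the original variable demoted to
    *                  nir_var_shader_temp; existing stores still hit it
    * foo@emit-temp -- emit_outputs: snapshot copied from foo@gs-temp
    *                  inside each EmitVertex conditional */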
+ */ + nir_variable *output = nir_variable_clone(var, shader); + exec_list_push_tail(&state.new_outputs, &output->node); + + /* Rewrite the original output to be a shadow variable. */ + var->name = ralloc_asprintf(var, "%s@gs-temp", output->name); + var->data.mode = nir_var_shader_temp; + + /* Clone the shadow variable to create the emit shadow variable that + * we'll assign in the emit conditionals. + */ + nir_variable *emit_output = nir_variable_clone(var, shader); + emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name); + exec_list_push_tail(&state.emit_outputs, &emit_output->node); } + /* During the shader we'll keep track of which vertex we're currently + * emitting for the EmitVertex test and how many vertices we emitted so we + * know to discard if didn't emit any. In most simple shaders, this can + * all be statically determined and gets optimized away. + */ state.vertex_count_var = nir_local_variable_create(impl, glsl_uint_type(), "vertex_count"); state.emitted_vertex_var = nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex"); - state.vertex_flags_var = - nir_local_variable_create(impl, glsl_uint_type(), "vertex_flags"); - state.vertex_flags_out = state.output_vars[state.vertex_flags_out->data.driver_location]; - /* initialize to 0 */ + /* Initialize to 0. */ b.cursor = nir_before_cf_list(&impl->body); nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1); nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1); - nir_store_var(&b, state.vertex_flags_var, nir_imm_int(&b, 4), 0x1); + nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1); - nir_foreach_block_safe(block, impl) + nir_foreach_block_safe (block, impl) lower_gs_block(block, &b, &state); set_foreach(impl->end_block->predecessors, block_entry) { @@ -945,13 +889,63 @@ ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s) nir_builder_instr_insert(&b, &discard_if->instr); - emit_store_outputs(&b, &state); + foreach_two_lists(dest_node, &state.new_outputs, src_node, &state.emit_outputs) { + nir_variable *dest = exec_node_data(nir_variable, dest_node, node); + nir_variable *src = exec_node_data(nir_variable, src_node, node); + nir_copy_var(&b, dest, src); + } } + exec_list_append(&shader->variables, &state.old_outputs); + exec_list_append(&shader->variables, &state.emit_outputs); + exec_list_append(&shader->variables, &state.new_outputs); + nir_metadata_preserve(impl, 0); + nir_lower_global_vars_to_local(shader); + nir_split_var_copies(shader); + nir_lower_var_copies(shader); + + nir_fixup_deref_modes(shader); + if (shader_debug_enabled(shader->info.stage)) { fprintf(stderr, "NIR (after gs lowering):\n"); nir_print_shader(shader, stderr); } } + +uint32_t +ir3_link_geometry_stages(const struct ir3_shader_variant *producer, + const struct ir3_shader_variant *consumer, + uint32_t *locs) +{ + uint32_t num_loc = 0, factor; + + switch (consumer->type) { + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_GEOMETRY: + /* These stages load with ldlw, which expects byte offsets. */ + factor = 4; + break; + case MESA_SHADER_TESS_EVAL: + /* The tess eval shader uses ldg, which takes dword offsets. 
+      factor = 1;
+      break;
+   default:
+      unreachable("bad shader stage");
+   }
+
+   nir_foreach_shader_in_variable(in_var, consumer->shader->nir) {
+      nir_foreach_shader_out_variable(out_var, producer->shader->nir) {
+         if (in_var->data.location == out_var->data.location) {
+            locs[in_var->data.driver_location] =
+               producer->output_loc[out_var->data.driver_location] * factor;
+
+            debug_assert(num_loc <= in_var->data.driver_location + 1);
+            num_loc = in_var->data.driver_location + 1;
+         }
+      }
+   }
+
+   return num_loc;
+}
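
For callers, the contract is that locs[] receives one entry per consumer input driver_location, scaled into the units that stage's load instructions expect. A hypothetical call site (variant names invented for illustration):

   /* Link a VS producer to an HS consumer. Afterwards locs[i] is a byte
    * offset (HS/GS read via ldlw); for a TES consumer it would be a
    * dword offset (ldg). */
   uint32_t locs[32] = { 0 };
   uint32_t num_loc = ir3_link_geometry_stages(vs_variant, hs_variant, locs);
   for (uint32_t i = 0; i < num_loc; i++)
      printf("input %u -> offset %u\n", i, locs[i]);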