freedreno/ir3: fix/rework tess levels
authorJonathan Marek <jonathan@marek.ca>
Fri, 3 Jul 2020 15:34:47 +0000 (11:34 -0400)
committerJonathan Marek <jonathan@marek.ca>
Mon, 6 Jul 2020 12:48:06 +0000 (08:48 -0400)
The previous version assumes tess level outputs will only be written once
in the shader, however its not possible to guarantee that.

It also assumes all invocations will write all the levels, which is also
not guaranteed.

This is required to fix the "tesselation" and "terraintessellation" demos
with turnip.

The comment about nir_lower_io_to_temporaries in lower_tess_ctrl_block is
removed because nir_lower_io_to_temporaries specifically skips TESS_CTRL
shaders so the comment doesn't make sense.

The split load for tess levels workaround is removed, the new version only
has scalar access unless if ever gets vectorized.

This sets NIR_COMPACT_ARRAYS cap to avoid the glsl tess vec lowering with
gallium. It seems this will also disable "LowerCombinedClipCullDistance",
which I'm not sure was needed or not.

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5744>

src/freedreno/ir3/ir3_nir_lower_tess.c
src/freedreno/vulkan/tu_shader.c
src/gallium/drivers/freedreno/freedreno_screen.c

index bdb98f3d935a85d0221f06f81d975b22873aa8c3..4c06b458665f1484b5668b37722c3bb76e77c211 100644 (file)
@@ -42,9 +42,6 @@ struct state {
 
        struct exec_list old_outputs;
        struct exec_list emit_outputs;
-
-       nir_ssa_def *outer_levels[4];
-       nir_ssa_def *inner_levels[2];
 };
 
 static nir_ssa_def *
@@ -84,6 +81,13 @@ get_var(struct exec_list *list, int driver_location)
        return NULL;
 }
 
+static bool
+is_tess_levels(nir_variable *var)
+{
+       return (var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER ||
+                       var->data.location == VARYING_SLOT_TESS_LEVEL_INNER);
+}
+
 static nir_ssa_def *
 build_local_offset(nir_builder *b, struct state *state,
                nir_ssa_def *vertex, uint32_t base, nir_ssa_def *offset)
@@ -354,26 +358,32 @@ build_patch_offset(nir_builder *b, struct state *state, nir_ssa_def *offset, nir
        return build_per_vertex_offset(b, state, nir_imm_int(b, 0), offset, var);
 }
 
-static nir_ssa_def *
-build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state)
+static void
+tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
 {
-       uint32_t inner_levels, outer_levels;
        switch (state->topology) {
        case IR3_TESS_TRIANGLES:
-               inner_levels = 1;
-               outer_levels = 3;
+               *inner = 1;
+               *outer = 3;
                break;
        case IR3_TESS_QUADS:
-               inner_levels = 2;
-               outer_levels = 4;
+               *inner = 2;
+               *outer = 4;
                break;
        case IR3_TESS_ISOLINES:
-               inner_levels = 0;
-               outer_levels = 2;
+               *inner = 0;
+               *outer = 2;
                break;
        default:
                unreachable("bad");
        }
+}
+
+static nir_ssa_def *
+build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state)
+{
+       uint32_t inner_levels, outer_levels;
+       tess_level_components(state, &inner_levels, &outer_levels);
 
        const uint32_t patch_stride = 1 + inner_levels + outer_levels;
 
@@ -437,11 +447,7 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 
                        b->cursor = nir_before_instr(&intr->instr);
 
-                       /* nir_lower_io_to_temporaries replaces all access to output
-                        * variables with temp variables and then emits a nir_copy_var at
-                        * the end of the shader.  Thus, we should always get a full wrmask
-                        * here.
-                        */
+                       /* sparse writemask not supported */
                        assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
 
                        nir_ssa_def *value = intr->src[0].ssa;
@@ -456,23 +462,6 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
                        break;
                }
 
-               case nir_intrinsic_load_tess_level_inner:
-               case nir_intrinsic_load_tess_level_outer: {
-                       b->cursor = nir_before_instr(&intr->instr);
-
-                       gl_varying_slot slot;
-                       if (intr->intrinsic == nir_intrinsic_load_tess_level_inner)
-                               slot = VARYING_SLOT_TESS_LEVEL_INNER;
-                       else
-                               slot = VARYING_SLOT_TESS_LEVEL_OUTER;
-
-                       nir_ssa_def *address = nir_load_tess_factor_base_ir3(b);
-                       nir_ssa_def *offset = build_tessfactor_base(b, slot, state);
-
-                       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
-                       break;
-               }
-
                case nir_intrinsic_load_output: {
                        // src[] = { offset }.
 
@@ -480,8 +469,21 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 
                        b->cursor = nir_before_instr(&intr->instr);
 
-                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-                       nir_ssa_def *offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+                       nir_ssa_def *address, *offset;
+
+                       /* note if vectorization of the tess level loads ever happens:
+                        * "ldg" across 16-byte boundaries can behave incorrectly if results
+                        * are never used. most likely some issue with (sy) not properly
+                        * syncing with values coming from a second memory transaction.
+                        */
+                       if (is_tess_levels(var)) {
+                               assert(intr->dest.ssa.num_components == 1);
+                               address = nir_load_tess_factor_base_ir3(b);
+                               offset = build_tessfactor_base(b, var->data.location, state);
+                       } else {
+                               address = nir_load_tess_param_base_ir3(b);
+                               offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+                       }
 
                        replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
                        break;
@@ -494,35 +496,43 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 
                        nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
 
-                       nir_ssa_def **levels = NULL;
-                       if (var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
-                               levels = state->outer_levels;
-                       else if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER)
-                               levels = state->inner_levels;
-
                        b->cursor = nir_before_instr(&intr->instr);
 
-                       if (levels) {
-                               for (int i = 0; i < 4; i++) {
-                                       if (nir_intrinsic_write_mask(intr) & (1 << i)) {
-                                               uint32_t component = nir_intrinsic_component(intr);
-                                               levels[i + component] = nir_channel(b, intr->src[0].ssa, i);
-                                       }
-                               }
-                               nir_instr_remove(&intr->instr);
+                       /* sparse writemask not supported */
+                       assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
+                       if (is_tess_levels(var)) {
+                               /* with tess levels are defined as float[4] and float[2],
+                                * but tess factor BO has smaller sizes for tris/isolines,
+                                * so we have to discard any writes beyond the number of
+                                * components for inner/outer levels */
+                               uint32_t inner_levels, outer_levels, levels;
+                               tess_level_components(state, &inner_levels, &outer_levels);
+
+                               if (var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
+                                       levels = outer_levels;
+                               else
+                                       levels = inner_levels;
+
+                               assert(intr->src[0].ssa->num_components == 1);
+
+                               nir_ssa_def *offset =
+                                       nir_iadd_imm(b, intr->src[1].ssa, nir_intrinsic_component(intr));
+
+                               nir_if *nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
+
+                               replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
+                                               intr->src[0].ssa,
+                                               nir_load_tess_factor_base_ir3(b),
+                                               nir_iadd(b, offset, build_tessfactor_base(b, var->data.location, state)));
+
+                               nir_pop_if(b, nif);
                        } else {
                                nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
                                nir_ssa_def *offset = build_patch_offset(b, state, intr->src[1].ssa, var);
 
                                debug_assert(nir_intrinsic_component(intr) == 0);
 
-                               /* nir_lower_io_to_temporaries replaces all access to output
-                                * variables with temp variables and then emits a nir_copy_var at
-                                * the end of the shader.  Thus, we should always get a full wrmask
-                                * here.
-                                */
-                               assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
-
                                replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                                                intr->src[0].ssa, address, offset);
                        }
@@ -538,57 +548,7 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 static void
 emit_tess_epilouge(nir_builder *b, struct state *state)
 {
-       nir_ssa_def *tessfactor_address = nir_load_tess_factor_base_ir3(b);
-       nir_ssa_def *levels[2];
-
-       if (!state->outer_levels[0])
-               return;
-
-       /* Then emit the epilogue that actually writes out the tessellation levels
-        * to the BOs.
-        */
-       switch (state->topology) {
-       case IR3_TESS_TRIANGLES:
-               levels[0] = nir_vec4(b, state->outer_levels[0], state->outer_levels[1],
-                               state->outer_levels[2], state->inner_levels[0]);
-               levels[1] = NULL;
-               break;
-       case IR3_TESS_QUADS:
-               levels[0] = nir_vec4(b, state->outer_levels[0], state->outer_levels[1],
-                               state->outer_levels[2], state->outer_levels[3]);
-               levels[1] = nir_vec2(b, state->inner_levels[0], state->inner_levels[1]);
-               break;
-       case IR3_TESS_ISOLINES:
-               levels[0] = nir_vec2(b, state->outer_levels[0], state->outer_levels[1]);
-               levels[1] = NULL;
-               break;
-       default:
-               unreachable("nope");
-       }
-
-       nir_ssa_def *offset = build_tessfactor_base(b, VARYING_SLOT_TESS_LEVEL_OUTER, state);
-
-       nir_intrinsic_instr *store =
-               nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3);
-
-       store->src[0] = nir_src_for_ssa(levels[0]);
-       store->src[1] = nir_src_for_ssa(tessfactor_address);
-       store->src[2] = nir_src_for_ssa(offset);
-       nir_builder_instr_insert(b, &store->instr);
-       store->num_components = levels[0]->num_components;
-
-       if (levels[1]) {
-               store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3);
-               offset = nir_iadd(b, offset, nir_imm_int(b, levels[0]->num_components));
-
-               store->src[0] = nir_src_for_ssa(levels[1]);
-               store->src[1] = nir_src_for_ssa(tessfactor_address);
-               store->src[2] = nir_src_for_ssa(offset);
-               nir_builder_instr_insert(b, &store->instr);
-               store->num_components = levels[1]->num_components;
-       }
-
-       /* Finally, Insert endpatch instruction:
+       /* Insert endpatch instruction:
         *
         * TODO we should re-work this to use normal flow control.
         */
@@ -710,47 +670,6 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
                        break;
                }
 
-               case nir_intrinsic_load_tess_level_inner:
-               case nir_intrinsic_load_tess_level_outer: {
-                               unsigned dest_comp = nir_intrinsic_dest_components(intr);
-                               b->cursor = nir_before_instr(&intr->instr);
-
-                               gl_varying_slot slot;
-                               if (intr->intrinsic == nir_intrinsic_load_tess_level_inner)
-                                       slot = VARYING_SLOT_TESS_LEVEL_INNER;
-                               else
-                                       slot = VARYING_SLOT_TESS_LEVEL_OUTER;
-
-                               nir_ssa_def *address = nir_load_tess_factor_base_ir3(b);
-                               nir_ssa_def *offset = build_tessfactor_base(b, slot, state);
-
-                               /* Loading across a vec4 (16b) memory boundary is problematic
-                                * if we don't use components from the second vec4.  The tess
-                                * levels aren't guaranteed to be vec4 aligned and we don't
-                                * know which levels are actually used, so we load each
-                                * component individually.
-                                */
-                               nir_ssa_def *levels[4];
-                               for (unsigned i = 0; i < dest_comp; i++) {
-                                       nir_intrinsic_instr *new_intr =
-                                               nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_global_ir3);
-
-                                       new_intr->src[0] = nir_src_for_ssa(address);
-                                       new_intr->src[1] = nir_src_for_ssa(nir_iadd(b, offset, nir_imm_int(b, i)));
-                                       new_intr->num_components = 1;
-                                       nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, 1, 32, NULL);
-                                       nir_builder_instr_insert(b, &new_intr->instr);
-                                       levels[i] = &new_intr->dest.ssa;
-                               }
-
-                               nir_ssa_def *v = nir_vec(b, levels, dest_comp);
-
-                               nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(v));
-
-                               nir_instr_remove(&intr->instr);
-                               break;
-               }
-
                case nir_intrinsic_load_input: {
                        // src[] = { offset }.
 
@@ -760,8 +679,23 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
 
                        b->cursor = nir_before_instr(&intr->instr);
 
-                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-                       nir_ssa_def *offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+                       nir_ssa_def *address, *offset;
+
+                       /* note if vectorization of the tess level loads ever happens:
+                        * "ldg" across 16-byte boundaries can behave incorrectly if results
+                        * are never used. most likely some issue with (sy) not properly
+                        * syncing with values coming from a second memory transaction.
+                        */
+                       if (is_tess_levels(var)) {
+                               assert(intr->dest.ssa.num_components == 1);
+                               address = nir_load_tess_factor_base_ir3(b);
+                               offset = build_tessfactor_base(b, var->data.location, state);
+                       } else {
+                               address = nir_load_tess_param_base_ir3(b);
+                               offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+                       }
+
+                       offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr)));
 
                        replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
                        break;
index 0fe4f63b3153c456a9d071f3c8dace12e5fb5f8f..50397ff72dbaa8ee28c701f79c29cbd74072dd9d 100644 (file)
@@ -43,8 +43,6 @@ tu_spirv_to_nir(struct ir3_compiler *compiler,
    const struct spirv_to_nir_options spirv_options = {
       .frag_coord_is_sysval = true,
       .lower_ubo_ssbo_access_to_offsets = true,
-      .tess_levels_are_sysvals = true,
-      .lower_tess_levels_to_vec = true,
       .caps = {
          .transform_feedback = true,
          .tessellation = true,
index 0c7889dcd0b09c3bebcdb9955341868cfd75f9f0..0ab93801ba867abc5031cde10f37e4549242913a 100644 (file)
@@ -198,6 +198,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
        case PIPE_CAP_TEXTURE_BARRIER:
        case PIPE_CAP_INVALIDATE_BUFFER:
        case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND:
+       case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS:
+       case PIPE_CAP_NIR_COMPACT_ARRAYS:
                return 1;
 
        case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: