nir: get ffma support from NIR options for nir_lower_flrp
[mesa.git] / src / freedreno / ir3 / ir3_nir_lower_tess.c
index 458b29848a0118d09ba6348b4645230828c68602..44b2921dcb7b97742ebc322442444f1968bed970 100644 (file)
@@ -41,10 +41,11 @@ struct state {
        nir_variable *vertex_flags_out;
 
        struct exec_list old_outputs;
+       struct exec_list new_outputs;
        struct exec_list emit_outputs;
 
-       nir_ssa_def *outer_levels[4];
-       nir_ssa_def *inner_levels[2];
+       /* tess ctrl shader on a650 gets the local primitive id at different bits: */
+       unsigned local_primitive_id_start;
 };
 
 static nir_ssa_def *
@@ -69,13 +70,13 @@ build_vertex_id(nir_builder *b, struct state *state)
 static nir_ssa_def *
 build_local_primitive_id(nir_builder *b, struct state *state)
 {
-       return bitfield_extract(b, state->header, 0, 63);
+       return bitfield_extract(b, state->header, state->local_primitive_id_start, 63);
 }
 
 static nir_variable *
-get_var(struct exec_list *list, int driver_location)
+get_var(nir_shader *shader, nir_variable_mode mode, int driver_location)
 {
-       nir_foreach_variable (v, list) {
+       nir_foreach_variable_with_modes (v, shader, mode) {
                if (v->data.driver_location == driver_location) {
                        return v;
                }
@@ -84,6 +85,13 @@ get_var(struct exec_list *list, int driver_location)
        return NULL;
 }
 
+static bool
+is_tess_levels(nir_variable *var)
+{
+       return (var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER ||
+                       var->data.location == VARYING_SLOT_TESS_LEVEL_INNER);
+}
+
 static nir_ssa_def *
 build_local_offset(nir_builder *b, struct state *state,
                nir_ssa_def *vertex, uint32_t base, nir_ssa_def *offset)
@@ -145,9 +153,9 @@ replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
 }
 
 static void
-build_primitive_map(nir_shader *shader, struct primitive_map *map, struct exec_list *list)
+build_primitive_map(nir_shader *shader, nir_variable_mode mode, struct primitive_map *map)
 {
-       nir_foreach_variable (var, list) {
+       nir_foreach_variable_with_modes (var, shader, mode) {
                switch (var->data.location) {
                case VARYING_SLOT_TESS_LEVEL_OUTER:
                case VARYING_SLOT_TESS_LEVEL_INNER:
@@ -165,7 +173,7 @@ build_primitive_map(nir_shader *shader, struct primitive_map *map, struct exec_l
        for (uint32_t i = 0; i < ARRAY_SIZE(map->size); i++) {
                if (map->size[i] == 0)
                                continue;
-               nir_variable *var = get_var(list, i);
+               nir_variable *var = get_var(shader, mode, i);
                map->loc[i] = loc;
                loc += map->size[i];
 
@@ -232,7 +240,7 @@ ir3_nir_lower_to_explicit_output(nir_shader *shader, struct ir3_shader_variant *
 {
        struct state state = { };
 
-       build_primitive_map(shader, &state.map, &shader->outputs);
+       build_primitive_map(shader, nir_var_shader_out, &state.map);
        memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
 
        nir_function_impl *impl = nir_shader_get_entrypoint(shader);
@@ -297,10 +305,16 @@ lower_block_to_explicit_input(nir_block *block, nir_builder *b, struct state *st
 }
 
 void
-ir3_nir_lower_to_explicit_input(nir_shader *shader)
+ir3_nir_lower_to_explicit_input(nir_shader *shader, struct ir3_compiler *compiler)
 {
        struct state state = { };
 
+       /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
+        * HS uses a different primitive id, which starts at bit 16 in the header
+        */
+       if (shader->info.stage == MESA_SHADER_TESS_CTRL && compiler->tess_use_shared)
+               state.local_primitive_id_start = 16;
+
        nir_function_impl *impl = nir_shader_get_entrypoint(shader);
        assert(impl);
 
@@ -354,26 +368,32 @@ build_patch_offset(nir_builder *b, struct state *state, nir_ssa_def *offset, nir
        return build_per_vertex_offset(b, state, nir_imm_int(b, 0), offset, var);
 }
 
-static nir_ssa_def *
-build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state)
+static void
+tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
 {
-       uint32_t inner_levels, outer_levels;
        switch (state->topology) {
        case IR3_TESS_TRIANGLES:
-               inner_levels = 1;
-               outer_levels = 3;
+               *inner = 1;
+               *outer = 3;
                break;
        case IR3_TESS_QUADS:
-               inner_levels = 2;
-               outer_levels = 4;
+               *inner = 2;
+               *outer = 4;
                break;
        case IR3_TESS_ISOLINES:
-               inner_levels = 0;
-               outer_levels = 2;
+               *inner = 0;
+               *outer = 2;
                break;
        default:
                unreachable("bad");
        }
+}
+
+static nir_ssa_def *
+build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state)
+{
+       uint32_t inner_levels, outer_levels;
+       tess_level_components(state, &inner_levels, &outer_levels);
 
        const uint32_t patch_stride = 1 + inner_levels + outer_levels;
 
@@ -424,7 +444,7 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
                        b->cursor = nir_before_instr(&intr->instr);
 
                        nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-                       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+                       nir_variable *var = get_var(b->shader, nir_var_shader_out, nir_intrinsic_base(intr));
                        nir_ssa_def *offset = build_per_vertex_offset(b, state,
                                        intr->src[0].ssa, intr->src[1].ssa, var);
 
@@ -437,16 +457,12 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 
                        b->cursor = nir_before_instr(&intr->instr);
 
-                       /* nir_lower_io_to_temporaries replaces all access to output
-                        * variables with temp variables and then emits a nir_copy_var at
-                        * the end of the shader.  Thus, we should always get a full wrmask
-                        * here.
-                        */
+                       /* sparse writemask not supported */
                        assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
 
                        nir_ssa_def *value = intr->src[0].ssa;
                        nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-                       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+                       nir_variable *var = get_var(b->shader, nir_var_shader_out, nir_intrinsic_base(intr));
                        nir_ssa_def *offset = build_per_vertex_offset(b, state,
                                        intr->src[1].ssa, intr->src[2].ssa, var);
 
@@ -456,32 +472,28 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
                        break;
                }
 
-               case nir_intrinsic_load_tess_level_inner:
-               case nir_intrinsic_load_tess_level_outer: {
-                       b->cursor = nir_before_instr(&intr->instr);
-
-                       gl_varying_slot slot;
-                       if (intr->intrinsic == nir_intrinsic_load_tess_level_inner)
-                               slot = VARYING_SLOT_TESS_LEVEL_INNER;
-                       else
-                               slot = VARYING_SLOT_TESS_LEVEL_OUTER;
-
-                       nir_ssa_def *address = nir_load_tess_factor_base_ir3(b);
-                       nir_ssa_def *offset = build_tessfactor_base(b, slot, state);
-
-                       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
-                       break;
-               }
-
                case nir_intrinsic_load_output: {
                        // src[] = { offset }.
 
-                       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+                       nir_variable *var = get_var(b->shader, nir_var_shader_out, nir_intrinsic_base(intr));
 
                        b->cursor = nir_before_instr(&intr->instr);
 
-                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-                       nir_ssa_def *offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+                       nir_ssa_def *address, *offset;
+
+                       /* note if vectorization of the tess level loads ever happens:
+                        * "ldg" across 16-byte boundaries can behave incorrectly if results
+                        * are never used. most likely some issue with (sy) not properly
+                        * syncing with values coming from a second memory transaction.
+                        */
+                       if (is_tess_levels(var)) {
+                               assert(intr->dest.ssa.num_components == 1);
+                               address = nir_load_tess_factor_base_ir3(b);
+                               offset = build_tessfactor_base(b, var->data.location, state);
+                       } else {
+                               address = nir_load_tess_param_base_ir3(b);
+                               offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+                       }
 
                        replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
                        break;
@@ -492,34 +504,45 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 
                        /* write patch output to bo */
 
-                       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
-
-                       nir_ssa_def **levels = NULL;
-                       if (var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
-                               levels = state->outer_levels;
-                       else if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER)
-                               levels = state->inner_levels;
+                       nir_variable *var = get_var(b->shader, nir_var_shader_out, nir_intrinsic_base(intr));
 
                        b->cursor = nir_before_instr(&intr->instr);
 
-                       if (levels) {
-                               for (int i = 0; i < 4; i++)
-                                       if (nir_intrinsic_write_mask(intr) & (1 << i))
-                                               levels[i] = nir_channel(b, intr->src[0].ssa, i);
-                               nir_instr_remove(&intr->instr);
+                       /* sparse writemask not supported */
+                       assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
+                       if (is_tess_levels(var)) {
+                               /* with tess levels are defined as float[4] and float[2],
+                                * but tess factor BO has smaller sizes for tris/isolines,
+                                * so we have to discard any writes beyond the number of
+                                * components for inner/outer levels */
+                               uint32_t inner_levels, outer_levels, levels;
+                               tess_level_components(state, &inner_levels, &outer_levels);
+
+                               if (var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
+                                       levels = outer_levels;
+                               else
+                                       levels = inner_levels;
+
+                               assert(intr->src[0].ssa->num_components == 1);
+
+                               nir_ssa_def *offset =
+                                       nir_iadd_imm(b, intr->src[1].ssa, nir_intrinsic_component(intr));
+
+                               nir_if *nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
+
+                               replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
+                                               intr->src[0].ssa,
+                                               nir_load_tess_factor_base_ir3(b),
+                                               nir_iadd(b, offset, build_tessfactor_base(b, var->data.location, state)));
+
+                               nir_pop_if(b, nif);
                        } else {
                                nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
                                nir_ssa_def *offset = build_patch_offset(b, state, intr->src[1].ssa, var);
 
                                debug_assert(nir_intrinsic_component(intr) == 0);
 
-                               /* nir_lower_io_to_temporaries replaces all access to output
-                                * variables with temp variables and then emits a nir_copy_var at
-                                * the end of the shader.  Thus, we should always get a full wrmask
-                                * here.
-                                */
-                               assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
-
                                replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                                                intr->src[0].ssa, address, offset);
                        }
@@ -535,57 +558,7 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
 static void
 emit_tess_epilouge(nir_builder *b, struct state *state)
 {
-       nir_ssa_def *tessfactor_address = nir_load_tess_factor_base_ir3(b);
-       nir_ssa_def *levels[2];
-
-       if (!state->outer_levels[0])
-               return;
-
-       /* Then emit the epilogue that actually writes out the tessellation levels
-        * to the BOs.
-        */
-       switch (state->topology) {
-       case IR3_TESS_TRIANGLES:
-               levels[0] = nir_vec4(b, state->outer_levels[0], state->outer_levels[1],
-                               state->outer_levels[2], state->inner_levels[0]);
-               levels[1] = NULL;
-               break;
-       case IR3_TESS_QUADS:
-               levels[0] = nir_vec4(b, state->outer_levels[0], state->outer_levels[1],
-                               state->outer_levels[2], state->outer_levels[3]);
-               levels[1] = nir_vec2(b, state->inner_levels[0], state->inner_levels[1]);
-               break;
-       case IR3_TESS_ISOLINES:
-               levels[0] = nir_vec2(b, state->outer_levels[0], state->outer_levels[1]);
-               levels[1] = NULL;
-               break;
-       default:
-               unreachable("nope");
-       }
-
-       nir_ssa_def *offset = build_tessfactor_base(b, VARYING_SLOT_TESS_LEVEL_OUTER, state);
-
-       nir_intrinsic_instr *store =
-               nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3);
-
-       store->src[0] = nir_src_for_ssa(levels[0]);
-       store->src[1] = nir_src_for_ssa(tessfactor_address);
-       store->src[2] = nir_src_for_ssa(offset);
-       nir_builder_instr_insert(b, &store->instr);
-       store->num_components = levels[0]->num_components;
-
-       if (levels[1]) {
-               store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3);
-               offset = nir_iadd(b, offset, nir_imm_int(b, levels[0]->num_components));
-
-               store->src[0] = nir_src_for_ssa(levels[1]);
-               store->src[1] = nir_src_for_ssa(tessfactor_address);
-               store->src[2] = nir_src_for_ssa(offset);
-               nir_builder_instr_insert(b, &store->instr);
-               store->num_components = levels[1]->num_components;
-       }
-
-       /* Finally, Insert endpatch instruction:
+       /* Insert endpatch instruction:
         *
         * TODO we should re-work this to use normal flow control.
         */
@@ -607,7 +580,7 @@ ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                nir_print_shader(shader, stderr);
        }
 
-       build_primitive_map(shader, &state.map, &shader->outputs);
+       build_primitive_map(shader, nir_var_shader_out, &state.map);
        memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
        v->output_size = state.map.stride;
 
@@ -699,7 +672,7 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
                        b->cursor = nir_before_instr(&intr->instr);
 
                        nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-                       nir_variable *var = get_var(&b->shader->inputs, nir_intrinsic_base(intr));
+                       nir_variable *var = get_var(b->shader, nir_var_shader_in, nir_intrinsic_base(intr));
                        nir_ssa_def *offset = build_per_vertex_offset(b, state,
                                        intr->src[0].ssa, intr->src[1].ssa, var);
 
@@ -707,58 +680,32 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
                        break;
                }
 
-               case nir_intrinsic_load_tess_level_inner:
-               case nir_intrinsic_load_tess_level_outer: {
-                               unsigned dest_comp = nir_intrinsic_dest_components(intr);
-                               b->cursor = nir_before_instr(&intr->instr);
-
-                               gl_varying_slot slot;
-                               if (intr->intrinsic == nir_intrinsic_load_tess_level_inner)
-                                       slot = VARYING_SLOT_TESS_LEVEL_INNER;
-                               else
-                                       slot = VARYING_SLOT_TESS_LEVEL_OUTER;
-
-                               nir_ssa_def *address = nir_load_tess_factor_base_ir3(b);
-                               nir_ssa_def *offset = build_tessfactor_base(b, slot, state);
-
-                               /* Loading across a vec4 (16b) memory boundary is problematic
-                                * if we don't use components from the second vec4.  The tess
-                                * levels aren't guaranteed to be vec4 aligned and we don't
-                                * know which levels are actually used, so we load each
-                                * component individually.
-                                */
-                               nir_ssa_def *levels[4];
-                               for (unsigned i = 0; i < dest_comp; i++) {
-                                       nir_intrinsic_instr *new_intr =
-                                               nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_global_ir3);
-
-                                       new_intr->src[0] = nir_src_for_ssa(address);
-                                       new_intr->src[1] = nir_src_for_ssa(nir_iadd(b, offset, nir_imm_int(b, i)));
-                                       new_intr->num_components = 1;
-                                       nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, 1, 32, NULL);
-                                       nir_builder_instr_insert(b, &new_intr->instr);
-                                       levels[i] = &new_intr->dest.ssa;
-                               }
-
-                               nir_ssa_def *v = nir_vec(b, levels, dest_comp);
-
-                               nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(v));
-
-                               nir_instr_remove(&intr->instr);
-                               break;
-               }
-
                case nir_intrinsic_load_input: {
                        // src[] = { offset }.
 
-                       nir_variable *var = get_var(&b->shader->inputs, nir_intrinsic_base(intr));
+                       nir_variable *var = get_var(b->shader, nir_var_shader_in, nir_intrinsic_base(intr));
 
                        debug_assert(var->data.patch);
 
                        b->cursor = nir_before_instr(&intr->instr);
 
-                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
-                       nir_ssa_def *offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+                       nir_ssa_def *address, *offset;
+
+                       /* note if vectorization of the tess level loads ever happens:
+                        * "ldg" across 16-byte boundaries can behave incorrectly if results
+                        * are never used. most likely some issue with (sy) not properly
+                        * syncing with values coming from a second memory transaction.
+                        */
+                       if (is_tess_levels(var)) {
+                               assert(intr->dest.ssa.num_components == 1);
+                               address = nir_load_tess_factor_base_ir3(b);
+                               offset = build_tessfactor_base(b, var->data.location, state);
+                       } else {
+                               address = nir_load_tess_param_base_ir3(b);
+                               offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+                       }
+
+                       offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr)));
 
                        replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
                        break;
@@ -782,7 +729,7 @@ ir3_nir_lower_tess_eval(nir_shader *shader, unsigned topology)
        }
 
        /* Build map of inputs so we have the sizes. */
-       build_primitive_map(shader, &state.map, &shader->inputs);
+       build_primitive_map(shader, nir_var_shader_in, &state.map);
 
        nir_function_impl *impl = nir_shader_get_entrypoint(shader);
        assert(impl);
@@ -857,7 +804,7 @@ ir3_nir_lower_gs(nir_shader *shader)
                nir_print_shader(shader, stderr);
        }
 
-       build_primitive_map(shader, &state.map, &shader->inputs);
+       build_primitive_map(shader, nir_var_shader_in, &state.map);
 
        /* Create an output var for vertex_flags. This will be shadowed below,
         * same way regular outputs get shadowed, and this variable will become a
@@ -884,14 +831,19 @@ ir3_nir_lower_gs(nir_shader *shader)
         * we copy the emit_outputs to the real outputs, so that we get
         * store_output in uniform control flow.
         */
-       exec_list_move_nodes_to(&shader->outputs, &state.old_outputs);
+       exec_list_make_empty(&state.old_outputs);
+       nir_foreach_shader_out_variable_safe(var, shader) {
+               exec_node_remove(&var->node);
+               exec_list_push_tail(&state.old_outputs, &var->node);
+       }
+       exec_list_make_empty(&state.new_outputs);
        exec_list_make_empty(&state.emit_outputs);
-       nir_foreach_variable(var, &state.old_outputs) {
+       nir_foreach_variable_in_list(var, &state.old_outputs) {
                /* Create a new output var by cloning the original output var and
                 * stealing the name.
                 */
                nir_variable *output = nir_variable_clone(var, shader);
-               exec_list_push_tail(&shader->outputs, &output->node);
+               exec_list_push_tail(&state.new_outputs, &output->node);
 
                /* Rewrite the original output to be a shadow variable. */
                var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
@@ -937,15 +889,16 @@ ir3_nir_lower_gs(nir_shader *shader)
 
                nir_builder_instr_insert(&b, &discard_if->instr);
 
-               foreach_two_lists(dest_node, &shader->outputs, src_node, &state.emit_outputs) {
+               foreach_two_lists(dest_node, &state.new_outputs, src_node, &state.emit_outputs) {
                        nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
                        nir_variable *src = exec_node_data(nir_variable, src_node, node);
                        nir_copy_var(&b, dest, src);
                }
        }
 
-       exec_list_append(&shader->globals, &state.old_outputs);
-       exec_list_append(&shader->globals, &state.emit_outputs);
+       exec_list_append(&shader->variables, &state.old_outputs);
+       exec_list_append(&shader->variables, &state.emit_outputs);
+       exec_list_append(&shader->variables, &state.new_outputs);
 
        nir_metadata_preserve(impl, 0);
 
@@ -982,8 +935,8 @@ ir3_link_geometry_stages(const struct ir3_shader_variant *producer,
                unreachable("bad shader stage");
        }
 
-       nir_foreach_variable(in_var, &consumer->shader->nir->inputs) {
-               nir_foreach_variable(out_var, &producer->shader->nir->outputs) {
+       nir_foreach_shader_in_variable(in_var, consumer->shader->nir) {
+               nir_foreach_shader_out_variable(out_var, producer->shader->nir) {
                        if (in_var->data.location == out_var->data.location) {
                                locs[in_var->data.driver_location] =
                                        producer->output_loc[out_var->data.driver_location] * factor;