panfrost/midgard: Use fancy iterator
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.c b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
index c519193a56a98c52fc0b73b6552df48be55e75c2..f29f938215a14d070ed5a1310106b04e20cda1ac 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard_compile.c
+++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
@@ -351,6 +351,7 @@ optimise_nir(nir_shader *nir)
 
         NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
         NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
+        NIR_PASS(progress, nir, nir_lower_idiv);
 
         nir_lower_tex_options lower_tex_options = {
                 .lower_rect = true
@@ -682,7 +683,10 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
                 ALU_CASE(iadd, iadd);
                 ALU_CASE(isub, isub);
                 ALU_CASE(imul, imul);
-                ALU_CASE(iabs, iabs);
+
+                /* Zero shoved in as the second argument */
+                ALU_CASE(iabs, iabsdiff);
+
                 ALU_CASE(mov, imov);
 
                 ALU_CASE(feq32, feq);
@@ -927,7 +931,8 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
                 }
 
                 ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx);
-        } else if (instr->op == nir_op_f2b32 || instr->op == nir_op_i2b32) {
+        } else if (nr_inputs == 1 && !quirk_flipped_r24) {
+                /* Lots of instructions need a 0 plonked in */
                 ins.ssa_args.inline_constant = false;
                 ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
                 ins.has_constants = true;
@@ -1120,6 +1125,9 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                         offset += nir_src_as_uint(instr->src[0]);
                 }
 
+                /* We may need to apply a fractional offset */
+                int component = instr->intrinsic == nir_intrinsic_load_input ?
+                        nir_intrinsic_component(instr) : 0;
                 reg = nir_dest_index(ctx, &instr->dest);
 
                 if (instr->intrinsic == nir_intrinsic_load_uniform && !ctx->is_blend) {
@@ -1130,6 +1138,7 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
 
                         midgard_instruction ins = m_ld_vary_32(reg, offset);
                         ins.load_store.mask = (1 << nr_comp) - 1;
+                        ins.load_store.swizzle = SWIZZLE_XYZW >> (2 * component);
 
                         midgard_varying_parameter p = {
                                 .is_varying = 1,
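
The swizzle line above is what applies the "fractional offset" mentioned in
the previous hunk: swizzle lanes are 2 bits each, so shifting the identity
swizzle right by 2 * component starts the read at that component, and the
vacated high lanes decay to X, which is harmless since the mask only covers
nr_comp lanes. A sketch, assuming the packing used by the driver at the time
(lane 0 in the low bits, X=0 through W=3, making SWIZZLE_XYZW 0xE4):

        #include <stdio.h>

        #define SWIZZLE_XYZW 0xE4 /* assumed: 0b11100100 == .xyzw */

        int main(void) {
                const char lane[] = "xyzw";
                for (int component = 0; component < 4; ++component) {
                        unsigned s = SWIZZLE_XYZW >> (2 * component);
                        /* prints .xyzw, .yzwx, .zwxx, .wxxx */
                        printf("component %d -> .%c%c%c%c\n", component,
                               lane[(s >> 0) & 3], lane[(s >> 2) & 3],
                               lane[(s >> 4) & 3], lane[(s >> 6) & 3]);
                }
                return 0;
        }
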
@@ -1221,44 +1230,23 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                         ctx->fragment_output = reg;
                 } else if (ctx->stage == MESA_SHADER_VERTEX) {
                         /* Varyings are written into one of two special
-                         * varying register, r26 or r27. The register itself is selected as the register
-                         * in the st_vary instruction, minus the base of 26. E.g. write into r27 and then call st_vary(1)
-                         *
-                         * Normally emitting fmov's is frowned upon,
-                         * but due to unique constraints of
-                         * REGISTER_VARYING, fmov emission + a
-                         * dedicated cleanup pass is the only way to
-                         * guarantee correctness when considering some
-                         * (common) edge cases XXX: FIXME */
-
-                        /* If this varying corresponds to a constant (why?!),
-                         * emit that now since it won't get picked up by
-                         * hoisting (since there is no corresponding move
-                         * emitted otherwise) */
-
-                        void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, reg + 1);
-
-                        if (constant_value) {
-                                /* Special case: emit the varying write
-                                 * directly to r26 (looks funny in asm but it's
-                                 * fine) and emit the store _now_. Possibly
-                                 * slightly slower, but this is a really stupid
-                                 * special case anyway (why on earth would you
-                                 * have a constant varying? Your own fault for
-                                 * slightly worse perf :P) */
-
-                                midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(26));
-                                attach_constants(ctx, &ins, constant_value, reg + 1);
-                                emit_mir_instruction(ctx, ins);
+                         * varying registers, r26 or r27. The register itself is
+                         * selected as the register in the st_vary instruction,
+                         * minus the base of 26. E.g. write into r27 and then
+                         * call st_vary(1) */
 
-                                midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(0), offset);
-                                st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
-                                emit_mir_instruction(ctx, st);
-                        } else {
-                                /* Do not emit the varying yet -- instead, just mark down that we need to later */
+                        midgard_instruction ins = v_fmov(reg, blank_alu_src, SSA_FIXED_REGISTER(26));
+                        emit_mir_instruction(ctx, ins);
 
-                                _mesa_hash_table_u64_insert(ctx->ssa_varyings, reg + 1, (void *) ((uintptr_t) (offset + 1)));
-                        }
+                        /* We should have been vectorized. That also lets us
+                         * ignore the mask, because the mask component on
+                         * st_vary is (as far as I can tell) ignored [the blob
+                         * sets it to zero] */
+                        assert(nir_intrinsic_component(instr) == 0);
+
+                        midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(0), offset);
+                        st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
+                        emit_mir_instruction(ctx, st);
                 } else {
                         DBG("Unknown store\n");
                         assert(0);
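
To make the "minus the base of 26" addressing concrete: the r27 variant of the
store added above would look roughly as follows (a sketch reusing this file's
emitters, not part of the change):

        /* Stage the value in r27, then select it with index 1 (27 - 26) */
        midgard_instruction mov = v_fmov(reg, blank_alu_src, SSA_FIXED_REGISTER(27));
        emit_mir_instruction(ctx, mov);

        midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(1), offset);
        st.load_store.unknown = 0x1E9E; /* XXX: same unidentified magic */
        emit_mir_instruction(ctx, st);
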
@@ -1683,6 +1671,10 @@ embedded_to_inline_constant(compiler_context *ctx)
 static void
 map_ssa_to_alias(compiler_context *ctx, int *ref)
 {
+        /* The sign is used quite deliberately to mark unused sources */
+        if (*ref < 0)
+                return;
+
         unsigned int alias = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_alias, *ref + 1);
 
         if (alias) {
@@ -1721,6 +1713,30 @@ midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block)
         return progress;
 }
 
+/* Dead code elimination for branches at the end of a block - only one branch
+ * per block is legal semantically */
+
+static void
+midgard_opt_cull_dead_branch(compiler_context *ctx, midgard_block *block)
+{
+        bool branched = false;
+
+        mir_foreach_instr_in_block_safe(block, ins) {
+                if (!midgard_is_branch_unit(ins->unit)) continue;
+
+                /* We ignore prepacked branches since the fragment epilogue is
+                 * just generally special */
+                if (ins->prepacked_branch) continue;
+
+                if (branched) {
+                        /* We already branched, so this is dead */
+                        mir_remove_instruction(ins);
+                }
+
+                branched = true;
+        }
+}
+
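
For intuition, a toy model of what the cull does to a block tail left behind
by nested control flow (toy types rather than real MIR; the prepacked-branch
exemption is elided):

        #include <stdbool.h>
        #include <stdio.h>

        struct toy_ins { const char *name; bool is_branch; bool live; };

        /* Same shape as the pass: keep the first branch, kill the rest */
        static void cull_dead_branches(struct toy_ins *block, int n) {
                bool branched = false;
                for (int i = 0; i < n; ++i) {
                        if (!block[i].is_branch) continue;
                        if (branched) block[i].live = false;
                        branched = true;
                }
        }

        int main(void) {
                struct toy_ins block[] = {
                        { "fadd",      false, true },
                        { "br.cond",   true,  true },
                        { "br.uncond", true,  true }, /* dead: already branched */
                };
                cull_dead_branches(block, 3);
                for (int i = 0; i < 3; ++i)
                        printf("%-10s %s\n", block[i].name,
                               block[i].live ? "kept" : "culled");
                return 0;
        }
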
 static bool
 mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
 {
@@ -1736,6 +1752,18 @@ mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
         return false;
 }
 
+static bool
+mir_nontrivial_source2_mod(midgard_instruction *ins)
+{
+        unsigned mask = squeeze_writemask(ins->alu.mask);
+        bool is_int = midgard_is_integer_op(ins->alu.op);
+
+        midgard_vector_alu_src src2 =
+                vector_alu_from_unsigned(ins->alu.src2);
+
+        return mir_nontrivial_mod(src2, is_int, mask);
+}
+
 static bool
 midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
 {
@@ -1759,14 +1787,7 @@ midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
                 if (ins->ssa_args.inline_constant) continue;
                 if (ins->has_constants) continue;
 
-                /* Also, if the move has side effects, we're helpless */
-
-                midgard_vector_alu_src src =
-                        vector_alu_from_unsigned(ins->alu.src2);
-                unsigned mask = squeeze_writemask(ins->alu.mask);
-                bool is_int = midgard_is_integer_op(ins->alu.op);
-
-                if (mir_nontrivial_mod(src, is_int, mask)) continue;
+                if (mir_nontrivial_source2_mod(ins)) continue;
                 if (ins->alu.outmod != midgard_outmod_none) continue;
 
                 /* We're clear -- rewrite */
@@ -1778,6 +1799,64 @@ midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
         return progress;
 }
 
+/* fmov.pos is an idiom for fpos. Propagate the .pos up to the source, so that
+ * the move can then be copy-propagated away entirely */
+
+static bool
+mir_compose_outmod(midgard_outmod *outmod, midgard_outmod comp)
+{
+        /* Nothing to do */
+        if (comp == midgard_outmod_none)
+                return true;
+
+        if (*outmod == midgard_outmod_none) {
+                *outmod = comp;
+                return true;
+        }
+
+        /* TODO: Compose rules */
+        return false;
+}
+
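+/* In other words: composing onto midgard_outmod_none always succeeds, but
+ * composing two real modifiers (say, .pos onto an existing .sat) is punted
+ * until the TODO rules exist, so callers must handle the failure. */
+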
+static bool
+midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block)
+{
+        bool progress = false;
+
+        mir_foreach_instr_in_block_safe(block, ins) {
+                if (ins->type != TAG_ALU_4) continue;
+                if (ins->alu.op != midgard_alu_op_fmov) continue;
+                if (ins->alu.outmod != midgard_outmod_pos) continue;
+
+                /* TODO: Registers? */
+                unsigned src = ins->ssa_args.src1;
+                if (src >= ctx->func->impl->ssa_alloc) continue;
+
+                /* There might be a source modifier, too */
+                if (mir_nontrivial_source2_mod(ins)) continue;
+
+                /* Backpropagate the modifier */
+                mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) {
+                        if (v->type != TAG_ALU_4) continue;
+                        if (v->ssa_args.dest != src) continue;
+
+                        midgard_outmod temp = v->alu.outmod;
+                        bool composed = mir_compose_outmod(&temp, ins->alu.outmod);
+
+                        /* Throw in the towel if we can't compose */
+                        if (!composed) break;
+
+                        /* Otherwise, transfer the modifier */
+                        v->alu.outmod = temp;
+                        ins->alu.outmod = midgard_outmod_none;
+                        progress = true;
+                        break;
+                }
+        }
+
+        return progress;
+}
+
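
A toy before/after of the rewrite this enables, after which copyprop can
delete the fmov outright (toy IR; the real pass additionally checks SSA-ness
and source modifiers as above):

        #include <stdio.h>

        enum outmod { OUTMOD_NONE, OUTMOD_POS };
        struct toy_alu { const char *op; int dest, src; enum outmod outmod; };

        /* Fold a trailing .pos back onto the instruction producing its source */
        static void pos_propagate(struct toy_alu *ir, int n) {
                for (int i = 0; i < n; ++i) {
                        if (ir[i].outmod != OUTMOD_POS) continue;
                        for (int j = i - 1; j >= 0; --j) {
                                if (ir[j].dest != ir[i].src) continue;
                                if (ir[j].outmod == OUTMOD_NONE) {
                                        ir[j].outmod = OUTMOD_POS;
                                        ir[i].outmod = OUTMOD_NONE;
                                }
                                break;
                        }
                }
        }

        int main(void) {
                struct toy_alu ir[] = {
                        { "fadd", 1, 0, OUTMOD_NONE },
                        { "fmov", 2, 1, OUTMOD_POS }, /* the fmov.pos idiom */
                };
                pos_propagate(ir, 2);
                for (int i = 0; i < 2; ++i) /* fadd.pos r1, r0 / fmov r2, r1 */
                        printf("%s%s r%d, r%d\n", ir[i].op,
                               ir[i].outmod == OUTMOD_POS ? ".pos" : "",
                               ir[i].dest, ir[i].src);
                return 0;
        }
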
 static bool
 midgard_opt_copy_prop_tex(compiler_context *ctx, midgard_block *block)
 {
@@ -1878,40 +1957,6 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
         }
 }
 
-/* Emit varying stores late */
-
-static void
-midgard_emit_store(compiler_context *ctx, midgard_block *block) {
-        /* Iterate in reverse to get the final write, rather than the first */
-
-        mir_foreach_instr_in_block_safe_rev(block, ins) {
-                /* Check if what we just wrote needs a store */
-                int idx = ins->ssa_args.dest;
-                uintptr_t varying = ((uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_varyings, idx + 1));
-
-                if (!varying) continue;
-
-                varying -= 1;
-
-                /* We need to store to the appropriate varying, so emit the
-                 * move/store */
-
-                /* TODO: Integrate with special purpose RA (and scheduler?) */
-                bool high_varying_register = false;
-
-                midgard_instruction mov = v_fmov(idx, blank_alu_src, SSA_FIXED_REGISTER(REGISTER_VARYING_BASE + high_varying_register));
-
-                midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(high_varying_register), varying);
-                st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
-
-                mir_insert_instruction_before(mir_next_op(ins), st);
-                mir_insert_instruction_before(mir_next_op(ins), mov);
-
-                /* We no longer need to store this varying */
-                _mesa_hash_table_u64_remove(ctx->ssa_varyings, idx + 1);
-        }
-}
-
 /* If there are leftovers after the below pass, emit actual fmov
  * instructions for the slow-but-correct path */
 
@@ -2077,7 +2122,6 @@ emit_block(compiler_context *ctx, nir_block *block)
         /* Perform heavylifting for aliasing */
         actualise_ssa_to_alias(ctx);
 
-        midgard_emit_store(ctx, this_block);
         midgard_pair_load_store(ctx, this_block);
 
         /* Append fragment shader epilogue (value writeout) */
@@ -2287,16 +2331,9 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
         /* TODO: Decide this at runtime */
         ctx->uniform_cutoff = 8;
 
-        /* Assign var locations early, so the epilogue can use them if necessary */
-
-        nir_assign_var_locations(&nir->outputs, &nir->num_outputs, glsl_type_size);
-        nir_assign_var_locations(&nir->inputs, &nir->num_inputs, glsl_type_size);
-        nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, glsl_type_size);
-
         /* Initialize at a global (not block) level hash tables */
 
         ctx->ssa_constants = _mesa_hash_table_u64_create(NULL);
-        ctx->ssa_varyings = _mesa_hash_table_u64_create(NULL);
         ctx->ssa_to_alias = _mesa_hash_table_u64_create(NULL);
         ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
         ctx->sysval_to_id = _mesa_hash_table_u64_create(NULL);
@@ -2307,16 +2344,22 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
         struct exec_list *varyings =
                 ctx->stage == MESA_SHADER_VERTEX ? &nir->outputs : &nir->inputs;
 
+        unsigned max_varying = 0;
         nir_foreach_variable(var, varyings) {
                 unsigned loc = var->data.driver_location;
                 unsigned sz = glsl_type_size(var->type, FALSE);
 
-                for (int c = 0; c < sz; ++c) {
-                        program->varyings[loc + c] = var->data.location;
+                for (int c = loc; c < (loc + sz); ++c) {
+                        program->varyings[c] = var->data.location;
+                        max_varying = MAX2(max_varying, c);
                 }
         }
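
For instance, a varying at driver_location 1 spanning sz == 2 slots fills
program->varyings[1] and [2], leaving max_varying at 2, so varying_count
below comes out to 3.
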
 
-        /* Lower gl_Position pre-optimisation */
+        /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
+         * (so we don't accidentally duplicate the epilogue since mesa/st has
+         * messed with our I/O quite a bit already) */
+
+        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
 
         if (ctx->stage == MESA_SHADER_VERTEX)
                 NIR_PASS_V(nir, nir_lower_viewport_transform);
@@ -2349,7 +2392,7 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
         memcpy(program->sysvals, ctx->sysvals, sizeof(ctx->sysvals[0]) * ctx->sysval_count);
 
         program->attribute_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_inputs : 0;
-        program->varying_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_outputs : ((ctx->stage == MESA_SHADER_FRAGMENT) ? nir->num_inputs : 0);
+        program->varying_count = max_varying + 1; /* fencepost: count, not max index */
 
         nir_foreach_function(func, nir) {
                 if (!func->impl)
@@ -2375,12 +2418,20 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
                 progress = false;
 
                 mir_foreach_block(ctx, block) {
+                        progress |= midgard_opt_pos_propagate(ctx, block);
                         progress |= midgard_opt_copy_prop(ctx, block);
                         progress |= midgard_opt_copy_prop_tex(ctx, block);
                         progress |= midgard_opt_dead_code_eliminate(ctx, block);
                 }
         } while (progress);
 
+        /* Nested control-flow can result in dead branches at the end of the
+         * block. This messes with our analysis and is just dead code, so cull
+         * them */
+        mir_foreach_block(ctx, block) {
+                midgard_opt_cull_dead_branch(ctx, block);
+        }
+
         /* Schedule! */
         schedule_program(ctx);
 
@@ -2536,7 +2587,7 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
          * last is an ALU, then it's also 1... */
 
         mir_foreach_block(ctx, block) {
-                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
+                mir_foreach_bundle_in_block(block, bundle) {
                         int lookahead = 1;
 
                         if (current_bundle + 1 < bundle_count) {
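
The "fancy iterator" of the commit title is presumably just a wrapper over the
old util_dynarray boilerplate, along the lines of the sketch below (the real
definition lives with the other mir_foreach_* macros in the MIR headers):

        #define mir_foreach_bundle_in_block(block, v) \
                util_dynarray_foreach(&(block)->bundles, midgard_bundle, v)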