pan/midgard: Fold ssa_args into midgard_instruction
diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c
index 862b9306c15ca069dcc011157be4db93d20fe83d..f80a0354fb88957c5a22477bafebd6bc5844fdd0 100644
--- a/src/panfrost/midgard/midgard_schedule.c
+++ b/src/panfrost/midgard/midgard_schedule.c
@@ -64,36 +64,38 @@ is_single_component_mask(unsigned mask)
 static bool
 can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
 {
+        /* Writeout has its own rules anyway */
+        if (first->compact_branch || second->compact_branch)
+                return true;
+
         /* Each instruction reads some registers and writes to a register. See
          * where the first writes */
 
-        /* Figure out where exactly we wrote to */
-        int source = first->ssa_args.dest;
+        int source = first->dest;
         int source_mask = first->mask;
 
         /* As long as the second doesn't read from the first, we're okay */
-        if (second->ssa_args.src0 == source) {
-                if (first->type == TAG_ALU_4) {
-                        /* Figure out which components we just read from */
-
-                        int q = second->alu.src1;
-                        midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
+        for (unsigned i = 0; i < ARRAY_SIZE(second->src); ++i) {
+                if (second->src[i] != source)
+                        continue;
 
-                        /* Check if there are components in common, and fail if so */
-                        if (swizzle_to_access_mask(m->swizzle) & source_mask)
-                                return false;
-                } else
+                if (first->type != TAG_ALU_4)
                         return false;
 
-        }
+                /* Figure out which components we just read from */
 
-        if (second->ssa_args.src1 == source)
-                return false;
+                int q = (i == 0) ? second->alu.src1 : second->alu.src2;
+                midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
+
+                /* Check if there are components in common, and fail if so */
+                if (swizzle_to_access_mask(m->swizzle) & source_mask)
+                        return false;
+        }
 
         /* Otherwise, it's safe in that regard. Another data hazard is both
          * writing to the same place, of course */
 
-        if (second->ssa_args.dest == source) {
+        if (second->dest == source) {
                 /* ...but only if the components overlap */
 
                 if (second->mask & source_mask)
@@ -118,6 +120,81 @@ midgard_has_hazard(
 
 }
 
+/* Fragment writeout (of r0) is allowed when:
+ *
+ *  - All components of r0 are written in the bundle
+ *  - No components of r0 are written in VLUT
+ *  - Non-pipelined dependencies of r0 are not written in the bundle
+ *
+ * This function checks if these requirements are satisfied given the content
+ * of a scheduled bundle.
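+ *
+ * For example, a bundle writing r0.xyz on VMUL and r0.w on SADD, with none of
+ * r0's non-pipelined dependencies written in the same bundle, may write out; a
+ * bundle that only produces some component of r0 on VLUT may not.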
+ */
+
+static bool
+can_writeout_fragment(compiler_context *ctx, midgard_instruction **bundle, unsigned count, unsigned node_count)
+{
+        /* First scan for which components of r0 are written out. Initially
+         * none are written */
+
+        uint8_t r0_written_mask = 0x0;
+
+        /* Simultaneously we scan for the set of dependencies */
+
+        size_t sz = sizeof(BITSET_WORD) * BITSET_WORDS(node_count);
+        BITSET_WORD *dependencies = alloca(sz);
+        memset(dependencies, 0, sz);
+
+        for (unsigned i = 0; i < count; ++i) {
+                midgard_instruction *ins = bundle[i];
+
+                if (ins->dest != SSA_FIXED_REGISTER(0))
+                        continue;
+
+                /* Record written out mask */
+                r0_written_mask |= ins->mask;
+
+                /* Record dependencies, but only if they won't become pipeline
+                 * registers. We know we can't be live after this, because
+                 * writeout happens at the very end of the shader. So check if
+                 * they were written before us. */
+
+                unsigned src0 = ins->src[0];
+                unsigned src1 = ins->src[1];
+
+                if (!mir_is_written_before(ctx, bundle[0], src0))
+                        src0 = ~0;
+
+                if (!mir_is_written_before(ctx, bundle[0], src1))
+                        src1 = ~0;
+
+                if (src0 < node_count)
+                        BITSET_SET(dependencies, src0);
+
+                if (src1 < node_count)
+                        BITSET_SET(dependencies, src1);
+
+                /* Requirement 2 */
+                if (ins->unit == UNIT_VLUT)
+                        return false;
+        }
+
+        /* Requirement 1 */
+        if ((r0_written_mask & 0xF) != 0xF)
+                return false;
+
+        /* Requirement 3 */
+
+        for (unsigned i = 0; i < count; ++i) {
+                unsigned dest = bundle[i]->dest;
+
+                if (dest < node_count && BITSET_TEST(dependencies, dest))
+                        return false;
+        }
+
+        /* Otherwise, we're good to go */
+        return true;
+}
+
 /* Schedules, but does not emit, a single basic block. After scheduling, the
  * final tag and size of the block are known, which are necessary for branching
  * */
@@ -128,6 +205,8 @@ schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction
         int instructions_emitted = 0, packed_idx = 0;
         midgard_bundle bundle = { 0 };
 
+        midgard_instruction *scheduled[5] = { NULL };
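+        /* scheduled[] records the instructions placed in this bundle so far, so
+         * can_writeout_fragment can validate writeout against the whole bundle */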
+
         uint8_t tag = ins->type;
 
         /* Default to the instruction's tag */
@@ -211,13 +290,10 @@ schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction
 
                                         could_scalar &= !s1.half;
 
-                                        if (!ains->ssa_args.inline_constant) {
-                                                midgard_vector_alu_src s2 =
-                                                        vector_alu_from_unsigned(ains->alu.src2);
-
-                                                could_scalar &= !s2.half;
-                                        }
+                                        midgard_vector_alu_src s2 =
+                                                vector_alu_from_unsigned(ains->alu.src2);
 
+                                        could_scalar &= !s2.half;
                                 }
 
                                 bool scalar = could_scalar && scalarable;
@@ -253,12 +329,16 @@ schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction
                                                 else
                                                         break;
                                         } else {
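+                                                /* Prefer VMUL while it is still open in
+                                                 * pipeline order, then fall back to SADD,
+                                                 * VADD, SMUL and finally VLUT */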
-                                                if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
+                                                if ((units & UNIT_VMUL) && (last_unit < UNIT_VMUL))
+                                                        unit = UNIT_VMUL;
+                                                else if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
                                                         unit = UNIT_SADD;
-                                                else if (units & UNIT_SMUL)
-                                                        unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
-                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
+                                                else if (units & UNIT_VADD)
                                                         unit = UNIT_VADD;
+                                                else if (units & UNIT_SMUL)
+                                                        unit = UNIT_SMUL;
+                                                else if (units & UNIT_VLUT)
+                                                        unit = UNIT_VLUT;
                                                 else
                                                         break;
                                         }
@@ -370,10 +450,10 @@ schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction
                                 unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
                                 unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
 
-                                if (ains->ssa_args.src0 == r_constant)
+                                if (ains->src[0] == r_constant)
                                         ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);
 
-                                if (ains->ssa_args.src1 == r_constant)
+                                if (ains->src[1] == r_constant)
                                         ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);
 
                                 bundle.has_embedded_constants = true;
@@ -386,15 +466,10 @@ schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction
                                 /* All of r0 has to be written out along with
                                  * the branch writeout */
 
-                                if (ains->writeout) {
-                                        /* The rules for when "bare" writeout
-                                         * is safe are when all components are
-                                         * r0 are written out in the final
-                                         * bundle, earlier than VLUT, where any
-                                         * register dependencies of r0 are from
-                                         * an earlier bundle. We can't verify
-                                         * this before RA, so we don't try. */
-
+                                if (ains->writeout && !can_writeout_fragment(ctx, scheduled, index, ctx->temp_count)) {
+                                        /* We only work on full moves
+                                         * at the beginning. We could
+                                         * probably do better */
                                         if (index != 0)
                                                 break;
 
@@ -422,6 +497,7 @@ schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction
                         }
 
                         /* Defer marking until after writing to allow for break */
+                        scheduled[index] = ains;
                         control |= ains->unit;
                         last_unit = ains->unit;
                         ++instructions_emitted;
@@ -552,18 +628,15 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
 
                                 if (c->type != TAG_LOAD_STORE_4) continue;
 
-                                /* Stores cannot be reordered, since they have
-                                 * dependencies. For the same reason, indirect
-                                 * loads cannot be reordered as their index is
-                                 * loaded in r27.w */
+                                /* We can only reorder if there are no sources */
 
-                                if (OP_IS_STORE(c->load_store.op)) continue;
+                                bool deps = false;
 
-                                /* It appears the 0x800 bit is set whenever a
-                                 * load is direct, unset when it is indirect.
-                                 * Skip indirect loads. */
+                                for (unsigned s = 0; s < ARRAY_SIZE(c->src); ++s)
+                                        deps |= (c->src[s] != ~0);
 
-                                if (!(c->load_store.unknown & 0x800)) continue;
+                                if (deps)
+                                        continue;
 
                                 /* We found one! Move it up to pair and remove it from the old location */
 
@@ -582,7 +655,7 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
 static unsigned
 find_or_allocate_temp(compiler_context *ctx, unsigned hash)
 {
-        if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
+        if (hash >= SSA_FIXED_MINIMUM)
                 return hash;
 
         unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
@@ -612,14 +685,10 @@ mir_squeeze_index(compiler_context *ctx)
         ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
 
         mir_foreach_instr_global(ctx, ins) {
-                if (ins->compact_branch) continue;
-
-                ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
-                ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
-
-                if (!ins->ssa_args.inline_constant)
-                        ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
+                ins->dest = find_or_allocate_temp(ctx, ins->dest);
 
+                for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i)
+                        ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]);
         }
 }
 
@@ -636,132 +705,225 @@ v_load_store_scratch(
         midgard_instruction ins = {
                 .type = TAG_LOAD_STORE_4,
                 .mask = mask,
-                .ssa_args = {
-                        .dest = -1,
-                        .src0 = -1,
-                        .src1 = -1
-                },
+                .dest = ~0,
+                .src = { ~0, ~0, ~0 },
                 .load_store = {
                         .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4,
                         .swizzle = SWIZZLE_XYZW,
 
                         /* For register spilling - to thread local storage */
-                        .unknown = 0x1EEA,
+                        .arg_1 = 0xEA,
+                        .arg_2 = 0x1E,
 
                         /* Splattered across, TODO combine logically */
                         .varying_parameters = (byte & 0x1FF) << 1,
                         .address = (byte >> 9)
-                }
+                },
+
+                /* If we spill an unspill, RA goes into an infinite loop */
+                .no_spill = true
         };
 
        if (is_store) {
                 /* r0 = r26, r1 = r27 */
                 assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
-                ins.ssa_args.src0 = (srcdest == SSA_FIXED_REGISTER(27)) ? SSA_FIXED_REGISTER(1) : SSA_FIXED_REGISTER(0);
+                ins.src[0] = srcdest;
         } else {
-                ins.ssa_args.dest = srcdest;
+                ins.dest = srcdest;
         }
 
         return ins;
 }
 
-void
-schedule_program(compiler_context *ctx)
+/* If register allocation fails, find the best spill node and spill it to fix
+ * whatever the issue was. This spill node could be a work register (spilling
+ * to thread local storage), but it could also simply be a special register
+ * that needs to spill to become a work register. */
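+
+/* Concretely, spilling a work register surrounds the spilled node's defs and
+ * uses with st_int4 stores and ld_int4 loads to a TLS slot, while spilling a
+ * special register instead routes the value through plain moves via fresh
+ * work-class nodes, with no TLS traffic. */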
+
+static void mir_spill_register(
+                compiler_context *ctx,
+                struct ra_graph *g,
+                unsigned *spill_count)
 {
-        struct ra_graph *g = NULL;
-        bool spilled = false;
-        int iter_count = 1000; /* max iterations */
+        unsigned spill_index = ctx->temp_count;
 
-        /* Number of 128-bit slots in memory we've spilled into */
-        unsigned spill_count = 0;
+        /* Our first step is to calculate spill cost to figure out the best
+         * spill node. All nodes are equal in spill cost, but we can't spill
+         * nodes written to from an unspill */
 
-        midgard_promote_uniforms(ctx, 8);
+        for (unsigned i = 0; i < ctx->temp_count; ++i) {
+                ra_set_node_spill_cost(g, i, 1.0);
+        }
 
-        mir_foreach_block(ctx, block) {
-                midgard_pair_load_store(ctx, block);
+        mir_foreach_instr_global(ctx, ins) {
+                if (ins->no_spill &&
+                    ins->dest < ctx->temp_count)
+                        ra_set_node_spill_cost(g, ins->dest, -1.0);
         }
 
-        do {
-                /* If we spill, find the best spill node and spill it */
+        int spill_node = ra_get_best_spill_node(g);
 
-                unsigned spill_index = ctx->temp_count;
-                if (g && spilled) {
-                        /* All nodes are equal in spill cost, but we can't
-                         * spill nodes written to from an unspill */
+        if (spill_node < 0) {
+                mir_print_shader(ctx);
+                assert(0);
+        }
 
-                        for (unsigned i = 0; i < ctx->temp_count; ++i) {
-                                ra_set_node_spill_cost(g, i, 1.0);
-                        }
+        /* We have a spill node, so check the class. Work registers
+         * legitimately spill to TLS, but special registers just spill to work
+         * registers */
 
-                        mir_foreach_instr_global(ctx, ins) {
-                                if (ins->type != TAG_LOAD_STORE_4)  continue;
-                                if (ins->load_store.op != midgard_op_ld_int4) continue;
-                                if (ins->load_store.unknown != 0x1EEA) continue;
-                                ra_set_node_spill_cost(g, ins->ssa_args.dest, -1.0);
-                        }
+        unsigned class = ra_get_node_class(g, spill_node);
+        bool is_special = (class >> 2) != REG_CLASS_WORK;
+        bool is_special_w = (class >> 2) == REG_CLASS_TEXW;
 
-                        int spill_node = ra_get_best_spill_node(g);
+        /* Allocate TLS slot (maybe) */
+        unsigned spill_slot = !is_special ? (*spill_count)++ : 0;
 
-                        if (spill_node < 0) {
-                                mir_print_shader(ctx);
-                                assert(0);
-                        }
+        /* For TLS, replace all stores to the spilled node. For
+         * special reads, just keep as-is; the class will be demoted
+         * implicitly. For special writes, spill to a work register */
 
-                        /* Allocate TLS slot */
-                        unsigned spill_slot = spill_count++;
+        if (!is_special || is_special_w) {
+                if (is_special_w)
+                        spill_slot = spill_index++;
 
-                        /* Replace all stores to the spilled node with stores
-                         * to TLS */
+                mir_foreach_instr_global_safe(ctx, ins) {
+                        if (ins->dest != spill_node) continue;
 
-                        mir_foreach_instr_global_safe(ctx, ins) {
-                                if (ins->compact_branch) continue;
-                                if (ins->ssa_args.dest != spill_node) continue;
-                                ins->ssa_args.dest = SSA_FIXED_REGISTER(26);
+                        midgard_instruction st;
 
-                                midgard_instruction st = v_load_store_scratch(ins->ssa_args.dest, spill_slot, true, ins->mask);
-                                mir_insert_instruction_before(mir_next_op(ins), st);
+                        if (is_special_w) {
+                                st = v_mov(spill_node, blank_alu_src, spill_slot);
+                                st.no_spill = true;
+                        } else {
+                                ins->dest = SSA_FIXED_REGISTER(26);
+                                st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask);
                         }
 
-                        /* Insert a load from TLS before the first consecutive
-                         * use of the node, rewriting to use spilled indices to
-                         * break up the live range */
+                        /* Hint: don't rewrite this node */
+                        st.hint = true;
 
-                        mir_foreach_block(ctx, block) {
+                        mir_insert_instruction_before(mir_next_op(ins), st);
 
-                        bool consecutive_skip = false;
-                        unsigned consecutive_index = 0;
+                        if (!is_special)
+                                ctx->spills++;
+                }
+        }
 
-                        mir_foreach_instr_in_block(block, ins) {
-                                if (ins->compact_branch) continue;
-                                
-                                if (!mir_has_arg(ins, spill_node)) {
-                                        consecutive_skip = false;
-                                        continue;
-                                }
+        /* For special reads, figure out how many components we need */
+        unsigned read_mask = 0;
 
-                                if (consecutive_skip) {
-                                        /* Rewrite */
-                                        mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
-                                        continue;
-                                }
+        mir_foreach_instr_global_safe(ctx, ins) {
+                read_mask |= mir_mask_of_read_components(ins, spill_node);
+        }
 
+        /* Insert a load from TLS before the first consecutive
+         * use of the node, rewriting to use spilled indices to
+         * break up the live range. Or, for special, insert a
+         * move. Ironically the latter *increases* register
+         * pressure, but the two uses of the spilling mechanism
+         * are somewhat orthogonal. (special spilling is to use
+         * work registers to back special registers; TLS
+         * spilling is to use memory to back work registers) */
+
+        mir_foreach_block(ctx, block) {
+                bool consecutive_skip = false;
+                unsigned consecutive_index = 0;
+
+                mir_foreach_instr_in_block(block, ins) {
+                        /* We can't rewrite the moves used to spill in the
+                         * first place. These moves are hinted. */
+                        if (ins->hint) continue;
+
+                        if (!mir_has_arg(ins, spill_node)) {
+                                consecutive_skip = false;
+                                continue;
+                        }
+
+                        if (consecutive_skip) {
+                                /* Rewrite */
+                                mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
+                                continue;
+                        }
+
+                        if (!is_special_w) {
                                 consecutive_index = ++spill_index;
-                                midgard_instruction st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
+
                                 midgard_instruction *before = ins;
 
                                 /* For a csel, go back one more not to break up the bundle */
                                 if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
                                         before = mir_prev_op(before);
 
+                                midgard_instruction st;
+
+                                if (is_special) {
+                                        /* Move */
+                                        st = v_mov(spill_node, blank_alu_src, consecutive_index);
+                                        st.no_spill = true;
+                                } else {
+                                        /* TLS load */
+                                        st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
+                                }
+
+                                /* Mask the load based on the component count
+                                 * actually needed to prevent RA loops */
+
+                                st.mask = read_mask;
+
                                 mir_insert_instruction_before(before, st);
                                // consecutive_skip = true;
+                        } else {
+                                /* Special writes already have their move spilled in */
+                                consecutive_index = spill_slot;
+                        }
 
 
-                                /* Rewrite to use */
-                                mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
-                        }
-                        }
+                        /* Rewrite to use */
+                        mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
+
+                        if (!is_special)
+                                ctx->fills++;
                 }
+        }
+
+        /* Reset hints */
+
+        mir_foreach_instr_global(ctx, ins) {
+                ins->hint = false;
+        }
+}
+
+void
+schedule_program(compiler_context *ctx)
+{
+        struct ra_graph *g = NULL;
+        bool spilled = false;
+        int iter_count = 1000; /* max iterations */
+
+        /* Number of 128-bit slots in memory we've spilled into */
+        unsigned spill_count = 0;
+
+        midgard_promote_uniforms(ctx, 16);
+
+        mir_foreach_block(ctx, block) {
+                midgard_pair_load_store(ctx, block);
+        }
+
+        /* Must be lowered right before RA */
+        mir_squeeze_index(ctx);
+        mir_lower_special_reads(ctx);
+
+        /* Lowering can introduce some dead moves */
+
+        mir_foreach_block(ctx, block) {
+                midgard_opt_dead_move_eliminate(ctx, block);
+        }
+
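+        /* Allocate registers, spilling one node and retrying on failure, for
+         * at most iter_count rounds */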
+        do {
+                if (spilled)
+                        mir_spill_register(ctx, g, &spill_count);
 
                 mir_squeeze_index(ctx);
 
@@ -769,19 +931,24 @@ schedule_program(compiler_context *ctx)
                 g = allocate_registers(ctx, &spilled);
         } while(spilled && ((iter_count--) > 0));
 
-                /* We would like to run RA after scheduling, but spilling can
-                 * complicate this */
+        /* We can simplify a bit after RA */
 
-                mir_foreach_block(ctx, block) {
-                        schedule_block(ctx, block);
-                }
-#if 0
+        mir_foreach_block(ctx, block) {
+                midgard_opt_post_move_eliminate(ctx, block, g);
+        }
+
+        /* After RA finishes, we schedule all at once */
 
-                /* Pipeline registers creation is a prepass before RA */
-                mir_create_pipeline_registers(ctx);
-#endif
+        mir_foreach_block(ctx, block) {
+                schedule_block(ctx, block);
+        }
 
+        /* Finally, we create pipeline registers as a peephole pass after
+         * scheduling. This isn't totally optimal, since there are cases where
+         * the usage of pipeline registers can eliminate spills, but it does
+         * save some power */
 
+        mir_create_pipeline_registers(ctx);
 
         if (iter_count <= 0) {
                 fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");