panfrost: Identify un/pack colour opcodes
[mesa.git] / src / panfrost / midgard / midgard_compile.c
index 9afe96f0cfc12bb95493cca3f3d81e65457d754a..d536f66449bcdccb8fc55d4dd219d6937405c164 100644
@@ -69,13 +69,6 @@ int midgard_debug = 0;
                do { if (midgard_debug & MIDGARD_DBG_MSGS) \
                        fprintf(stderr, "%s:%d: "fmt, \
                                __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
-
-static bool
-midgard_is_branch_unit(unsigned unit)
-{
-        return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT);
-}
-
 static midgard_block *
 create_empty_block(compiler_context *ctx)
 {
@@ -212,11 +205,12 @@ M_LOAD(ld_vary_32);
 M_LOAD(ld_ubo_int4);
 M_LOAD(ld_int4);
 M_STORE(st_int4);
-M_LOAD(ld_color_buffer_8);
+M_LOAD(ld_color_buffer_32u);
 //M_STORE(st_vary_16);
 M_STORE(st_vary_32);
 M_LOAD(ld_cubemap_coords);
 M_LOAD(ld_compute_id);
+M_LOAD(pack_colour);
 
 static midgard_instruction
 v_branch(bool conditional, bool invert)
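The M_LOAD()/M_STORE() macros generate small m_* constructor helpers; from the call sites later in this diff (m_ld_color_buffer_32u(reg, 0), m_st_vary_32(reg, offset)) they take a destination/source node plus an address. A minimal sketch of the shape such a generated helper takes, as a hypothetical simplification; the real macro earlier in midgard_compile.c fills in more state:

```c
/* Hypothetical simplification of what M_LOAD(pack_colour) expands to;
 * the real macro in midgard_compile.c initializes more fields. */
static midgard_instruction
m_pack_colour(unsigned dest, unsigned address)
{
        midgard_instruction ins = {
                .type = TAG_LOAD_STORE_4,             /* load/store bundle */
                .dest = dest,
                .load_store = {
                        .op = midgard_op_pack_colour, /* opcode named by the macro */
                        .address = address,
                },
        };

        return ins;
}
```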
@@ -494,6 +488,7 @@ optimise_nir(nir_shader *nir, unsigned quirks)
                 NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
 
                 NIR_PASS(progress, nir, nir_copy_prop);
+                NIR_PASS(progress, nir, nir_opt_remove_phis);
                 NIR_PASS(progress, nir, nir_opt_dce);
                 NIR_PASS(progress, nir, nir_opt_dead_cf);
                 NIR_PASS(progress, nir, nir_opt_cse);
@@ -1128,13 +1123,14 @@ mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read)
 /* Uniforms and UBOs use a shared code path, as uniforms are just (slightly
  * optimized) versions of UBO #0 */
 
-midgard_instruction *
+static midgard_instruction *
 emit_ubo_read(
         compiler_context *ctx,
         nir_instr *instr,
         unsigned dest,
         unsigned offset,
         nir_src *indirect_offset,
+        unsigned indirect_shift,
         unsigned index)
 {
         /* TODO: half-floats */
@@ -1147,7 +1143,7 @@ emit_ubo_read(
 
         if (indirect_offset) {
                 ins.src[2] = nir_src_index(ctx, indirect_offset);
-                ins.load_store.arg_2 = 0x80;
+                ins.load_store.arg_2 = (indirect_shift << 5);
         } else {
                 ins.load_store.arg_2 = 0x1E;
         }
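The hardcoded 0x80 becomes a parameterized shift: arg_2 appears to carry, in its upper bits, the left-shift applied to the indirect offset register. Uniforms are indexed in 16-byte vec4 units and pass indirect_shift = 4 (reproducing the old 0x80), while plain UBO reads pass 0, as the updated call sites below show. A standalone restatement of the encoding; the bit position is inferred from this diff, not from hardware documentation:

```c
#include <assert.h>

/* Inferred from this diff: the shift applied to the indirect offset
 * register is packed starting at bit 5 of arg_2, so 4 << 5 == 0x80
 * matches the value previously hardcoded for vec4-indexed uniforms. */
static unsigned
indirect_arg_2(unsigned indirect_shift)
{
        assert(indirect_shift <= 7); /* assume three bits of shift */
        return indirect_shift << 5;
}
```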
@@ -1320,7 +1316,7 @@ emit_sysval_read(compiler_context *ctx, nir_instr *instr, signed dest_override,
 
         /* Emit the read itself -- this is never indirect */
         midgard_instruction *ins =
-                emit_ubo_read(ctx, instr, dest, uniform * 16, NULL, 0);
+                emit_ubo_read(ctx, instr, dest, uniform * 16, NULL, 0, 0);
 
         ins->mask = mask_of(nr_components);
 }
@@ -1338,11 +1334,6 @@ compute_builtin_arg(nir_op op)
         }
 }
 
-/* Emit store for a fragment shader, which is encoded via a fancy branch. TODO:
- * Handle MRT here */
-static void
-emit_fragment_epilogue(compiler_context *ctx, unsigned rt);
-
 static void
 emit_fragment_store(compiler_context *ctx, unsigned src, unsigned rt)
 {
@@ -1360,9 +1351,15 @@ emit_fragment_store(compiler_context *ctx, unsigned src, unsigned rt)
         /* Emit the branch */
         midgard_instruction *br = emit_mir_instruction(ctx, ins);
         schedule_barrier(ctx);
-        br->branch.target_block = ctx->block_count - 1;
 
-        emit_fragment_epilogue(ctx, rt);
+        assert(rt < ARRAY_SIZE(ctx->writeout_branch));
+        assert(!ctx->writeout_branch[rt]);
+        ctx->writeout_branch[rt] = br;
+
+        /* Stash our current location (block count - 1) in the branch
+         * target; it gets popped later when the writeout loops are
+         * added. Maybe a bit too clever for my own good */
+
+        br->branch.target_block = ctx->block_count - 1;
 }
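Rather than emitting the epilogue inline, the writeout branch is now recorded per render target in ctx->writeout_branch and patched once every block exists (see mir_add_writeout_loops() further down). A condensed sketch of that defer-and-patch pattern, using deliberately simplified hypothetical types:

```c
#include <assert.h>

#define MAX_RTS 4 /* stand-in for ARRAY_SIZE(ctx->writeout_branch) */

struct branch { unsigned target_block; };

struct writeout_state {
        struct branch *writeout_branch[MAX_RTS];
};

/* Remember the RT's writeout branch and stash the current block index
 * in its target; the target is rewritten later, once the matching
 * epilogue block has actually been emitted. */
static void
record_writeout(struct writeout_state *s, unsigned rt,
                struct branch *br, unsigned current_block)
{
        assert(rt < MAX_RTS);
        assert(!s->writeout_branch[rt]);

        s->writeout_branch[rt] = br;
        br->target_block = current_block;
}
```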
 
 static void
@@ -1456,22 +1453,15 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                 reg = nir_dest_index(ctx, &instr->dest);
 
                 if (is_uniform && !ctx->is_blend) {
-                        emit_ubo_read(ctx, &instr->instr, reg, (ctx->sysval_count + offset) * 16, indirect_offset, 0);
+                        emit_ubo_read(ctx, &instr->instr, reg, (ctx->sysval_count + offset) * 16, indirect_offset, 4, 0);
                 } else if (is_ubo) {
                         nir_src index = instr->src[0];
 
-                        /* We don't yet support indirect UBOs. For indirect
-                         * block numbers (if that's possible), we don't know
-                         * enough about the hardware yet. For indirect sources,
-                         * we know what we need but we need to add some NIR
-                         * support for lowering correctly with respect to
-                         * 128-bit reads */
-
+                        /* TODO: Is indirect block number possible? */
                         assert(nir_src_is_const(index));
-                        assert(nir_src_is_const(*src_offset));
 
                         uint32_t uindex = nir_src_as_uint(index) + 1;
-                        emit_ubo_read(ctx, &instr->instr, reg, offset, NULL, uindex);
+                        emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex);
                 } else if (is_ssbo) {
                         nir_src index = instr->src[0];
                         assert(nir_src_is_const(index));
@@ -1511,7 +1501,7 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                 /* T720 and below use different blend opcodes with slightly
                  * different semantics than T760 and up */
 
-                midgard_instruction ld = m_ld_color_buffer_8(reg, 0);
+                midgard_instruction ld = m_ld_color_buffer_32u(reg, 0);
                 bool old_blend = ctx->quirks & MIDGARD_OLD_BLEND;
 
                 if (instr->intrinsic == nir_intrinsic_load_output_u8_as_fp16_pan) {
@@ -1553,7 +1543,6 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                 reg = nir_src_index(ctx, &instr->src[0]);
 
                 if (ctx->stage == MESA_SHADER_FRAGMENT) {
-                        /* Determine number of render targets */
                         emit_fragment_store(ctx, reg, offset);
                 } else if (ctx->stage == MESA_SHADER_VERTEX) {
                         /* We should have been vectorized, though we don't
@@ -1566,7 +1555,7 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
 
                         emit_explicit_constant(ctx, reg, reg);
 
-                        unsigned component = nir_intrinsic_component(instr);
+                        unsigned dst_component = nir_intrinsic_component(instr);
                         unsigned nr_comp = nir_src_num_components(instr->src[0]);
 
                         midgard_instruction st = m_st_vary_32(reg, offset);
@@ -1589,8 +1578,20 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                                 break;
                         }
 
-                        for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle[0]); ++i)
-                                st.swizzle[0][i] = MIN2(i + component, nr_comp);
+                        /* nir_intrinsic_component(store_intr) encodes the
+                         * destination component start. Source component offset
+                         * adjustment is taken care of in
+                         * install_registers_instr(), when offset_swizzle() is
+                         * called.
+                         */
+                        unsigned src_component = COMPONENT_X;
+
+                        assert(nr_comp > 0);
+                        for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle); ++i) {
+                                st.swizzle[0][i] = src_component;
+                                if (i >= dst_component && i < dst_component + nr_comp - 1)
+                                        src_component++;
+                        }
 
                         emit_mir_instruction(ctx, st);
                 } else {
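To see what the new loop produces, here is a standalone replica for one concrete case: a vec2 written at destination component 1 (i.e. a varying stored to .yz). It is illustrative only and prints the resulting swizzle:

```c
#include <stdio.h>

int main(void)
{
        unsigned dst_component = 1, nr_comp = 2;
        unsigned swizzle[4], src_component = 0; /* COMPONENT_X == 0 */

        /* Same loop body as the diff above, over four lanes */
        for (unsigned i = 0; i < 4; ++i) {
                swizzle[i] = src_component;
                if (i >= dst_component && i < dst_component + nr_comp - 1)
                        src_component++;
        }

        /* Prints "0 0 1 1": source X/Y land in destination lanes 1/2,
         * and lanes outside the write mask repeat the nearest source
         * component. */
        for (unsigned i = 0; i < 4; ++i)
                printf("%u ", swizzle[i]);
        printf("\n");

        return 0;
}
```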
@@ -1791,6 +1792,11 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
 
                         unsigned coord_mask = mask_of(instr->coord_components);
 
+                        bool flip_zw = (instr->sampler_dim == GLSL_SAMPLER_DIM_2D) && (coord_mask & (1 << COMPONENT_Z));
+
+                        if (flip_zw)
+                                coord_mask ^= ((1 << COMPONENT_Z) | (1 << COMPONENT_W));
+
                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                                 /* texelFetch is undefined on samplerCube */
                                 assert(midgard_texop != TEXTURE_OP_TEXEL_FETCH);
@@ -1813,6 +1819,10 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
                                 /* mov coord_temp, coords */
                                 midgard_instruction mov = v_mov(index, coords);
                                 mov.mask = coord_mask;
+
+                                if (flip_zw)
+                                        mov.swizzle[1][COMPONENT_W] = COMPONENT_Z;
+
                                 emit_mir_instruction(ctx, mov);
                         } else {
                                 coords = index;
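The Z/W flip covers 2D array textures whose coordinates pass through a temporary: NIR supplies the array layer in the z component, the hardware wants it in w, so the temp-coord mov reroutes z to w and the write mask has to move lanes accordingly. A standalone check of the mask arithmetic:

```c
#include <assert.h>

enum { COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W };

/* Replica of the coord_mask flip above: swap the z and w write lanes
 * when the 2D-array layer is rerouted through the temp coordinate. */
static unsigned
flip_zw_mask(unsigned coord_mask)
{
        return coord_mask ^ ((1 << COMPONENT_Z) | (1 << COMPONENT_W));
}

int main(void)
{
        /* (x, y, layer) arrives as .xyz == 0b0111; after the flip the
         * mask is .xyw == 0b1011, matching mov.swizzle[1][W] = Z. */
        assert(flip_zw_mask(0x7) == 0xB);
        return 0;
}
```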
@@ -1835,10 +1845,13 @@ emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
                         }
 
                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D) {
-                                /* Array component in w but NIR wants it in z */
+                                /* The array component lives in w but NIR
+                                 * supplies it in z; if we went through a
+                                 * temp coord we already fixed that up */
+
                                 if (nr_components == 3) {
                                         ins.swizzle[1][2] = COMPONENT_Z;
-                                        ins.swizzle[1][3] = COMPONENT_Z;
+                                        ins.swizzle[1][3] = needs_temp_coord ? COMPONENT_W : COMPONENT_Z;
                                 } else if (nr_components == 2) {
                                         ins.swizzle[1][2] =
                                                 instr->is_shadow ? COMPONENT_Z : COMPONENT_X;
@@ -2291,28 +2304,20 @@ midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block)
         return progress;
 }
 
-static void
+static unsigned
 emit_fragment_epilogue(compiler_context *ctx, unsigned rt)
 {
-        /* Include a move to specify the render target */
-
-        if (rt > 0) {
-                midgard_instruction rt_move = v_mov(SSA_FIXED_REGISTER(1),
-                                SSA_FIXED_REGISTER(1));
-                rt_move.mask = 1 << COMPONENT_Z;
-                rt_move.unit = UNIT_SADD;
-                emit_mir_instruction(ctx, rt_move);
-        }
-
         /* Loop to ourselves */
 
         struct midgard_instruction ins = v_branch(false, false);
         ins.writeout = true;
         ins.branch.target_block = ctx->block_count - 1;
+        ins.constants[0] = rt * 0x100;
         emit_mir_instruction(ctx, ins);
 
         ctx->current_block->epilogue = true;
         schedule_barrier(ctx);
+        return ins.branch.target_block;
 }
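The per-RT register move is gone; the render target index now rides on the writeout branch itself as an embedded constant. The rt * 0x100 factor suggests the index occupies the second byte of that constant word, an inference from this diff rather than documented behaviour:

```c
/* Assumed encoding, inferred from "rt * 0x100" above: render target
 * index in bits [15:8] of the branch constant word. */
static unsigned
writeout_constant(unsigned rt)
{
        return rt << 8; /* rt * 0x100 */
}
```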
 
 static midgard_block *
@@ -2564,6 +2569,36 @@ pan_format_from_glsl(const struct glsl_type *type)
                 MALI_NR_CHANNELS(4);
 }
 
+/* For each fragment writeout instruction, generate a writeout loop to
+ * associate with it */
+
+static void
+mir_add_writeout_loops(compiler_context *ctx)
+{
+        for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) {
+                midgard_instruction *br = ctx->writeout_branch[rt];
+                if (!br) continue;
+
+                unsigned popped = br->branch.target_block;
+                midgard_block_add_successor(mir_get_block(ctx, popped - 1), ctx->current_block);
+                br->branch.target_block = emit_fragment_epilogue(ctx, rt);
+
+                /* If we have more RTs, we'll need to branch back into the
+                 * shader after our loop terminates */
+
+                if ((rt + 1) < ARRAY_SIZE(ctx->writeout_branch) && ctx->writeout_branch[rt + 1]) {
+                        midgard_instruction uncond = v_branch(false, false);
+                        uncond.branch.target_block = popped;
+                        emit_mir_instruction(ctx, uncond);
+                        midgard_block_add_successor(ctx->current_block, mir_get_block(ctx, popped));
+                        schedule_barrier(ctx);
+                } else {
+                        /* We're last, so we can terminate here */
+                        br->last_writeout = true;
+                }
+        }
+}
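For a fragment shader writing two render targets, the pass above stitches together a control-flow shape roughly like the following (a schematic reading of the code, not actual compiler output):

```c
/*
 * Schematic CFG for two render targets:
 *
 *   body A:    ... shader code ...
 *              branch.writeout -> epilogue0    (writeout_branch[0])
 *   body B:    ... shader code ...
 *              branch.writeout -> epilogue1    (writeout_branch[1])
 *   epilogue0: branch -> epilogue0             (loop to ourselves)
 *              branch -> body B                (restore control flow)
 *   epilogue1: branch -> epilogue1             (last_writeout set)
 */
```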
+
 int
 midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend, unsigned blend_rt, unsigned gpu_id, bool shaderdb)
 {
@@ -2686,6 +2721,7 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
                         progress |= midgard_opt_fuse_dest_invert(ctx, block);
                         progress |= midgard_opt_csel_invert(ctx, block);
                         progress |= midgard_opt_drop_cmp_invert(ctx, block);
+                        progress |= midgard_opt_invert_branch(ctx, block);
                 }
         } while (progress);
 
@@ -2706,8 +2742,11 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
                 assert(!ins->invert);
         }
 
+        if (ctx->stage == MESA_SHADER_FRAGMENT)
+                mir_add_writeout_loops(ctx);
+
         /* Schedule! */
-        schedule_program(ctx);
+        midgard_schedule_program(ctx);
         mir_ra(ctx);
 
         /* Now that all the bundles are scheduled and we can calculate block
@@ -2842,22 +2881,14 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl
 
         /* Midgard prefetches instruction types, so during emission we
          * need to lookahead. Unless this is the last instruction, in
-         * which we return 1. Or if this is the second to last and the
-         * last is an ALU, then it's also 1... */
+         * which case we return 1. */
 
         mir_foreach_block(ctx, block) {
                 mir_foreach_bundle_in_block(block, bundle) {
                         int lookahead = 1;
 
-                        if (current_bundle + 1 < bundle_count) {
-                                uint8_t next = source_order_bundles[current_bundle + 1]->tag;
-
-                                if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) {
-                                        lookahead = 1;
-                                } else {
-                                        lookahead = next;
-                                }
-                        }
+                        if (!bundle->last_writeout && (current_bundle + 1 < bundle_count))
+                                lookahead = source_order_bundles[current_bundle + 1]->tag;
 
                         emit_binary_bundle(ctx, bundle, compiled, lookahead);
                         ++current_bundle;
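The simplified rule: each bundle is emitted carrying the tag of the bundle that follows it, so the hardware can prefetch the right instruction type, and the final bundle (or a bundle ending in a terminating writeout) reports 1 instead. Restated as a small helper, a condensed model of the code above rather than the actual interface:

```c
/* Condensed model of the lookahead rule: return the next bundle's tag
 * for prefetch, or 1 when there is nothing left to prefetch (last
 * bundle, or a terminating writeout). */
static int
bundle_lookahead(const midgard_bundle *bundle,
                 midgard_bundle **source_order_bundles,
                 unsigned current_bundle, unsigned bundle_count)
{
        if (bundle->last_writeout || current_bundle + 1 >= bundle_count)
                return 1;

        return source_order_bundles[current_bundle + 1]->tag;
}
```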