v3d: acquire scoreboard lock before first tlb read
[mesa.git] / src / broadcom / compiler / nir_to_vir.c
index 67747b14bb078e4a5fbdaea922e362aefe8ce3fc..4f12110ded198bc9d7a8ee3ad67e594bb50c96de 100644 (file)
@@ -122,6 +122,13 @@ vir_emit_thrsw(struct v3d_compile *c)
         c->last_thrsw = vir_NOP(c);
         c->last_thrsw->qpu.sig.thrsw = true;
         c->last_thrsw_at_top_level = !c->in_control_flow;
+
+        /* We need to lock the scoreboard before any tlb acess happens. If this
+         * thread switch comes after we have emitted a tlb load, then it means
+         * that we can't lock on the last thread switch any more.
+         */
+        if (c->emitted_tlb_load)
+                c->lock_scoreboard_on_first_thrsw = true;
 }
 
 static uint32_t
@@ -827,8 +834,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
         struct qreg result;
 
         switch (instr->op) {
-        case nir_op_fmov:
-        case nir_op_imov:
+        case nir_op_mov:
                 result = vir_MOV(c, src[0]);
                 break;
 
@@ -1310,7 +1316,7 @@ v3d_optimize_nir(struct nir_shader *s)
                 progress = false;
 
                 NIR_PASS_V(s, nir_lower_vars_to_ssa);
-                NIR_PASS(progress, s, nir_lower_alu_to_scalar);
+                NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL);
                 NIR_PASS(progress, s, nir_lower_phis_to_scalar);
                 NIR_PASS(progress, s, nir_copy_prop);
                 NIR_PASS(progress, s, nir_opt_remove_phis);
@@ -1454,6 +1460,26 @@ ntq_setup_vpm_inputs(struct v3d_compile *c)
         }
 }
 
+static bool
+var_needs_point_coord(struct v3d_compile *c, nir_variable *var)
+{
+        return (var->data.location == VARYING_SLOT_PNTC ||
+                (var->data.location >= VARYING_SLOT_VAR0 &&
+                 (c->fs_key->point_sprite_mask &
+                  (1 << (var->data.location - VARYING_SLOT_VAR0)))));
+}
+
+static bool
+program_reads_point_coord(struct v3d_compile *c)
+{
+        nir_foreach_variable(var, &c->s->inputs) {
+                if (var_needs_point_coord(c, var))
+                        return true;
+        }
+
+        return false;
+}
+
 static void
 ntq_setup_fs_inputs(struct v3d_compile *c)
 {
@@ -1487,11 +1513,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c)
 
                 if (var->data.location == VARYING_SLOT_POS) {
                         emit_fragcoord_input(c, loc);
-                } else if (var->data.location == VARYING_SLOT_PNTC ||
-                           (var->data.location >= VARYING_SLOT_VAR0 &&
-                            (c->fs_key->point_sprite_mask &
-                             (1 << (var->data.location -
-                                    VARYING_SLOT_VAR0))))) {
+                } else if (var_needs_point_coord(c, var)) {
                         c->inputs[loc * 4 + 0] = c->point_x;
                         c->inputs[loc * 4 + 1] = c->point_y;
                 } else {
@@ -1617,6 +1639,123 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
         }
 }
 
+static void
+vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+        assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+
+        int rt = nir_src_as_uint(instr->src[0]);
+        assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+        int sample_index = nir_intrinsic_base(instr) ;
+        assert(sample_index == 0); /* XXX: multisample */
+
+        int component = nir_intrinsic_component(instr);
+        assert(component < 4);
+
+        /* We need to emit our TLB reads after we have acquired the scoreboard
+         * lock, or the GPU will hang. Usually, we do our scoreboard locking on
+         * the last thread switch to improve parallelism, however, that is only
+         * guaranteed to happen before the tlb color writes.
+         *
+         * To fix that, we make sure we always emit a thread switch before the
+         * first tlb color read. If that happens to be the last thread switch
+         * we emit, then everything is fine, but otherwsie, if any code after
+         * this point needs to emit additional thread switches, then we will
+         * switch the strategy to locking the scoreboard on the first thread
+         * switch instead -- see vir_emit_thrsw().
+         */
+        if (!c->emitted_tlb_load) {
+                if (!c->last_thrsw_at_top_level) {
+                        assert(c->devinfo->ver >= 41);
+                        vir_emit_thrsw(c);
+                }
+
+                c->emitted_tlb_load = true;
+        }
+
+        struct qreg *color_reads =
+                &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
+
+        if (color_reads[component].file == QFILE_NULL) {
+                enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
+                int num_components =
+                        util_format_get_nr_components(rt_format);
+
+                const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
+                if (swap_rb)
+                        num_components = MAX2(num_components, 3);
+
+                bool is_int_format = (c->fs_key->int_color_rb & (1 << rt)) ||
+                                     (c->fs_key->uint_color_rb & (1 << rt));
+
+                bool is_32b_tlb_format = is_int_format ||
+                                         (c->fs_key->f32_color_rb & (1 << rt));
+
+                uint32_t conf = 0xffffff00;
+                conf |= TLB_SAMPLE_MODE_PER_PIXEL; /* XXX: multisample */
+                conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
+
+                if (is_32b_tlb_format) {
+                        /* The F32 vs I32 distinction was dropped in 4.2. */
+                        conf |= (c->devinfo->ver < 42 && is_int_format) ?
+                                TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
+
+                        conf |= ((num_components - 1) <<
+                                 TLB_VEC_SIZE_MINUS_1_SHIFT);
+                } else {
+                        conf |= TLB_TYPE_F16_COLOR;
+                        conf |= TLB_F16_SWAP_HI_LO;
+
+                        if (num_components >= 3)
+                                conf |= TLB_VEC_SIZE_4_F16;
+                        else
+                                conf |= TLB_VEC_SIZE_2_F16;
+                }
+
+                struct qreg r, g, b, a;
+                if (is_32b_tlb_format) {
+                        r = conf != 0xffffffff ? vir_TLBU_COLOR_READ(c, conf) :
+                                                 vir_TLB_COLOR_READ(c);
+                        if (num_components >= 2)
+                                g = vir_TLB_COLOR_READ(c);
+                        if (num_components >= 3)
+                                b = vir_TLB_COLOR_READ(c);
+                        if (num_components >= 4)
+                                a = vir_TLB_COLOR_READ(c);
+                } else {
+                        struct qreg rg = conf != 0xffffffff ?
+                                vir_TLBU_COLOR_READ(c, conf) :
+                                vir_TLB_COLOR_READ(c);
+                        r = vir_FMOV(c, rg);
+                        vir_set_unpack(c->defs[r.index], 0, V3D_QPU_UNPACK_L);
+                        g = vir_FMOV(c, rg);
+                        vir_set_unpack(c->defs[g.index], 0, V3D_QPU_UNPACK_H);
+
+                        if (num_components > 2) {
+                            struct qreg ba = vir_TLB_COLOR_READ(c);
+                            b = vir_FMOV(c, ba);
+                            vir_set_unpack(c->defs[b.index], 0,
+                                           V3D_QPU_UNPACK_L);
+                            a = vir_FMOV(c, ba);
+                            vir_set_unpack(c->defs[a.index], 0,
+                                           V3D_QPU_UNPACK_H);
+                        }
+                }
+
+                color_reads[0] = swap_rb ? b : r;
+                if (num_components >= 2)
+                        color_reads[1] = g;
+                if (num_components >= 3)
+                        color_reads[2] = swap_rb ? r : b;
+                if (num_components >= 4)
+                        color_reads[3] = a;
+        }
+
+        assert(color_reads[component].file != QFILE_NULL);
+        ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, color_reads[component]));
+}
+
 static void
 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
@@ -1759,6 +1898,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
                 break;
 
+        case nir_intrinsic_load_tlb_color_v3d:
+                vir_emit_tlb_color_read(c, instr);
+                break;
+
         case nir_intrinsic_load_input:
                 /* Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset)
                  * and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR.
@@ -2000,7 +2143,6 @@ ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
 
                 /* Emit the else block. */
                 vir_set_emit_block(c, else_block);
-                ntq_activate_execute_for_block(c);
                 ntq_emit_cf_list(c, &if_stmt->else_list);
         }
 
@@ -2296,15 +2438,17 @@ nir_to_vir(struct v3d_compile *c)
                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
 
-                /* XXX perf: We could set the "disable implicit point/line
-                 * varyings" field in the shader record and not emit these, if
-                 * they're not going to be used.
+                /* V3D 4.x can disable implicit point coordinate varyings if
+                 * they are not used.
                  */
-                if (c->fs_key->is_points) {
+                if (c->fs_key->is_points &&
+                    (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
                         c->point_x = emit_fragment_varying(c, NULL, 0, 0);
                         c->point_y = emit_fragment_varying(c, NULL, 0, 0);
-                } else if (c->fs_key->is_lines) {
+                        c->uses_implicit_point_line_varyings = true;
+                } else if (c->fs_key->is_lines && c->devinfo->ver < 40) {
                         c->line_x = emit_fragment_varying(c, NULL, 0, 0);
+                        c->uses_implicit_point_line_varyings = true;
                 }
                 break;
         case MESA_SHADER_COMPUTE:
@@ -2376,13 +2520,13 @@ const nir_shader_compiler_options v3d_nir_options = {
         .lower_all_io_to_temps = true,
         .lower_extract_byte = true,
         .lower_extract_word = true,
-        .lower_bfm = true,
         .lower_bitfield_insert_to_shifts = true,
         .lower_bitfield_extract_to_shifts = true,
         .lower_bitfield_reverse = true,
         .lower_bit_count = true,
         .lower_cs_local_id_from_index = true,
         .lower_ffract = true,
+        .lower_fmod = true,
         .lower_pack_unorm_2x16 = true,
         .lower_pack_snorm_2x16 = true,
         .lower_pack_unorm_4x8 = true,
@@ -2403,6 +2547,7 @@ const nir_shader_compiler_options v3d_nir_options = {
         .lower_ldexp = true,
         .lower_mul_high = true,
         .lower_wpos_pntc = true,
+        .lower_rotate = true,
 };
 
 /**