v3d: acquire scoreboard lock before first tlb read
author: Iago Toral Quiroga <itoral@igalia.com>
Thu, 4 Jul 2019 10:22:40 +0000 (12:22 +0200)
committer: Iago Toral Quiroga <itoral@igalia.com>
Fri, 12 Jul 2019 07:16:38 +0000 (09:16 +0200)
Until now we have always been emitting our scoreboard locks on the last thread
switch to improve parallelism. We did this by emitting our last thread switch
right before our tlb writes at the very end of the program, where we know that
we are outside control flow.

Unfortunately, this strategy is not valid when we have tlb color reads too, as
these will happen before this point in the program and can happen inside
control flow.

To fix this we always emit a thread switch before the first tlb load and if we
see additional thread switches after that point, we change the strategy to lock
on the first thread switch.

v2: change the solution so it is expected to work in more scenarios (Eric).

Reviewed-by: Eric Anholt <eric@anholt.net>
src/broadcom/compiler/nir_to_vir.c
src/broadcom/compiler/v3d_compiler.h
src/broadcom/compiler/vir.c
src/gallium/drivers/v3d/v3dx_draw.c

index 75622338aa2063759b8947f90e690925a870123d..4f12110ded198bc9d7a8ee3ad67e594bb50c96de 100644 (file)
@@ -122,6 +122,13 @@ vir_emit_thrsw(struct v3d_compile *c)
         c->last_thrsw = vir_NOP(c);
         c->last_thrsw->qpu.sig.thrsw = true;
         c->last_thrsw_at_top_level = !c->in_control_flow;
         c->last_thrsw = vir_NOP(c);
         c->last_thrsw->qpu.sig.thrsw = true;
         c->last_thrsw_at_top_level = !c->in_control_flow;
+
+        /* We need to lock the scoreboard before any tlb access happens. If this
+         * thread switch comes after we have emitted a tlb load, then it means
+         * that we can't lock on the last thread switch any more.
+         */
+        if (c->emitted_tlb_load)
+                c->lock_scoreboard_on_first_thrsw = true;
 }
 
 static uint32_t
 }
 
 static uint32_t
@@ -1646,6 +1653,27 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
         int component = nir_intrinsic_component(instr);
         assert(component < 4);
 
         int component = nir_intrinsic_component(instr);
         assert(component < 4);
 
+        /* We need to emit our TLB reads after we have acquired the scoreboard
+         * lock, or the GPU will hang. Usually, we do our scoreboard locking on
+         * the last thread switch to improve parallelism, however, that is only
+         * guaranteed to happen before the tlb color writes.
+         *
+         * To fix that, we make sure we always emit a thread switch before the
+         * first tlb color read. If that happens to be the last thread switch
+         * we emit, then everything is fine, but otherwise, if any code after
+         * this point needs to emit additional thread switches, then we will
+         * switch the strategy to locking the scoreboard on the first thread
+         * switch instead -- see vir_emit_thrsw().
+         */
+        if (!c->emitted_tlb_load) {
+                if (!c->last_thrsw_at_top_level) {
+                        assert(c->devinfo->ver >= 41);
+                        vir_emit_thrsw(c);
+                }
+
+                c->emitted_tlb_load = true;
+        }
+
         struct qreg *color_reads =
                 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
 
         struct qreg *color_reads =
                 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
 
index 4cb37d770cf013bb3a09caad8c45c7aefe2804ff..67c7dd48d8c6fd628679bd56f03f339aa059cb9c 100644 (file)
@@ -634,6 +634,9 @@ struct v3d_compile {
         struct qinst *last_thrsw;
         bool last_thrsw_at_top_level;
 
         struct qinst *last_thrsw;
         bool last_thrsw_at_top_level;
 
+        bool emitted_tlb_load;
+        bool lock_scoreboard_on_first_thrsw;
+
         bool failed;
 };
 
         bool failed;
 };
 
@@ -700,6 +703,7 @@ struct v3d_fs_prog_data {
         bool disable_ez;
         bool uses_center_w;
         bool uses_implicit_point_line_varyings;
         bool disable_ez;
         bool uses_center_w;
         bool uses_implicit_point_line_varyings;
+        bool lock_scoreboard_on_first_thrsw;
 };
 
 struct v3d_compute_prog_data {
 };
 
 struct v3d_compute_prog_data {
index 8de582792b71533ab51ed214b09957b8265c5c4e..4f1ee605214a47699fbf77fdf783b1cf10094ccd 100644 (file)
@@ -692,6 +692,8 @@ v3d_fs_set_prog_data(struct v3d_compile *c,
         prog_data->uses_center_w = c->uses_center_w;
         prog_data->uses_implicit_point_line_varyings =
                 c->uses_implicit_point_line_varyings;
         prog_data->uses_center_w = c->uses_center_w;
         prog_data->uses_implicit_point_line_varyings =
                 c->uses_implicit_point_line_varyings;
+        prog_data->lock_scoreboard_on_first_thrsw =
+                c->lock_scoreboard_on_first_thrsw;
 }
 
 static void
 }
 
 static void
index 0d23ac6bb2d471209c9472c48fb8f9666c87d38e..744d0c9c2119744536963dc517924e1f64202629 100644 (file)
@@ -373,6 +373,8 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
                         v3d->prog.fs->prog_data.fs->uses_center_w;
 
 #if V3D_VERSION >= 40
                         v3d->prog.fs->prog_data.fs->uses_center_w;
 
 #if V3D_VERSION >= 40
+               shader.do_scoreboard_wait_on_first_thread_switch =
+                        v3d->prog.fs->prog_data.fs->lock_scoreboard_on_first_thrsw;
                shader.disable_implicit_point_line_varyings =
                         !v3d->prog.fs->prog_data.fs->uses_implicit_point_line_varyings;
 #endif
                shader.disable_implicit_point_line_varyings =
                         !v3d->prog.fs->prog_data.fs->uses_implicit_point_line_varyings;
 #endif