v3d: Add SSBO/atomic counters support.
author     Eric Anholt <eric@anholt.net>    Mon, 11 Dec 2017 01:11:25 +0000 (17:11 -0800)
committer  Eric Anholt <eric@anholt.net>    Mon, 14 Jan 2019 23:40:55 +0000 (15:40 -0800)
So far I assume that all the buffers get written.  If they weren't, you'd
probably be using UBOs instead.

src/broadcom/compiler/nir_to_vir.c
src/broadcom/compiler/v3d_compiler.h
src/broadcom/compiler/vir_dump.c
src/gallium/drivers/v3d/v3d_context.c
src/gallium/drivers/v3d/v3d_context.h
src/gallium/drivers/v3d/v3d_screen.c
src/gallium/drivers/v3d/v3d_uniforms.c
src/gallium/drivers/v3d/v3dx_draw.c
src/gallium/drivers/v3d/v3dx_state.c

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 2e7b1e8e8a2c41671a39dba7a87c49be1e9bed79..b8e39f357f71db279420544f73d0731354e07453 100644
@@ -107,16 +107,89 @@ vir_emit_thrsw(struct v3d_compile *c)
         c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
 }
 
+static uint32_t
+v3d_general_tmu_op(nir_intrinsic_instr *instr)
+{
+        switch (instr->intrinsic) {
+        case nir_intrinsic_load_ssbo:
+        case nir_intrinsic_load_ubo:
+        case nir_intrinsic_load_uniform:
+                return GENERAL_TMU_READ_OP_READ;
+        case nir_intrinsic_store_ssbo:
+                return GENERAL_TMU_WRITE_OP_WRITE;
+        case nir_intrinsic_ssbo_atomic_add:
+                return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
+        case nir_intrinsic_ssbo_atomic_imin:
+                return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
+        case nir_intrinsic_ssbo_atomic_umin:
+                return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
+        case nir_intrinsic_ssbo_atomic_imax:
+                return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
+        case nir_intrinsic_ssbo_atomic_umax:
+                return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
+        case nir_intrinsic_ssbo_atomic_and:
+                return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
+        case nir_intrinsic_ssbo_atomic_or:
+                return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
+        case nir_intrinsic_ssbo_atomic_xor:
+                return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
+        case nir_intrinsic_ssbo_atomic_exchange:
+                return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
+        case nir_intrinsic_ssbo_atomic_comp_swap:
+                return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
+        default:
+                unreachable("unknown intrinsic op");
+        }
+}
+
 /**
- * Implements indirect uniform loads through the TMU general memory access
- * interface.
+ * Implements indirect uniform loads and SSBO accesses through the TMU general
+ * memory access interface.
  */
 static void
 ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
-        uint32_t tmu_op = GENERAL_TMU_READ_OP_READ;
-        bool has_index = instr->intrinsic == nir_intrinsic_load_ubo;
-        int offset_src = 0 + has_index;
+        /* XXX perf: We should turn add/sub of 1 into inc/dec.  Perhaps NIR
+         * wants to have support for inc/dec?
+         */
+
+        uint32_t tmu_op = v3d_general_tmu_op(instr);
+        bool is_store = instr->intrinsic == nir_intrinsic_store_ssbo;
+
+        int offset_src;
+        int tmu_writes = 1; /* address */
+        if (instr->intrinsic == nir_intrinsic_load_uniform) {
+                offset_src = 0;
+        } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
+                   instr->intrinsic == nir_intrinsic_load_ubo) {
+                offset_src = 1;
+        } else if (is_store) {
+                offset_src = 2;
+                for (int i = 0; i < instr->num_components; i++) {
+                        vir_MOV_dest(c,
+                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+                                     ntq_get_src(c, instr->src[0], i));
+                        tmu_writes++;
+                }
+        } else {
+                offset_src = 1;
+                vir_MOV_dest(c,
+                             vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+                             ntq_get_src(c, instr->src[2], 0));
+                tmu_writes++;
+                if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
+                        vir_MOV_dest(c,
+                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+                                     ntq_get_src(c, instr->src[3], 0));
+                        tmu_writes++;
+                }
+        }
+
+        /* Make sure we won't exceed the 16-entry TMU FIFO if every thread
+         * is storing at the same time.
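+         *
+         * For example, at four threads each thread may only have 16 / 4 = 4
+         * outstanding TMU writes, so a vec4 SSBO store (4 TMUD data writes
+         * plus the TMUA address write, 5 total) has to drop to two threads.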
+         */
+        while (tmu_writes > 16 / c->threads)
+                c->threads /= 2;
 
         struct qreg offset;
         if (instr->intrinsic == nir_intrinsic_load_uniform) {
@@ -149,12 +222,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
 
                 if (base != 0)
                         offset = vir_ADD(c, offset, vir_uniform_ui(c, base));
-        } else {
+        } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
                 /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
                  * 1 (0 is gallium's constant buffer 0).
                  */
                 offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
                                      nir_src_as_uint(instr->src[0]) + 1);
+        } else {
+                offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
+                                     nir_src_as_uint(instr->src[is_store ?
+                                                                1 : 0]));
         }
 
         uint32_t config = (0xffffff00 |
@@ -167,6 +244,9 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
                            instr->num_components - 2);
         }
 
+        if (c->execute.file != QFILE_NULL)
+                vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+
         struct qreg dest;
         if (config == ~0)
                 dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
@@ -188,10 +268,17 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
                         vir_uniform_ui(c, config);
         }
 
+        if (c->execute.file != QFILE_NULL)
+                vir_set_cond(tmu, V3D_QPU_COND_IFA);
+
         vir_emit_thrsw(c);
 
+        /* Read the result, or wait for the TMU op to complete. */
         for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
                 ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
+
+        if (nir_intrinsic_dest_components(instr) == 0)
+                vir_TMUWT(c);
 }
 
 static struct qreg *
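As a rough sketch, a two-component SSBO store now flows through
ntq_emit_tmu_general() like this, expressed with the helpers above
(data_x/data_y stand in for the store's source values; the instruction that
writes the address falls outside the hunks shown):

        /* One TMUD write per component of the store data. */
        vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), data_x);
        vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), data_y);

        /* The write of the offset to TMUA (or TMUAU, when a non-default
         * config has to ride along) fires the TMU op.  Under non-uniform
         * control flow it is predicated with V3D_QPU_COND_IFA so inactive
         * channels don't store.
         */

        /* Switch threads so other invocations hide the TMU latency, then
         * wait: a store has no destination components to LDTMU back.
         */
        vir_emit_thrsw(c);
        vir_TMUWT(c);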
@@ -1549,6 +1636,9 @@ ntq_setup_uniforms(struct v3d_compile *c)
                                                                  false);
                 unsigned vec4_size = 4 * sizeof(float);
 
+                if (var->data.mode != nir_var_uniform)
+                        continue;
+
                 declare_uniform_range(c, var->data.driver_location * vec4_size,
                                       vec4_count * vec4_size);
 
@@ -1629,6 +1719,27 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 ntq_emit_tmu_general(c, instr);
                 break;
 
+        case nir_intrinsic_ssbo_atomic_add:
+        case nir_intrinsic_ssbo_atomic_imin:
+        case nir_intrinsic_ssbo_atomic_umin:
+        case nir_intrinsic_ssbo_atomic_imax:
+        case nir_intrinsic_ssbo_atomic_umax:
+        case nir_intrinsic_ssbo_atomic_and:
+        case nir_intrinsic_ssbo_atomic_or:
+        case nir_intrinsic_ssbo_atomic_xor:
+        case nir_intrinsic_ssbo_atomic_exchange:
+        case nir_intrinsic_ssbo_atomic_comp_swap:
+        case nir_intrinsic_load_ssbo:
+        case nir_intrinsic_store_ssbo:
+                ntq_emit_tmu_general(c, instr);
+                break;
+
+        case nir_intrinsic_get_buffer_size:
+                ntq_store_dest(c, &instr->dest, 0,
+                               vir_uniform(c, QUNIFORM_GET_BUFFER_SIZE,
+                                           nir_src_as_uint(instr->src[0])));
+                break;
+
         case nir_intrinsic_load_user_clip_plane:
                 for (int i = 0; i < instr->num_components; i++) {
                         ntq_store_dest(c, &instr->dest, i,
@@ -1732,6 +1843,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;
         }
 
+        case nir_intrinsic_memory_barrier:
+        case nir_intrinsic_memory_barrier_atomic_counter:
+        case nir_intrinsic_memory_barrier_buffer:
+                /* We don't do any instruction scheduling of these NIR
+                 * instructions between each other, so we just need to make
+                 * sure that the TMU operations before the barrier are flushed
+                 * before the ones after the barrier.  That is currently
+                 * handled by having a THRSW in each of them and an LDTMU
+                 * series or a TMUWT after.
+                 */
+                break;
+
         default:
                 fprintf(stderr, "Unknown intrinsic: ");
                 nir_print_instr(&instr->instr, stderr);
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 7068b9029be62c6664be2443ed478d529485eee1..a35a46c3316bebffd72b06849d63371a3b44d412 100644
@@ -243,6 +243,12 @@ enum quniform_contents {
         QUNIFORM_TEXRECT_SCALE_X,
         QUNIFORM_TEXRECT_SCALE_Y,
 
+        /* Returns the base offset of the SSBO given by the data value. */
+        QUNIFORM_SSBO_OFFSET,
+
+        /* Returns the size of the SSBO given by the data value. */
+        QUNIFORM_GET_BUFFER_SIZE,
+
         QUNIFORM_ALPHA_REF,
 
         /**
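For reference, the producer/consumer pairing for the two new uniforms, as
wired up elsewhere in this patch: the compiler side asks for the SSBO's base
address by index (ssbo_index stands in for the nir_src_as_uint() of the
buffer-index source), and the uniform upload path resolves it against the
buffer bound at draw time.

        /* Compiler side (nir_to_vir.c above): */
        offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, ssbo_index);

        /* Driver side (v3d_uniforms.c below): a relocation of the BO's
         * address plus the binding's offset, emitted into the indirect CL.
         */
        struct pipe_shader_buffer *sb = &v3d->ssbo[stage].sb[data];
        cl_aligned_reloc(&job->indirect, &uniforms,
                         v3d_resource(sb->buffer)->bo,
                         sb->buffer_offset);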
diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c
index 56960e6d7ab6f51e0cf6584b9abe77a94d221699..5bef6c6a42d115212fe52b91c9f7587e3bcbfb3a 100644
@@ -81,6 +81,14 @@ vir_dump_uniform(enum quniform_contents contents,
                 fprintf(stderr, "ubo[%d]", data);
                 break;
 
+        case QUNIFORM_SSBO_OFFSET:
+                fprintf(stderr, "ssbo[%d]", data);
+                break;
+
+        case QUNIFORM_GET_BUFFER_SIZE:
+                fprintf(stderr, "ssbo_size[%d]", data);
+                break;
+
         default:
                 if (quniform_contents_is_texture_p0(contents)) {
                         fprintf(stderr, "tex[%d].p0: 0x%08x",
diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c
index b9eaf7e67ec3fa3557be818b78649cef00a9ace6..104096d5248dd394190d2ce5f29d5b86420c24ae 100644
@@ -65,6 +65,16 @@ v3d_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
         }
 }
 
+static void
+v3d_memory_barrier(struct pipe_context *pctx, unsigned int flags)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+
+        /* We'd only need to flush the jobs writing to SSBOs/images, but
+         * for now we flush everything.
+         */
+        perf_debug("Flushing all jobs for glMemoryBarrier(), could do better");
+        v3d_flush(pctx);
+}
+
 static void
 v3d_set_debug_callback(struct pipe_context *pctx,
                        const struct pipe_debug_callback *cb)
@@ -172,6 +182,7 @@ v3d_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
         pctx->priv = priv;
         pctx->destroy = v3d_context_destroy;
         pctx->flush = v3d_pipe_flush;
+        pctx->memory_barrier = v3d_memory_barrier;
         pctx->set_debug_callback = v3d_set_debug_callback;
         pctx->invalidate_resource = v3d_invalidate_resource;
         pctx->get_sample_position = v3d_get_sample_position;
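For context, a hypothetical trigger from the API side (the GL entry point and
gallium flag are pre-existing, not added by this patch):

        /* App code: make shader storage writes visible to later draws. */
        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

        /* st/mesa translates that into the new hook, roughly:
         *   pctx->memory_barrier(pctx, PIPE_BARRIER_SHADER_BUFFER);
         */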
diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
index 25be07c243763f40e21b8afecbf8b41c641197ae..686ecfa80240a487cd30bdfc2a2568e87e8ce94e 100644
@@ -82,6 +82,7 @@ void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo);
 #define VC5_DIRTY_OQ            (1 << 28)
 #define VC5_DIRTY_CENTROID_FLAGS (1 << 29)
 #define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1 << 30)
+#define VC5_DIRTY_SSBO          (1 << 31)
 
 #define VC5_MAX_FS_INPUTS 64
 
@@ -203,6 +204,11 @@ struct v3d_streamout_stateobj {
         unsigned num_targets;
 };
 
+struct v3d_ssbo_stateobj {
+        struct pipe_shader_buffer sb[PIPE_MAX_SHADER_BUFFERS];
+        uint32_t enabled_mask;
+};
+
 /* Hash table key for v3d->jobs */
 struct v3d_job_key {
         struct pipe_surface *cbufs[4];
@@ -433,6 +439,7 @@ struct v3d_context {
         struct pipe_poly_stipple stipple;
         struct pipe_clip_state clip;
         struct pipe_viewport_state viewport;
+        struct v3d_ssbo_stateobj ssbo[PIPE_SHADER_TYPES];
         struct v3d_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
         struct v3d_texture_stateobj tex[PIPE_SHADER_TYPES];
         struct v3d_vertexbuf_stateobj vertexbuf;
diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c
index 5d949514d09217b80477f118592c6787a0b9ea64..dd5eda87e8f82bd62c250eaf922b43f93efbfe9d 100644
@@ -299,8 +299,11 @@ v3d_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
         case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
         case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
         case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
-        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
                 return VC5_MAX_TEXTURE_SAMPLERS;
+
+        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+                return PIPE_MAX_SHADER_BUFFERS;
+
         case PIPE_SHADER_CAP_PREFERRED_IR:
                 return PIPE_SHADER_IR_NIR;
         case PIPE_SHADER_CAP_SUPPORTED_IRS:
diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c
index bc680af0956284f2b0519c7c2207427405d3d242..5d907726349ce16fe2884dd991f416782e781b74 100644
@@ -276,6 +276,21 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_compiled_shader *shader,
                         }
                         break;
 
+                case QUNIFORM_SSBO_OFFSET: {
+                        struct pipe_shader_buffer *sb =
+                                &v3d->ssbo[stage].sb[data];
+
+                        cl_aligned_reloc(&job->indirect, &uniforms,
+                                         v3d_resource(sb->buffer)->bo,
+                                         sb->buffer_offset);
+                        break;
+                }
+
+                case QUNIFORM_GET_BUFFER_SIZE:
+                        cl_aligned_u32(&uniforms,
+                                       v3d->ssbo[stage].sb[data].buffer_size);
+                        break;
+
                 case QUNIFORM_TEXTURE_FIRST_LEVEL:
                         cl_aligned_f(&uniforms,
                                      texstate->textures[data]->u.tex.first_level);
@@ -362,6 +377,11 @@ v3d_set_shader_uniform_dirty_flags(struct v3d_compiled_shader *shader)
                         dirty |= VC5_DIRTY_FRAGTEX | VC5_DIRTY_VERTTEX;
                         break;
 
+                case QUNIFORM_SSBO_OFFSET:
+                case QUNIFORM_GET_BUFFER_SIZE:
+                        dirty |= VC5_DIRTY_SSBO;
+                        break;
+
                 case QUNIFORM_ALPHA_REF:
                         dirty |= VC5_DIRTY_ZSA;
                         break;
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
index 46e629d0c64f9639708887f02dbb93748ea4676c..7f111bbe75f4972381c8828546af99d7e410de95 100644
@@ -478,6 +478,17 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 job->submit.in_sync_bcl = v3d->out_sync;
         }
 
+        /* Mark SSBOs as being written.  We don't actually know which ones are
+         * read vs. written, so just assume the worst.
+         */
+        for (int s = 0; s < PIPE_SHADER_TYPES; s++) {
+                foreach_bit(i, v3d->ssbo[s].enabled_mask) {
+                        v3d_job_add_write_resource(job,
+                                                   v3d->ssbo[s].sb[i].buffer);
+                        job->tmu_dirty_rcl = true;
+                }
+        }
+
         /* Get space to emit our draw call into the BCL, using a branch to
          * jump to a new BO if necessary.
          */
diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
index 95a2dc30c8b1cc9983e5572bbb8cd1ecf26098f3..7ea5dabbe1f93ae4cb17dfdb335e6ae5ed2daf7b 100644
@@ -986,6 +986,53 @@ v3d_set_stream_output_targets(struct pipe_context *pctx,
         ctx->dirty |= VC5_DIRTY_STREAMOUT;
 }
 
+static void
+v3d_set_shader_buffers(struct pipe_context *pctx,
+                       enum pipe_shader_type shader,
+                       unsigned start, unsigned count,
+                       const struct pipe_shader_buffer *buffers)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_ssbo_stateobj *so = &v3d->ssbo[shader];
+        unsigned mask = 0;
+
+        if (buffers) {
+                for (unsigned i = 0; i < count; i++) {
+                        unsigned n = i + start;
+                        struct pipe_shader_buffer *buf = &so->sb[n];
+
+                        if ((buf->buffer == buffers[i].buffer) &&
+                            (buf->buffer_offset == buffers[i].buffer_offset) &&
+                            (buf->buffer_size == buffers[i].buffer_size))
+                                continue;
+
+                        mask |= 1 << n;
+
+                        buf->buffer_offset = buffers[i].buffer_offset;
+                        buf->buffer_size = buffers[i].buffer_size;
+                        pipe_resource_reference(&buf->buffer, buffers[i].buffer);
+
+                        if (buf->buffer)
+                                so->enabled_mask |= 1 << n;
+                        else
+                                so->enabled_mask &= ~(1 << n);
+                }
+        } else {
+                mask = ((1 << count) - 1) << start;
+
+                for (unsigned i = 0; i < count; i++) {
+                        unsigned n = i + start;
+                        struct pipe_shader_buffer *buf = &so->sb[n];
+
+                        pipe_resource_reference(&buf->buffer, NULL);
+                }
+
+                so->enabled_mask &= ~mask;
+        }
+
+        v3d->dirty |= VC5_DIRTY_SSBO;
+}
+
 void
 v3dX(state_init)(struct pipe_context *pctx)
 {
@@ -1025,6 +1072,8 @@ v3dX(state_init)(struct pipe_context *pctx)
         pctx->sampler_view_destroy = v3d_sampler_view_destroy;
         pctx->set_sampler_views = v3d_set_sampler_views;
 
+        pctx->set_shader_buffers = v3d_set_shader_buffers;
+
         pctx->create_stream_output_target = v3d_create_stream_output_target;
         pctx->stream_output_target_destroy = v3d_stream_output_target_destroy;
         pctx->set_stream_output_targets = v3d_set_stream_output_targets;
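A hypothetical caller sketch for the new hook (prsc stands in for an
already-created buffer pipe_resource, not part of this patch): bind one SSBO
to slot 0 of the fragment stage, then unbind the slot by passing a NULL
buffer list.

        struct pipe_shader_buffer buf = {
                .buffer = prsc,
                .buffer_offset = 0,
                .buffer_size = prsc->width0,
        };
        pctx->set_shader_buffers(pctx, PIPE_SHADER_FRAGMENT, 0, 1, &buf);

        /* NULL unbinds the whole [start, start + count) range and clears
         * the bits in enabled_mask.
         */
        pctx->set_shader_buffers(pctx, PIPE_SHADER_FRAGMENT, 0, 1, NULL);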