From 5932c2f0b9b56e6eeee87baa7b0b493227850f69 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Sun, 10 Dec 2017 17:11:25 -0800 Subject: [PATCH] v3d: Add SSBO/atomic counters support. So far I assume that all the buffers get written. If they weren't, you'd probably be using UBOs instead. --- src/broadcom/compiler/nir_to_vir.c | 135 +++++++++++++++++++++++-- src/broadcom/compiler/v3d_compiler.h | 6 ++ src/broadcom/compiler/vir_dump.c | 8 ++ src/gallium/drivers/v3d/v3d_context.c | 11 ++ src/gallium/drivers/v3d/v3d_context.h | 7 ++ src/gallium/drivers/v3d/v3d_screen.c | 5 +- src/gallium/drivers/v3d/v3d_uniforms.c | 20 ++++ src/gallium/drivers/v3d/v3dx_draw.c | 11 ++ src/gallium/drivers/v3d/v3dx_state.c | 49 +++++++++ 9 files changed, 245 insertions(+), 7 deletions(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 2e7b1e8e8a2..b8e39f357f7 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -107,16 +107,89 @@ vir_emit_thrsw(struct v3d_compile *c) c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL); } +static uint32_t +v3d_general_tmu_op(nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_uniform: + return GENERAL_TMU_READ_OP_READ; + case nir_intrinsic_store_ssbo: + return GENERAL_TMU_WRITE_OP_WRITE; + case nir_intrinsic_ssbo_atomic_add: + return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP; + case nir_intrinsic_ssbo_atomic_imin: + return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN; + case nir_intrinsic_ssbo_atomic_umin: + return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN; + case nir_intrinsic_ssbo_atomic_imax: + return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX; + case nir_intrinsic_ssbo_atomic_umax: + return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX; + case nir_intrinsic_ssbo_atomic_and: + return GENERAL_TMU_WRITE_OP_ATOMIC_AND; + case nir_intrinsic_ssbo_atomic_or: + return GENERAL_TMU_WRITE_OP_ATOMIC_OR; + case 
nir_intrinsic_ssbo_atomic_xor: + return GENERAL_TMU_WRITE_OP_ATOMIC_XOR; + case nir_intrinsic_ssbo_atomic_exchange: + return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG; + case nir_intrinsic_ssbo_atomic_comp_swap: + return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG; + default: + unreachable("unknown intrinsic op"); + } +} + /** - * Implements indirect uniform loads through the TMU general memory access - * interface. + * Implements indirect uniform loads and SSBO accesses through the TMU general + * memory access interface. */ static void ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr) { - uint32_t tmu_op = GENERAL_TMU_READ_OP_READ; - bool has_index = instr->intrinsic == nir_intrinsic_load_ubo; - int offset_src = 0 + has_index; + /* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR + * wants to have support for inc/dec? + */ + + uint32_t tmu_op = v3d_general_tmu_op(instr); + bool is_store = instr->intrinsic == nir_intrinsic_store_ssbo; + + int offset_src; + int tmu_writes = 1; /* address */ + if (instr->intrinsic == nir_intrinsic_load_uniform) { + offset_src = 0; + } else if (instr->intrinsic == nir_intrinsic_load_ssbo || + instr->intrinsic == nir_intrinsic_load_ubo) { + offset_src = 1; + } else if (is_store) { + offset_src = 2; + for (int i = 0; i < instr->num_components; i++) { + vir_MOV_dest(c, + vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), + ntq_get_src(c, instr->src[0], i)); + tmu_writes++; + } + } else { + offset_src = 1; + vir_MOV_dest(c, + vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), + ntq_get_src(c, instr->src[2], 0)); + tmu_writes++; + if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) { + vir_MOV_dest(c, + vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), + ntq_get_src(c, instr->src[3], 0)); + tmu_writes++; + } + } + + /* Make sure we won't exceed the 16-entry TMU fifo if each thread is + * storing at the same time. 
+ */ + while (tmu_writes > 16 / c->threads) + c->threads /= 2; struct qreg offset; if (instr->intrinsic == nir_intrinsic_load_uniform) { @@ -149,12 +222,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr) if (base != 0) offset = vir_ADD(c, offset, vir_uniform_ui(c, base)); - } else { + } else if (instr->intrinsic == nir_intrinsic_load_ubo) { /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by * 1 (0 is gallium's constant buffer 0). */ offset = vir_uniform(c, QUNIFORM_UBO_ADDR, nir_src_as_uint(instr->src[0]) + 1); + } else { + offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, + nir_src_as_uint(instr->src[is_store ? + 1 : 0])); } uint32_t config = (0xffffff00 | @@ -167,6 +244,9 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr) instr->num_components - 2); } + if (c->execute.file != QFILE_NULL) + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + struct qreg dest; if (config == ~0) dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA); @@ -188,10 +268,17 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_uniform_ui(c, config); } + if (c->execute.file != QFILE_NULL) + vir_set_cond(tmu, V3D_QPU_COND_IFA); + vir_emit_thrsw(c); + /* Read the result, or wait for the TMU op to complete. 
*/ for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c))); + + if (nir_intrinsic_dest_components(instr) == 0) + vir_TMUWT(c); } static struct qreg * @@ -1549,6 +1636,9 @@ ntq_setup_uniforms(struct v3d_compile *c) false); unsigned vec4_size = 4 * sizeof(float); + if (var->data.mode != nir_var_uniform) + continue; + declare_uniform_range(c, var->data.driver_location * vec4_size, vec4_count * vec4_size); @@ -1629,6 +1719,27 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_tmu_general(c, instr); break; + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_store_ssbo: + ntq_emit_tmu_general(c, instr); + break; + + case nir_intrinsic_get_buffer_size: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_GET_BUFFER_SIZE, + nir_src_as_uint(instr->src[0]))); + break; + case nir_intrinsic_load_user_clip_plane: for (int i = 0; i < instr->num_components; i++) { ntq_store_dest(c, &instr->dest, i, @@ -1732,6 +1843,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; } + case nir_intrinsic_memory_barrier: + case nir_intrinsic_memory_barrier_atomic_counter: + case nir_intrinsic_memory_barrier_buffer: + /* We don't do any instruction scheduling of these NIR + * instructions between each other, so we just need to make + * sure that the TMU operations before the barrier are flushed + * before the ones after the barrier. That is currently + * handled by having a THRSW in each of them and a LDTMU + * series or a TMUWT after. 
+ */ + break; + default: fprintf(stderr, "Unknown intrinsic: "); nir_print_instr(&instr->instr, stderr); diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 7068b9029be..a35a46c3316 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -243,6 +243,12 @@ enum quniform_contents { QUNIFORM_TEXRECT_SCALE_X, QUNIFORM_TEXRECT_SCALE_Y, + /* Returns the base offset of the SSBO given by the data value. */ + QUNIFORM_SSBO_OFFSET, + + /* Returns the size of the SSBO given by the data value. */ + QUNIFORM_GET_BUFFER_SIZE, + QUNIFORM_ALPHA_REF, /** diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c index 56960e6d7ab..5bef6c6a42d 100644 --- a/src/broadcom/compiler/vir_dump.c +++ b/src/broadcom/compiler/vir_dump.c @@ -81,6 +81,14 @@ vir_dump_uniform(enum quniform_contents contents, fprintf(stderr, "ubo[%d]", data); break; + case QUNIFORM_SSBO_OFFSET: + fprintf(stderr, "ssbo[%d]", data); + break; + + case QUNIFORM_GET_BUFFER_SIZE: + fprintf(stderr, "ssbo_size[%d]", data); + break; + default: if (quniform_contents_is_texture_p0(contents)) { fprintf(stderr, "tex[%d].p0: 0x%08x", diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c index b9eaf7e67ec..104096d5248 100644 --- a/src/gallium/drivers/v3d/v3d_context.c +++ b/src/gallium/drivers/v3d/v3d_context.c @@ -65,6 +65,16 @@ v3d_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, } } +static void +v3d_memory_barrier(struct pipe_context *pctx, unsigned int flags) +{ + struct v3d_context *v3d = v3d_context(pctx); + + /* We only need to flush jobs writing to SSBOs/images. 
*/ + perf_debug("Flushing all jobs for glMemoryBarrier(), could do better"); + v3d_flush(pctx); +} + static void v3d_set_debug_callback(struct pipe_context *pctx, const struct pipe_debug_callback *cb) @@ -172,6 +182,7 @@ v3d_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) pctx->priv = priv; pctx->destroy = v3d_context_destroy; pctx->flush = v3d_pipe_flush; + pctx->memory_barrier = v3d_memory_barrier; pctx->set_debug_callback = v3d_set_debug_callback; pctx->invalidate_resource = v3d_invalidate_resource; pctx->get_sample_position = v3d_get_sample_position; diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h index 25be07c2437..686ecfa8024 100644 --- a/src/gallium/drivers/v3d/v3d_context.h +++ b/src/gallium/drivers/v3d/v3d_context.h @@ -82,6 +82,7 @@ void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo); #define VC5_DIRTY_OQ (1 << 28) #define VC5_DIRTY_CENTROID_FLAGS (1 << 29) #define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1 << 30) +#define VC5_DIRTY_SSBO (1u << 31) #define VC5_MAX_FS_INPUTS 64 @@ -203,6 +204,11 @@ struct v3d_streamout_stateobj { unsigned num_targets; }; +struct v3d_ssbo_stateobj { + struct pipe_shader_buffer sb[PIPE_MAX_SHADER_BUFFERS]; + uint32_t enabled_mask; +}; + /* Hash table key for v3d->jobs */ struct v3d_job_key { struct pipe_surface *cbufs[4]; @@ -433,6 +439,7 @@ struct v3d_context { struct pipe_poly_stipple stipple; struct pipe_clip_state clip; struct pipe_viewport_state viewport; + struct v3d_ssbo_stateobj ssbo[PIPE_SHADER_TYPES]; struct v3d_constbuf_stateobj constbuf[PIPE_SHADER_TYPES]; struct v3d_texture_stateobj tex[PIPE_SHADER_TYPES]; struct v3d_vertexbuf_stateobj vertexbuf; diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c index 5d949514d09..dd5eda87e8f 100644 --- a/src/gallium/drivers/v3d/v3d_screen.c +++ b/src/gallium/drivers/v3d/v3d_screen.c @@ -299,8 +299,11 @@ v3d_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, 
case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: - case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return VC5_MAX_TEXTURE_SAMPLERS; + + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + return PIPE_MAX_SHADER_BUFFERS; + case PIPE_SHADER_CAP_PREFERRED_IR: return PIPE_SHADER_IR_NIR; case PIPE_SHADER_CAP_SUPPORTED_IRS: diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c index bc680af0956..5d907726349 100644 --- a/src/gallium/drivers/v3d/v3d_uniforms.c +++ b/src/gallium/drivers/v3d/v3d_uniforms.c @@ -276,6 +276,21 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_compiled_shader *shader, } break; + case QUNIFORM_SSBO_OFFSET: { + struct pipe_shader_buffer *sb = + &v3d->ssbo[stage].sb[data]; + + cl_aligned_reloc(&job->indirect, &uniforms, + v3d_resource(sb->buffer)->bo, + sb->buffer_offset); + break; + } + + case QUNIFORM_GET_BUFFER_SIZE: + cl_aligned_u32(&uniforms, + v3d->ssbo[stage].sb[data].buffer_size); + break; + case QUNIFORM_TEXTURE_FIRST_LEVEL: cl_aligned_f(&uniforms, texstate->textures[data]->u.tex.first_level); @@ -362,6 +377,11 @@ v3d_set_shader_uniform_dirty_flags(struct v3d_compiled_shader *shader) dirty |= VC5_DIRTY_FRAGTEX | VC5_DIRTY_VERTTEX; break; + case QUNIFORM_SSBO_OFFSET: + case QUNIFORM_GET_BUFFER_SIZE: + dirty |= VC5_DIRTY_SSBO; + break; + case QUNIFORM_ALPHA_REF: dirty |= VC5_DIRTY_ZSA; break; diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c index 46e629d0c64..7f111bbe75f 100644 --- a/src/gallium/drivers/v3d/v3dx_draw.c +++ b/src/gallium/drivers/v3d/v3dx_draw.c @@ -478,6 +478,17 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) job->submit.in_sync_bcl = v3d->out_sync; } + /* Mark SSBOs as being written. 
We don't actually know which ones are + * read vs written, so just assume the worst + */ + for (int s = 0; s < PIPE_SHADER_TYPES; s++) { + foreach_bit(i, v3d->ssbo[s].enabled_mask) { + v3d_job_add_write_resource(job, + v3d->ssbo[s].sb[i].buffer); + job->tmu_dirty_rcl = true; + } + } + /* Get space to emit our draw call into the BCL, using a branch to * jump to a new BO if necessary. */ diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c index 95a2dc30c8b..7ea5dabbe1f 100644 --- a/src/gallium/drivers/v3d/v3dx_state.c +++ b/src/gallium/drivers/v3d/v3dx_state.c @@ -986,6 +986,53 @@ v3d_set_stream_output_targets(struct pipe_context *pctx, ctx->dirty |= VC5_DIRTY_STREAMOUT; } +static void +v3d_set_shader_buffers(struct pipe_context *pctx, + enum pipe_shader_type shader, + unsigned start, unsigned count, + const struct pipe_shader_buffer *buffers) +{ + struct v3d_context *v3d = v3d_context(pctx); + struct v3d_ssbo_stateobj *so = &v3d->ssbo[shader]; + unsigned mask = 0; + + if (buffers) { + for (unsigned i = 0; i < count; i++) { + unsigned n = i + start; + struct pipe_shader_buffer *buf = &so->sb[n]; + + if ((buf->buffer == buffers[i].buffer) && + (buf->buffer_offset == buffers[i].buffer_offset) && + (buf->buffer_size == buffers[i].buffer_size)) + continue; + + mask |= 1u << n; + + buf->buffer_offset = buffers[i].buffer_offset; + buf->buffer_size = buffers[i].buffer_size; + pipe_resource_reference(&buf->buffer, buffers[i].buffer); + + if (buf->buffer) + so->enabled_mask |= 1u << n; + else + so->enabled_mask &= ~(1u << n); + } + } else { + mask = (count >= 32 ? ~0u : (1u << count) - 1) << start; + + for (unsigned i = 0; i < count; i++) { + unsigned n = i + start; + struct pipe_shader_buffer *buf = &so->sb[n]; + + pipe_resource_reference(&buf->buffer, NULL); + } + + so->enabled_mask &= ~mask; + } + + v3d->dirty |= VC5_DIRTY_SSBO; +} + void v3dX(state_init)(struct pipe_context *pctx) { @@ -1025,6 +1072,8 @@ v3dX(state_init)(struct pipe_context *pctx) 
pctx->sampler_view_destroy = v3d_sampler_view_destroy; pctx->set_sampler_views = v3d_set_sampler_views; + pctx->set_shader_buffers = v3d_set_shader_buffers; + pctx->create_stream_output_target = v3d_create_stream_output_target; pctx->stream_output_target_destroy = v3d_stream_output_target_destroy; pctx->set_stream_output_targets = v3d_set_stream_output_targets; -- 2.30.2