$(GENERATED_SOURCES) \
cik_sdma.c \
driinfo_radeonsi.h \
+ gfx10_query.c \
gfx10_shader_ngg.c \
si_blit.c \
si_buffer.c \
--- /dev/null
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stddef.h>
+
+#include "si_pipe.h"
+#include "si_query.h"
+#include "util/u_memory.h"
+#include "util/u_suballoc.h"
+#include "sid.h"
+
+/**
+ * The query buffer is written to by ESGS NGG shaders with statistics about
+ * generated and (streamout-)emitted primitives.
+ *
+ * The context maintains a ring of these query buffers, and queries simply
+ * point into the ring, allowing an arbitrary number of queries to be active
+ * without additional GPU cost.
+ */
+struct gfx10_sh_query_buffer {
+ struct list_head list;
+ struct si_resource *buf;
+ unsigned refcount;
+
+ /* Offset into the buffer in bytes; points at the first un-emitted entry. */
+ unsigned head;
+};
+
+/* Memory layout of the query buffer. Must be kept in sync with shaders
+ * (including QBO shaders) and should be aligned to cachelines.
+ *
+ * The somewhat awkward layout is required for compatibility with the
+ * SET_PREDICATION packet; this is also why the high bit of all the counter
+ * values is set unconditionally.
+ */
+struct gfx10_sh_query_buffer_mem {
+ struct {
+ uint64_t generated_primitives_start_dummy;
+ uint64_t emitted_primitives_start_dummy;
+ uint64_t generated_primitives;
+ uint64_t emitted_primitives;
+ } stream[4];
+ uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
+ uint32_t pad[31];
+};
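+
+/* For reference (derived from the layout above): each entry is 256 bytes,
+ * i.e. 32 uint64_t slots; stream[i].generated_primitives is at byte offset
+ * 32*i + 16, stream[i].emitted_primitives at 32*i + 24, and the fence dword
+ * at byte offset 128. The NGG shader updates and the initialization loop in
+ * gfx10_alloc_query_buffer() rely on these offsets.
+ */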
+
+/* Shader-based queries. */
+struct gfx10_sh_query {
+ struct si_query b;
+
+ struct gfx10_sh_query_buffer *first;
+ struct gfx10_sh_query_buffer *last;
+ unsigned first_begin;
+ unsigned last_end;
+
+ unsigned stream;
+};
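+
+/* A query covers the ring entries from byte offset first_begin within *first
+ * up to (but not including) byte offset last_end within *last; both result
+ * gathering paths below iterate over exactly this span.
+ */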
+
+static void emit_shader_query(struct si_context *sctx)
+{
+ assert(!LIST_IS_EMPTY(&sctx->shader_query_buffers));
+
+ struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
+}
+
+static void gfx10_release_query_buffers(struct si_context *sctx,
+ struct gfx10_sh_query_buffer *first,
+ struct gfx10_sh_query_buffer *last)
+{
+ while (first) {
+ struct gfx10_sh_query_buffer *qbuf = first;
+ if (first != last)
+ first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+ else
+ first = NULL;
+
+ qbuf->refcount--;
+ if (qbuf->refcount)
+ continue;
+
+ if (qbuf->list.next == &sctx->shader_query_buffers)
+ continue; /* keep the most recent buffer; it may not be full yet */
+ if (qbuf->list.prev == &sctx->shader_query_buffers)
+ continue; /* keep the oldest buffer for recycling */
+
+ LIST_DEL(&qbuf->list);
+ si_resource_reference(&qbuf->buf, NULL);
+ FREE(qbuf);
+ }
+}
+
+static bool gfx10_alloc_query_buffer(struct si_context *sctx)
+{
+ if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
+ return true;
+
+ struct gfx10_sh_query_buffer *qbuf = NULL;
+
+ if (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
+ qbuf = list_last_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
+ goto success;
+
+ qbuf = list_first_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ if (!qbuf->refcount &&
+ !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
+ sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
+ /* Can immediately re-use the oldest buffer */
+ LIST_DEL(&qbuf->list);
+ } else {
+ qbuf = NULL;
+ }
+ }
+
+ if (!qbuf) {
+ qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
+ if (unlikely(!qbuf))
+ return false;
+
+ struct si_screen *screen = sctx->screen;
+ unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem),
+ screen->info.min_alloc_size);
+ qbuf->buf = si_resource(
+ pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
+ if (unlikely(!qbuf->buf)) {
+ FREE(qbuf);
+ return false;
+ }
+ }
+
+ /* The buffer is currently unused by the GPU. Initialize it.
+ *
+ * We need to set the high bit of all the primitive counters for
+ * compatibility with the SET_PREDICATION packet.
+ */
+ uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
+ PIPE_TRANSFER_WRITE |
+ PIPE_TRANSFER_UNSYNCHRONIZED);
+ assert(results);
+
+ for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
+ i < e; ++i) {
+ for (unsigned j = 0; j < 16; ++j)
+ results[32 * i + j] = (uint64_t)1 << 63;
+ results[32 * i + 16] = 0;
+ }
+
+ LIST_ADDTAIL(&qbuf->list, &sctx->shader_query_buffers);
+ qbuf->head = 0;
+ qbuf->refcount = sctx->num_active_shader_queries;
+
+success:;
+ struct pipe_shader_buffer sbuf;
+ sbuf.buffer = &qbuf->buf->b.b;
+ sbuf.buffer_offset = qbuf->head;
+ sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
+ si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
+
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
+ return true;
+}
+
+static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
+{
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+ gfx10_release_query_buffers(sctx, query->first, query->last);
+ FREE(query);
+}
+
+static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
+{
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+ gfx10_release_query_buffers(sctx, query->first, query->last);
+ query->first = query->last = NULL;
+
+ if (unlikely(!gfx10_alloc_query_buffer(sctx)))
+ return false;
+
+ query->first = list_last_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ query->first_begin = query->first->head;
+
+ sctx->num_active_shader_queries++;
+ query->first->refcount++;
+
+ return true;
+}
+
+static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
+{
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+ if (unlikely(!query->first))
+ return false; /* earlier out of memory error */
+
+ query->last = list_last_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ query->last_end = query->last->head;
+
+ /* Signal the fence of the previous chunk */
+ if (query->last_end != 0) {
+ uint64_t fence_va = query->last->buf->gpu_address;
+ fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
+ fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+ si_cp_release_mem(sctx, sctx->gfx_cs,
+ V_028A90_BOTTOM_OF_PIPE_TS, 0,
+ EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+ EOP_DATA_SEL_VALUE_32BIT,
+ query->last->buf, fence_va, 0xffffffff,
+ PIPE_QUERY_GPU_FINISHED);
+ }
+
+ sctx->num_active_shader_queries--;
+
+ if (sctx->num_active_shader_queries > 0) {
+ gfx10_alloc_query_buffer(sctx);
+ } else {
+ si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
+
+ /* If a query_begin is followed by a query_end without a draw
+ * in-between, we need to clear the atom to ensure that the
+ * next query_begin will re-initialize the shader buffer. */
+ si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
+ }
+
+ return true;
+}
+
+static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
+ struct gfx10_sh_query_buffer_mem *qmem,
+ union pipe_query_result *result)
+{
+ static const uint64_t mask = ((uint64_t)1 << 63) - 1;
+
+ switch (query->b.type) {
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
+ break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ result->u64 += qmem->stream[query->stream].generated_primitives & mask;
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ result->so_statistics.num_primitives_written +=
+ qmem->stream[query->stream].emitted_primitives & mask;
+ result->so_statistics.primitives_storage_needed +=
+ qmem->stream[query->stream].generated_primitives & mask;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ result->b |= qmem->stream[query->stream].emitted_primitives !=
+ qmem->stream[query->stream].generated_primitives;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+ result->b |= qmem->stream[stream].emitted_primitives !=
+ qmem->stream[stream].generated_primitives;
+ }
+ break;
+ default:
+ assert(0);
+ }
+}
+
+static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
+ bool wait, union pipe_query_result *result)
+{
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+ util_query_clear_result(result, query->b.type);
+
+ if (unlikely(!query->first))
+ return false; /* earlier out of memory error */
+ assert(query->last);
+
+ for (struct gfx10_sh_query_buffer *qbuf = query->last;;
+ qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
+ unsigned usage = PIPE_TRANSFER_READ |
+ (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+ void *map;
+
+ if (rquery->b.flushed)
+ map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+ else
+ map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+ if (!map)
+ return false;
+
+ unsigned results_begin = 0;
+ unsigned results_end = qbuf->head;
+ if (qbuf == query->first)
+ results_begin = query->first_begin;
+ if (qbuf == query->last)
+ results_end = query->last_end;
+
+ while (results_begin != results_end) {
+ struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
+ results_begin += sizeof(*qmem);
+
+ gfx10_sh_query_add_result(query, qmem, result);
+ }
+
+ if (qbuf == query->first)
+ break;
+ }
+
+ return true;
+}
+
+static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
+ struct si_query *rquery,
+ bool wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset)
+{
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+ struct si_qbo_state saved_state = {};
+ struct pipe_resource *tmp_buffer = NULL;
+ unsigned tmp_buffer_offset = 0;
+
+ if (!sctx->sh_query_result_shader) {
+ sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
+ if (!sctx->sh_query_result_shader)
+ return;
+ }
+
+ if (query->first != query->last) {
+ u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
+ &tmp_buffer_offset, &tmp_buffer);
+ if (!tmp_buffer)
+ return;
+ }
+
+ si_save_qbo_state(sctx, &saved_state);
+
+ /* Pre-fill the constants configuring the shader behavior. */
+ struct {
+ uint32_t config;
+ uint32_t offset;
+ uint32_t chain;
+ uint32_t result_count;
+ } consts;
+ struct pipe_constant_buffer constant_buffer = {};
+
+ if (index >= 0) {
+ switch (query->b.type) {
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ consts.offset = sizeof(uint32_t) * query->stream;
+ consts.config = 0;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ consts.offset = sizeof(uint32_t) * (4 + query->stream);
+ consts.config = 0;
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
+ consts.config = 0;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ consts.offset = sizeof(uint32_t) * query->stream;
+ consts.config = 2;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ consts.offset = 0;
+ consts.config = 3;
+ break;
+ default: unreachable("bad query type");
+ }
+ } else {
+ /* Check result availability. */
+ consts.offset = 0;
+ consts.config = 1;
+ }
+
+ if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
+ consts.config |= 8;
+
+ constant_buffer.buffer_size = sizeof(consts);
+ constant_buffer.user_buffer = &consts;
+
+ /* Pre-fill the SSBOs and grid. */
+ struct pipe_shader_buffer ssbo[3];
+ struct pipe_grid_info grid = {};
+
+ ssbo[1].buffer = tmp_buffer;
+ ssbo[1].buffer_offset = tmp_buffer_offset;
+ ssbo[1].buffer_size = 16;
+
+ ssbo[2] = ssbo[1];
+
+ sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);
+
+ grid.block[0] = 1;
+ grid.block[1] = 1;
+ grid.block[2] = 1;
+ grid.grid[0] = 1;
+ grid.grid[1] = 1;
+ grid.grid[2] = 1;
+
+ struct gfx10_sh_query_buffer *qbuf = query->first;
+ for (;;) {
+ unsigned begin = qbuf == query->first ? query->first_begin : 0;
+ unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
+ /* end == 0 can only happen for the last buffer (freshly allocated with
+ * no draws recorded yet); dispatch anyway with result_count == 0 so
+ * that the accumulated result is still written to the destination. */
+
+ ssbo[0].buffer = &qbuf->buf->b.b;
+ ssbo[0].buffer_offset = begin;
+ ssbo[0].buffer_size = end - begin;
+
+ consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
+ consts.chain = 0;
+ if (qbuf != query->first)
+ consts.chain |= 1;
+ if (qbuf != query->last)
+ consts.chain |= 2;
+
+ if (qbuf == query->last) {
+ ssbo[2].buffer = resource;
+ ssbo[2].buffer_offset = offset;
+ ssbo[2].buffer_size = 8;
+ }
+
+ sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+ sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);
+
+ if (wait) {
+ uint64_t va;
+
+ /* Wait for result availability. Wait only for readiness
+ * of the last entry, since the fence writes should be
+ * serialized in the CP.
+ */
+ va = qbuf->buf->gpu_address;
+ va += end - sizeof(struct gfx10_sh_query_buffer_mem);
+ va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+
+ si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
+ }
+
+ sctx->b.launch_grid(&sctx->b, &grid);
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+ if (qbuf == query->last)
+ break;
+ qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+ }
+
+ si_restore_qbo_state(sctx, &saved_state);
+ pipe_resource_reference(&tmp_buffer, NULL);
+}
+
+static const struct si_query_ops gfx10_sh_query_ops = {
+ .destroy = gfx10_sh_query_destroy,
+ .begin = gfx10_sh_query_begin,
+ .end = gfx10_sh_query_end,
+ .get_result = gfx10_sh_query_get_result,
+ .get_result_resource = gfx10_sh_query_get_result_resource,
+};
+
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
+ enum pipe_query_type query_type,
+ unsigned index)
+{
+ struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
+ if (unlikely(!query))
+ return NULL;
+
+ query->b.ops = &gfx10_sh_query_ops;
+ query->b.type = query_type;
+ query->stream = index;
+
+ return (struct pipe_query *)query;
+}
+
+void gfx10_init_query(struct si_context *sctx)
+{
+ LIST_INITHEAD(&sctx->shader_query_buffers);
+ sctx->atoms.s.shader_query.emit = emit_shader_query;
+}
+
+void gfx10_destroy_query(struct si_context *sctx)
+{
+ while (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
+ struct gfx10_sh_query_buffer *qbuf =
+ list_first_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ LIST_DEL(&qbuf->list);
+
+ assert(!qbuf->refcount);
+ si_resource_reference(&qbuf->buf, NULL);
+ FREE(qbuf);
+ }
+}
false);
}
+static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
+{
+ LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
+ ctx->param_rw_buffers);
+
+ return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
+ LLVMConstInt(ctx->i32, GFX10_GS_QUERY_BUF, false));
+}
+
/* Send GS Alloc Req message from the first wave of the group to SPI.
* Message payload is:
* - bits 0..10: vertices in group
build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
+ /* Update the query buffer: a single lane of the first wave atomically adds
+ * the threadgroup's primitive count to stream[0].generated_primitives (byte
+ * offset 16) of the currently bound query buffer entry. */
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5030);
+ tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5031);
+ {
+ LLVMValueRef args[] = {
+ ngg_get_prim_cnt(ctx),
+ ngg_get_query_buf(ctx),
+ LLVMConstInt(ctx->i32, 16, false), /* offset of stream[0].generated_primitives */
+ ctx->i32_0, /* soffset */
+ ctx->i32_0, /* cachepolicy */
+ };
+
+ /* TODO: should this use a 64-bit atomic? */
+ ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
+ ctx->i32, args, 5, 0);
+ }
+ ac_build_endif(&ctx->ac, 5031);
+ ac_build_endif(&ctx->ac, 5030);
+
/* Export primitive data to the index buffer. Format is:
* - bits 0..8: index 0
* - bit 9: edge flag 0
tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
LLVMBuildStore(builder, tmp, primflagptr);
+ tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+ tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
+ LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
+
lp_build_endif(&if_state);
}
+void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
+{
+ /* Zero out the part of LDS scratch that is used to accumulate the
+ * per-stream generated primitive count.
+ */
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
+ LLVMValueRef tid = get_thread_id_in_tg(ctx);
+ LLVMValueRef tmp;
+
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->i32, 4, false), "");
+ ac_build_ifcc(&ctx->ac, tmp, 5090);
+ {
+ LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
+ LLVMBuildStore(builder, ctx->i32_0, ptr);
+ }
+ ac_build_endif(&ctx->ac, 5090);
+
+ ac_build_s_barrier(&ctx->ac);
+}
+
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
{
const struct si_shader_selector *sel = ctx->shader->selector;
ac_build_endloop(&ctx->ac, 5100);
}
+ /* Accumulate generated primitives counts across the entire threadgroup. */
+ for (unsigned stream = 0; stream < 4; ++stream) {
+ if (!info->num_stream_output_components[stream])
+ continue;
+
+ LLVMValueRef numprims =
+ LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+ numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, 64);
+
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->i32_0, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5105);
+ {
+ LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
+ ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
+ LLVMConstInt(ctx->i32, stream, false)),
+ numprims, LLVMAtomicOrderingMonotonic, false);
+ }
+ ac_build_endif(&ctx->ac, 5105);
+ }
+
lp_build_endif(&ctx->merged_wrap_if_state);
ac_build_s_barrier(&ctx->ac);
/* TODO: streamout */
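+ /* Threads 0..3 (one per vertex stream) add the generated-primitive totals
+ * accumulated in LDS scratch to the query buffer; stream i is updated at
+ * byte offset 32*i + 16 (offset 32*i plus soffset 16). */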
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->i32, 4, false), "");
+ ac_build_ifcc(&ctx->ac, tmp, 5110);
+ {
+ LLVMValueRef offset;
+ tmp = tid;
+ if (sel->so.num_outputs)
+ tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->i32, 3, false), "");
+ offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 32, false), "");
+ if (sel->so.num_outputs) {
+ tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->i32, 2, false), "");
+ tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 8, false), "");
+ offset = LLVMBuildAdd(builder, offset, tmp, "");
+ }
+
+ tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
+ LLVMValueRef args[] = {
+ tmp,
+ ngg_get_query_buf(ctx),
+ offset,
+ LLVMConstInt(ctx->i32, 16, false), /* soffset */
+ ctx->i32_0, /* cachepolicy */
+ };
+ ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
+ ctx->i32, args, 5, 0);
+ }
+ ac_build_endif(&ctx->ac, 5110);
+
/* TODO: culling */
/* Determine vertex liveness. */
files_libradeonsi = files(
'cik_sdma.c',
'driinfo_radeonsi.h',
+ 'gfx10_query.c',
'gfx10_shader_ngg.c',
'si_blit.c',
'si_buffer.c',
si_release_all_descriptors(sctx);
+ if (sctx->chip_class >= GFX10)
+ gfx10_destroy_query(sctx);
+
pipe_resource_reference(&sctx->esgs_ring, NULL);
pipe_resource_reference(&sctx->gsvs_ring, NULL);
pipe_resource_reference(&sctx->tess_rings, NULL);
if (sctx->query_result_shader)
sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader);
+ if (sctx->sh_query_result_shader)
+ sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader);
if (sctx->gfx_cs)
sctx->ws->cs_destroy(sctx->gfx_cs);
/* Initialize graphics-only context functions. */
if (sctx->has_graphics) {
si_init_context_texture_functions(sctx);
+ if (sctx->chip_class >= GFX10)
+ gfx10_init_query(sctx);
si_init_msaa_functions(sctx);
si_init_shader_functions(sctx);
si_init_state_functions(sctx);
struct pipe_device_reset_callback device_reset_callback;
struct u_log_context *log;
void *query_result_shader;
+ void *sh_query_result_shader;
void (*emit_cache_flush)(struct si_context *ctx);
unsigned num_sdma_uploads;
unsigned max_sdma_uploads;
+ /* Shader-based queries. */
+ struct list_head shader_query_buffers;
+ unsigned num_active_shader_queries;
+
/* Statistics gathering for the DCC enablement heuristic. It can't be
* in si_texture because si_texture can be shared by multiple
* contexts. This is for back buffers only. We shouldn't get too many
void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
void *si_create_dcc_retile_cs(struct pipe_context *ctx);
void *si_create_query_result_cs(struct si_context *sctx);
+void *gfx10_create_sh_query_result_cs(struct si_context *sctx);
+
+/* gfx10_query.c */
+void gfx10_init_query(struct si_context *sctx);
+void gfx10_destroy_query(struct si_context *sctx);
/* si_test_dma.c */
void si_test_dma(struct si_screen *sscreen);
#include "util/u_suballoc.h"
#include "amd/common/sid.h"
-#define SI_MAX_STREAMS 4
-
static const struct si_query_ops query_hw_ops;
struct si_hw_query_params {
if (!query)
return;
+ if (ctx->chip_class == GFX10 &&
+ (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
+ assert(!"not implemented");
+ }
+
invert = ctx->render_cond_invert;
flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
query_type != SI_QUERY_TIME_ELAPSED_SDMA))
return si_query_sw_create(query_type);
+ if (sscreen->info.chip_class >= GFX10 &&
+ (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
+ query_type == PIPE_QUERY_PRIMITIVES_GENERATED ||
+ query_type == PIPE_QUERY_SO_STATISTICS ||
+ query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
+ return gfx10_sh_query_create(sscreen, query_type, index);
+
return si_query_hw_create(sscreen, query_type, index);
}
struct si_query_hw;
struct si_resource;
+#define SI_MAX_STREAMS 4
+
enum {
SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
SI_QUERY_DECOMPRESS_CALLS,
void si_query_hw_resume(struct si_context *sctx, struct si_query *query);
+/* Shader-based queries */
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
+ enum pipe_query_type query_type,
+ unsigned index);
+
/* Performance counters */
struct si_perfcounters {
unsigned num_groups;
for (unsigned i = 0; i < 4; ++i) {
ctx->gs_curprim_verts[i] =
lp_build_alloca(&ctx->gallivm, ctx->ac.i32, "");
+ ctx->gs_generated_prims[i] =
+ lp_build_alloca(&ctx->gallivm, ctx->ac.i32, "");
}
LLVMTypeRef a8i32 = LLVMArrayType(ctx->i32, 8);
if (ctx->type == PIPE_SHADER_TESS_CTRL ||
ctx->type == PIPE_SHADER_GEOMETRY) {
+ if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
+ gfx10_ngg_gs_emit_prologue(ctx);
+ nested_barrier = false;
+ } else {
+ nested_barrier = true;
+ }
+
/* Number of patches / primitives */
num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
- nested_barrier = true;
} else {
/* Number of vertices */
num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8);
LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
LLVMValueRef gs_next_vertex[4];
LLVMValueRef gs_curprim_verts[4];
+ LLVMValueRef gs_generated_prims[4];
LLVMValueRef gs_ngg_emit;
LLVMValueRef gs_ngg_scratch;
LLVMValueRef postponed_kill;
void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
unsigned stream,
LLVMValueRef *addrs);
+void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx);
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx);
void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader);
return ctx->create_compute_state(ctx, &state);
}
+
+/* Create the compute shader that is used to collect the results of gfx10+
+ * shader queries.
+ *
+ * One compute grid with a single thread is launched for every query result
+ * buffer. The thread (optionally) reads a previous summary buffer, then
+ * accumulates data from the query result buffer, and writes the result either
+ * to a summary buffer to be consumed by the next grid invocation or to the
+ * user-supplied buffer.
+ *
+ * Data layout:
+ *
+ * BUFFER[0] = query result buffer (layout is defined by gfx10_sh_query_buffer_mem)
+ * BUFFER[1] = previous summary buffer
+ * BUFFER[2] = next summary buffer or user-supplied buffer
+ *
+ * CONST
+ * 0.x = config; the low 3 bits indicate the mode:
+ * 0: sum up counts
+ * 1: determine result availability and write it as a boolean
+ * 2: SO_OVERFLOW
+ * 3: SO_ANY_OVERFLOW
+ * the remaining bits form a bitfield:
+ * 8: write result as a 64-bit value
+ * 0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
+ * 0.z = chain bit field:
+ * 1: have previous summary buffer
+ * 2: write next summary buffer
+ * 0.w = result_count
+ */
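+/*
+ * Example of how the constants are filled by gfx10_sh_query_get_result_resource():
+ * a 64-bit PIPE_QUERY_PRIMITIVES_EMITTED result for stream 0 spanning two
+ * query buffers uses config = 8, offset = 16 (4 * (4 + stream)), and two
+ * dispatches: the first with chain = 2 (write the summary buffer), the
+ * second with chain = 1 (read it back), each with result_count set to the
+ * number of entries covered in that buffer.
+ */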
+void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
+{
+ /* TEMP[0].x = accumulated result so far
+ * TEMP[0].y = result missing
+ * TEMP[0].z = whether we're in overflow mode
+ */
+ static const char text_tmpl[] =
+ "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL BUFFER[0]\n"
+ "DCL BUFFER[1]\n"
+ "DCL BUFFER[2]\n"
+ "DCL CONST[0][0..0]\n"
+ "DCL TEMP[0..5]\n"
+ "IMM[0] UINT32 {0, 7, 0, 4294967295}\n"
+ "IMM[1] UINT32 {1, 2, 4, 8}\n"
+ "IMM[2] UINT32 {16, 32, 64, 128}\n"
+
+ /*
+ acc_result = 0;
+ acc_missing = 0;
+ if (chain & 1) {
+ acc_result = buffer[1][0];
+ acc_missing = buffer[1][1];
+ }
+ */
+ "MOV TEMP[0].xy, IMM[0].xxxx\n"
+ "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"
+ "UIF TEMP[5]\n"
+ "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ /*
+ is_overflow (TEMP[0].z) = (config & 7) >= 2;
+ result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count;
+ base_offset (TEMP[1].y) = 0;
+ for (;;) {
+ if (!result_remaining)
+ break;
+ result_remaining--;
+ */
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"
+
+ "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
+ "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"
+ "MOV TEMP[1].y, IMM[0].xxxx\n"
+
+ "BGNLOOP\n"
+ "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+ "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
+
+ /*
+ fence = buffer[0]@(base_offset + 32);
+ if (!fence) {
+ acc_missing = ~0u;
+ break;
+ }
+ */
+ "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n"
+ "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
+ "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "MOV TEMP[0].y, TEMP[5].xxxx\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ /*
+ stream_offset (TEMP[2].x) = base_offset + offset;
+
+ if (!(config & 7)) {
+ acc_result += buffer[0]@stream_offset;
+ }
+ */
+ "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"
+
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"
+ "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"
+ "ENDIF\n"
+
+ /*
+ if ((config & 7) >= 2) {
+ count (TEMP[2].y) = (config & 1) ? 4 : 1;
+ */
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"
+ "UIF TEMP[5]\n"
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"
+ "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"
+
+ /*
+ do {
+ generated = buffer[0]@stream_offset;
+ emitted = buffer[0]@(stream_offset + 16);
+ if (generated != emitted) {
+ acc_result = 1;
+ result_remaining = 0;
+ break;
+ }
+
+ stream_offset += 4;
+ } while (--count);
+ */
+ "BGNLOOP\n"
+ "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
+ "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n"
+ "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n"
+ "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n"
+ "UIF TEMP[5]\n"
+ "MOV TEMP[0].x, IMM[1].xxxx\n"
+ "MOV TEMP[1].y, IMM[0].xxxx\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"
+ "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+ "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n"
+ "ENDLOOP\n"
+ "ENDIF\n"
+
+ /*
+ base_offset += 64;
+ } // end outer loop
+ */
+ "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n"
+ "ENDLOOP\n"
+
+ /*
+ if (chain & 2) {
+ buffer[2][0] = acc_result;
+ buffer[2][1] = acc_missing;
+ } else {
+ */
+ "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"
+ "UIF TEMP[5]\n"
+ "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"
+ "ELSE\n"
+
+ /*
+ if ((config & 7) == 1) {
+ acc_result = acc_missing ? 0 : 1;
+ acc_missing = 0;
+ }
+ */
+ "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"
+ "UIF TEMP[5]\n"
+ "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"
+ "MOV TEMP[0].y, IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ /*
+ if (!acc_missing) {
+ buffer[2][0] = acc_result;
+ if (config & 8)
+ buffer[2][1] = 0;
+ }
+ */
+ "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+
+ "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"
+ "UIF TEMP[5]\n"
+ "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"
+ "ENDIF\n"
+ "ENDIF\n"
+ "ENDIF\n"
+
+ "END\n";
+
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {};
+
+ if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
+ return NULL;
+ }
+
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ return sctx->b.create_compute_state(&sctx->b, &state);
+}
struct si_atom spi_map;
struct si_atom scratch_state;
struct si_atom window_rectangles;
+ struct si_atom shader_query;
} s;
struct si_atom array[0];
};
SI_PS_IMAGE_COLORBUF0_FMASK,
SI_PS_IMAGE_COLORBUF0_FMASK_HI,
+ GFX10_GS_QUERY_BUF,
+
SI_NUM_RW_BUFFERS,
};