$(GENERATED_SOURCES) \
cik_sdma.c \
driinfo_radeonsi.h \
+ gfx10_query.c \
gfx10_shader_ngg.c \
si_blit.c \
si_buffer.c \
--- /dev/null
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stddef.h>
+
+#include "si_pipe.h"
+#include "si_query.h"
+#include "util/u_memory.h"
+#include "util/u_suballoc.h"
+#include "sid.h"
+
+/**
+ * The query buffer is written to by ESGS NGG shaders with statistics about
+ * generated and (streamout-)emitted primitives.
+ *
+ * The context maintains a ring of these query buffers, and queries simply
+ * point into the ring, allowing an arbitrary number of queries to be active
+ * without additional GPU cost.
+ */
+struct gfx10_sh_query_buffer {
+ struct list_head list;
+ struct si_resource *buf;
+ unsigned refcount;
+
+ /* Offset into the buffer in bytes; points at the first un-emitted entry. */
+ unsigned head;
+};
+
+/* Memory layout of the query buffer. Must be kept in sync with shaders
+ * (including QBO shaders) and should be aligned to cachelines.
+ *
+ * The somewhat awkward layout is required for compatibility with the
+ * SET_PREDICATION packet; this is also why the high bit of all the counter
+ * values is set unconditionally.
+ */
+struct gfx10_sh_query_buffer_mem {
+ struct {
+ uint64_t generated_primitives_start_dummy;
+ uint64_t emitted_primitives_start_dummy;
+ uint64_t generated_primitives;
+ uint64_t emitted_primitives;
+ } stream[4];
+ uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
+ uint32_t pad[31];
+};
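+
+/* For reference (derived from the layout above): each entry is 256 bytes,
+ * i.e. 32 uint64_t slots; stream[i].generated_primitives is at byte offset
+ * 32*i + 16, stream[i].emitted_primitives at 32*i + 24, and the fence dword
+ * at byte offset 128. The NGG shader updates and the initialization loop in
+ * gfx10_alloc_query_buffer() rely on these offsets.
+ */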
+
+/* Shader-based queries. */
+struct gfx10_sh_query {
+ struct si_query b;
+
+ struct gfx10_sh_query_buffer *first;
+ struct gfx10_sh_query_buffer *last;
+ unsigned first_begin;
+ unsigned last_end;
+
+ unsigned stream;
+};
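+
+/* A query covers the ring entries from byte offset first_begin within *first
+ * up to (but not including) byte offset last_end within *last; both result
+ * gathering paths below iterate over exactly this span.
+ */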
+
+static void emit_shader_query(struct si_context *sctx)
+{
+ assert(!LIST_IS_EMPTY(&sctx->shader_query_buffers));
+
+ struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
+}
+
+static void gfx10_release_query_buffers(struct si_context *sctx,
+ struct gfx10_sh_query_buffer *first,
+ struct gfx10_sh_query_buffer *last)
+{
+ while (first) {
+ struct gfx10_sh_query_buffer *qbuf = first;
+ if (first != last)
+ first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+ else
+ first = NULL;
+
+ qbuf->refcount--;
+ if (qbuf->refcount)
+ continue;
+
+ if (qbuf->list.next == &sctx->shader_query_buffers)
+ continue; /* keep the most recent buffer; it may not be full yet */
+ if (qbuf->list.prev == &sctx->shader_query_buffers)
+ continue; /* keep the oldest buffer for recycling */
+
+ LIST_DEL(&qbuf->list);
+ si_resource_reference(&qbuf->buf, NULL);
+ FREE(qbuf);
+ }
+}
+
+static bool gfx10_alloc_query_buffer(struct si_context *sctx)
+{
+ if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
+ return true;
+
+ struct gfx10_sh_query_buffer *qbuf = NULL;
+
+ if (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
+ qbuf = list_last_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
+ goto success;
+
+ qbuf = list_first_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ if (!qbuf->refcount &&
+ !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
+ sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
+ /* Can immediately re-use the oldest buffer */
+ LIST_DEL(&qbuf->list);
+ } else {
+ qbuf = NULL;
+ }
+ }
+
+ if (!qbuf) {
+ qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
+ if (unlikely(!qbuf))
+ return false;
+
+ struct si_screen *screen = sctx->screen;
+ unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem),
+ screen->info.min_alloc_size);
+ qbuf->buf = si_resource(
+ pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
+ if (unlikely(!qbuf->buf)) {
+ FREE(qbuf);
+ return false;
+ }
+ }
+
+ /* The buffer is currently unused by the GPU. Initialize it.
+ *
+ * We need to set the high bit of all the primitive counters for
+ * compatibility with the SET_PREDICATION packet.
+ */
+ uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
+ PIPE_TRANSFER_WRITE |
+ PIPE_TRANSFER_UNSYNCHRONIZED);
+ assert(results);
+
+ for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
+ i < e; ++i) {
+ for (unsigned j = 0; j < 16; ++j)
+ results[32 * i + j] = (uint64_t)1 << 63;
+ results[32 * i + 16] = 0;
+ }
+
+ LIST_ADDTAIL(&qbuf->list, &sctx->shader_query_buffers);
+ qbuf->head = 0;
+ qbuf->refcount = sctx->num_active_shader_queries;
+
+success:;
+ struct pipe_shader_buffer sbuf;
+ sbuf.buffer = &qbuf->buf->b.b;
+ sbuf.buffer_offset = qbuf->head;
+ sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
+ si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
+
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
+ return true;
+}
+
+static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
+{
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+ gfx10_release_query_buffers(sctx, query->first, query->last);
+ FREE(query);
+}
+
+static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
+{
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+ gfx10_release_query_buffers(sctx, query->first, query->last);
+ query->first = query->last = NULL;
+
+ if (unlikely(!gfx10_alloc_query_buffer(sctx)))
+ return false;
+
+ query->first = list_last_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ query->first_begin = query->first->head;
+
+ sctx->num_active_shader_queries++;
+ query->first->refcount++;
+
+ return true;
+}
+
+static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
+{
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+ if (unlikely(!query->first))
+ return false; /* earlier out of memory error */
+
+ query->last = list_last_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ query->last_end = query->last->head;
+
+ /* Signal the fence of the previous chunk */
+ if (query->last_end != 0) {
+ uint64_t fence_va = query->last->buf->gpu_address;
+ fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
+ fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+ si_cp_release_mem(sctx, sctx->gfx_cs,
+ V_028A90_BOTTOM_OF_PIPE_TS, 0,
+ EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+ EOP_DATA_SEL_VALUE_32BIT,
+ query->last->buf, fence_va, 0xffffffff,
+ PIPE_QUERY_GPU_FINISHED);
+ }
+
+ sctx->num_active_shader_queries--;
+
+ if (sctx->num_active_shader_queries > 0) {
+ gfx10_alloc_query_buffer(sctx);
+ } else {
+ si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
+
+ /* If a query_begin is followed by a query_end without a draw
+ * in-between, we need to clear the atom to ensure that the
+ * next query_begin will re-initialize the shader buffer. */
+ si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
+ }
+
+ return true;
+}
+
+static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
+ struct gfx10_sh_query_buffer_mem *qmem,
+ union pipe_query_result *result)
+{
+ static const uint64_t mask = ((uint64_t)1 << 63) - 1;
+
+ switch (query->b.type) {
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
+ break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ result->u64 += qmem->stream[query->stream].generated_primitives & mask;
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ result->so_statistics.num_primitives_written +=
+ qmem->stream[query->stream].emitted_primitives & mask;
+ result->so_statistics.primitives_storage_needed +=
+ qmem->stream[query->stream].generated_primitives & mask;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ result->b |= qmem->stream[query->stream].emitted_primitives !=
+ qmem->stream[query->stream].generated_primitives;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+ result->b |= qmem->stream[stream].emitted_primitives !=
+ qmem->stream[stream].generated_primitives;
+ }
+ break;
+ default:
+ assert(0);
+ }
+}
+
+static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
+ bool wait, union pipe_query_result *result)
+{
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+ util_query_clear_result(result, query->b.type);
+
+ if (unlikely(!query->first))
+ return false; /* earlier out of memory error */
+ assert(query->last);
+
+ for (struct gfx10_sh_query_buffer *qbuf = query->last;;
+ qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
+ unsigned usage = PIPE_TRANSFER_READ |
+ (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+ void *map;
+
+ if (rquery->b.flushed)
+ map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+ else
+ map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+ if (!map)
+ return false;
+
+ unsigned results_begin = 0;
+ unsigned results_end = qbuf->head;
+ if (qbuf == query->first)
+ results_begin = query->first_begin;
+ if (qbuf == query->last)
+ results_end = query->last_end;
+
+ while (results_begin != results_end) {
+ struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
+ results_begin += sizeof(*qmem);
+
+ gfx10_sh_query_add_result(query, qmem, result);
+ }
+
+ if (qbuf == query->first)
+ break;
+ }
+
+ return true;
+}
+
+static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
+ struct si_query *rquery,
+ bool wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset)
+{
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+ struct si_qbo_state saved_state = {};
+ struct pipe_resource *tmp_buffer = NULL;
+ unsigned tmp_buffer_offset = 0;
+
+ if (!sctx->sh_query_result_shader) {
+ sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
+ if (!sctx->sh_query_result_shader)
+ return;
+ }
+
+ if (query->first != query->last) {
+ u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
+ &tmp_buffer_offset, &tmp_buffer);
+ if (!tmp_buffer)
+ return;
+ }
+
+ si_save_qbo_state(sctx, &saved_state);
+
+ /* Pre-fill the constants configuring the shader behavior. */
+ struct {
+ uint32_t config;
+ uint32_t offset;
+ uint32_t chain;
+ uint32_t result_count;
+ } consts;
+ struct pipe_constant_buffer constant_buffer = {};
+
+ if (index >= 0) {
+ switch (query->b.type) {
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ consts.offset = sizeof(uint32_t) * query->stream;
+ consts.config = 0;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ consts.offset = sizeof(uint32_t) * (4 + query->stream);
+ consts.config = 0;
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
+ consts.config = 0;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ consts.offset = sizeof(uint32_t) * query->stream;
+ consts.config = 2;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ consts.offset = 0;
+ consts.config = 3;
+ break;
+ default: unreachable("bad query type");
+ }
+ } else {
+ /* Check result availability. */
+ consts.offset = 0;
+ consts.config = 1;
+ }
+
+ if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
+ consts.config |= 8;
+
+ constant_buffer.buffer_size = sizeof(consts);
+ constant_buffer.user_buffer = &consts;
+
+ /* Pre-fill the SSBOs and grid. */
+ struct pipe_shader_buffer ssbo[3];
+ struct pipe_grid_info grid = {};
+
+ ssbo[1].buffer = tmp_buffer;
+ ssbo[1].buffer_offset = tmp_buffer_offset;
+ ssbo[1].buffer_size = 16;
+
+ ssbo[2] = ssbo[1];
+
+ sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);
+
+ grid.block[0] = 1;
+ grid.block[1] = 1;
+ grid.block[2] = 1;
+ grid.grid[0] = 1;
+ grid.grid[1] = 1;
+ grid.grid[2] = 1;
+
+ struct gfx10_sh_query_buffer *qbuf = query->first;
+ for (;;) {
+ unsigned begin = qbuf == query->first ? query->first_begin : 0;
+ unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
+ /* end == 0 can only happen for the last buffer (freshly allocated with
+ * no draws recorded yet); dispatch anyway with result_count == 0 so
+ * that the accumulated result is still written to the destination. */
+
+ ssbo[0].buffer = &qbuf->buf->b.b;
+ ssbo[0].buffer_offset = begin;
+ ssbo[0].buffer_size = end - begin;
+
+ consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
+ consts.chain = 0;
+ if (qbuf != query->first)
+ consts.chain |= 1;
+ if (qbuf != query->last)
+ consts.chain |= 2;
+
+ if (qbuf == query->last) {
+ ssbo[2].buffer = resource;
+ ssbo[2].buffer_offset = offset;
+ ssbo[2].buffer_size = 8;
+ }
+
+ sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+ sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);
+
+ if (wait) {
+ uint64_t va;
+
+ /* Wait for result availability. Wait only for readiness
+ * of the last entry, since the fence writes should be
+ * serialized in the CP.
+ */
+ va = qbuf->buf->gpu_address;
+ va += end - sizeof(struct gfx10_sh_query_buffer_mem);
+ va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+
+ si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
+ }
+
+ sctx->b.launch_grid(&sctx->b, &grid);
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+ if (qbuf == query->last)
+ break;
+ qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+ }
+
+ si_restore_qbo_state(sctx, &saved_state);
+ pipe_resource_reference(&tmp_buffer, NULL);
+}
+
+static const struct si_query_ops gfx10_sh_query_ops = {
+ .destroy = gfx10_sh_query_destroy,
+ .begin = gfx10_sh_query_begin,
+ .end = gfx10_sh_query_end,
+ .get_result = gfx10_sh_query_get_result,
+ .get_result_resource = gfx10_sh_query_get_result_resource,
+};
+
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
+ enum pipe_query_type query_type,
+ unsigned index)
+{
+ struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
+ if (unlikely(!query))
+ return NULL;
+
+ query->b.ops = &gfx10_sh_query_ops;
+ query->b.type = query_type;
+ query->stream = index;
+
+ return (struct pipe_query *)query;
+}
+
+void gfx10_init_query(struct si_context *sctx)
+{
+ LIST_INITHEAD(&sctx->shader_query_buffers);
+ sctx->atoms.s.shader_query.emit = emit_shader_query;
+}
+
+void gfx10_destroy_query(struct si_context *sctx)
+{
+ while (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
+ struct gfx10_sh_query_buffer *qbuf =
+ list_first_entry(&sctx->shader_query_buffers,
+ struct gfx10_sh_query_buffer, list);
+ LIST_DEL(&qbuf->list);
+
+ assert(!qbuf->refcount);
+ si_resource_reference(&qbuf->buf, NULL);
+ FREE(qbuf);
+ }
+}
false);
}
+static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
+{
+ LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
+ ctx->param_rw_buffers);
+
+ return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
+ LLVMConstInt(ctx->i32, GFX10_GS_QUERY_BUF, false));
+}
+
/* Send GS Alloc Req message from the first wave of the group to SPI.
* Message payload is:
* - bits 0..10: vertices in group
build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
+ /* Update the query buffer: a single lane of the first wave atomically adds
+ * the threadgroup's primitive count to stream[0].generated_primitives (byte
+ * offset 16) of the currently bound query buffer entry. */
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5030);
+ tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5031);
+ {
+ LLVMValueRef args[] = {
+ ngg_get_prim_cnt(ctx),
+ ngg_get_query_buf(ctx),
+ LLVMConstInt(ctx->i32, 16, false), /* offset of stream[0].generated_primitives */
+ ctx->i32_0, /* soffset */
+ ctx->i32_0, /* cachepolicy */
+ };
+
+ /* TODO: should this use a 64-bit atomic? */
+ ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
+ ctx->i32, args, 5, 0);
+ }
+ ac_build_endif(&ctx->ac, 5031);
+ ac_build_endif(&ctx->ac, 5030);
+
/* Export primitive data to the index buffer. Format is:
* - bits 0..8: index 0
* - bit 9: edge flag 0
tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
LLVMBuildStore(builder, tmp, primflagptr);
+ tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+ tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
+ LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
+
lp_build_endif(&if_state);
}
+void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
+{
+ /* Zero out the part of LDS scratch that is used to accumulate the
+ * per-stream generated primitive count.
+ */
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
+ LLVMValueRef tid = get_thread_id_in_tg(ctx);
+ LLVMValueRef tmp;
+
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->i32, 4, false), "");
+ ac_build_ifcc(&ctx->ac, tmp, 5090);
+ {
+ LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
+ LLVMBuildStore(builder, ctx->i32_0, ptr);
+ }
+ ac_build_endif(&ctx->ac, 5090);
+
+ ac_build_s_barrier(&ctx->ac);
+}
+
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
{
const struct si_shader_selector *sel = ctx->shader->selector;
ac_build_endloop(&ctx->ac, 5100);
}
+ /* Accumulate generated primitives counts across the entire threadgroup. */
+ for (unsigned stream = 0; stream < 4; ++stream) {
+ if (!info->num_stream_output_components[stream])
+ continue;
+
+ LLVMValueRef numprims =
+ LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+ numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, 64);
+
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->i32_0, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5105);
+ {
+ LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
+ ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
+ LLVMConstInt(ctx->i32, stream, false)),
+ numprims, LLVMAtomicOrderingMonotonic, false);
+ }
+ ac_build_endif(&ctx->ac, 5105);
+ }
+
lp_build_endif(&ctx->merged_wrap_if_state);
ac_build_s_barrier(&ctx->ac);
/* TODO: streamout */
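+ /* Threads 0..3 (one per vertex stream) add the generated-primitive totals
+ * accumulated in LDS scratch to the query buffer; stream i is updated at
+ * byte offset 32*i + 16 (offset 32*i plus soffset 16). */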
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->i32, 4, false), "");
+ ac_build_ifcc(&ctx->ac, tmp, 5110);
+ {
+ LLVMValueRef offset;
+ tmp = tid;
+ if (sel->so.num_outputs)
+ tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->i32, 3, false), "");
+ offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 32, false), "");
+ if (sel->so.num_outputs) {
+ tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->i32, 2, false), "");
+ tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 8, false), "");
+ offset = LLVMBuildAdd(builder, offset, tmp, "");
+ }
+
+ tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
+ LLVMValueRef args[] = {
+ tmp,
+ ngg_get_query_buf(ctx),
+ offset,
+ LLVMConstInt(ctx->i32, 16, false), /* soffset */
+ ctx->i32_0, /* cachepolicy */
+ };
+ ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
+ ctx->i32, args, 5, 0);
+ }
+ ac_build_endif(&ctx->ac, 5110);
+
/* TODO: culling */
/* Determine vertex liveness. */
files_libradeonsi = files(
'cik_sdma.c',
'driinfo_radeonsi.h',
+ 'gfx10_query.c',
'gfx10_shader_ngg.c',
'si_blit.c',
'si_buffer.c',
si_release_all_descriptors(sctx);
+ if (sctx->chip_class >= GFX10)
+ gfx10_destroy_query(sctx);
+
pipe_resource_reference(&sctx->esgs_ring, NULL);
pipe_resource_reference(&sctx->gsvs_ring, NULL);
pipe_resource_reference(&sctx->tess_rings, NULL);
if (sctx->query_result_shader)
sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader);
+ if (sctx->sh_query_result_shader)
+ sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader);
if (sctx->gfx_cs)
sctx->ws->cs_destroy(sctx->gfx_cs);
/* Initialize graphics-only context functions. */
if (sctx->has_graphics) {
si_init_context_texture_functions(sctx);
+ if (sctx->chip_class >= GFX10)
+ gfx10_init_query(sctx);
si_init_msaa_functions(sctx);
si_init_shader_functions(sctx);
si_init_state_functions(sctx);
struct pipe_device_reset_callback device_reset_callback;
struct u_log_context *log;
void *query_result_shader;
+ void *sh_query_result_shader;
void (*emit_cache_flush)(struct si_context *ctx);
unsigned num_sdma_uploads;
unsigned max_sdma_uploads;
+ /* Shader-based queries. */
+ struct list_head shader_query_buffers;
+ unsigned num_active_shader_queries;
+
/* Statistics gathering for the DCC enablement heuristic. It can't be
* in si_texture because si_texture can be shared by multiple
* contexts. This is for back buffers only. We shouldn't get too many
void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
void *si_create_dcc_retile_cs(struct pipe_context *ctx);
void *si_create_query_result_cs(struct si_context *sctx);
+void *gfx10_create_sh_query_result_cs(struct si_context *sctx);
+
+/* gfx10_query.c */
+void gfx10_init_query(struct si_context *sctx);
+void gfx10_destroy_query(struct si_context *sctx);
/* si_test_dma.c */
void si_test_dma(struct si_screen *sscreen);
#include "util/u_suballoc.h"
#include "amd/common/sid.h"
-#define SI_MAX_STREAMS 4
-
static const struct si_query_ops query_hw_ops;
struct si_hw_query_params {
if (!query)
return;
+ if (ctx->chip_class == GFX10 &&
+ (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
+ assert(!"not implemented");
+ }
+
invert = ctx->render_cond_invert;
flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
query_type != SI_QUERY_TIME_ELAPSED_SDMA))
return si_query_sw_create(query_type);
+ if (sscreen->info.chip_class >= GFX10 &&
+ (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
+ query_type == PIPE_QUERY_PRIMITIVES_GENERATED ||
+ query_type == PIPE_QUERY_SO_STATISTICS ||
+ query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
+ return gfx10_sh_query_create(sscreen, query_type, index);
+
return si_query_hw_create(sscreen, query_type, index);
}
struct si_query_hw;
struct si_resource;
+#define SI_MAX_STREAMS 4
+
enum {
SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
SI_QUERY_DECOMPRESS_CALLS,
void si_query_hw_resume(struct si_context *sctx, struct si_query *query);
+/* Shader-based queries */
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
+ enum pipe_query_type query_type,
+ unsigned index);
+
/* Performance counters */
struct si_perfcounters {
unsigned num_groups;
for (unsigned i = 0; i < 4; ++i) {
ctx->gs_curprim_verts[i] =
lp_build_alloca(&ctx->gallivm, ctx->ac.i32, "");
+ ctx->gs_generated_prims[i] =
+ lp_build_alloca(&ctx->gallivm, ctx->ac.i32, "");
}
LLVMTypeRef a8i32 = LLVMArrayType(ctx->i32, 8);
if (ctx->type == PIPE_SHADER_TESS_CTRL ||
ctx->type == PIPE_SHADER_GEOMETRY) {
+ if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
+ gfx10_ngg_gs_emit_prologue(ctx);
+ nested_barrier = false;
+ } else {
+ nested_barrier = true;
+ }
+
/* Number of patches / primitives */
num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
- nested_barrier = true;
} else {
/* Number of vertices */
num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8);
LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
LLVMValueRef gs_next_vertex[4];
LLVMValueRef gs_curprim_verts[4];
+ LLVMValueRef gs_generated_prims[4];
LLVMValueRef gs_ngg_emit;
LLVMValueRef gs_ngg_scratch;
LLVMValueRef postponed_kill;
void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
unsigned stream,
LLVMValueRef *addrs);
+void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx);
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx);
void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader);
return ctx->create_compute_state(ctx, &state);
}
+
+/* Create the compute shader that is used to collect the results of gfx10+
+ * shader queries.
+ *
+ * One compute grid with a single thread is launched for every query result
+ * buffer. The thread (optionally) reads a previous summary buffer, then
+ * accumulates data from the query result buffer, and writes the result either
+ * to a summary buffer to be consumed by the next grid invocation or to the
+ * user-supplied buffer.
+ *
+ * Data layout:
+ *
+ * BUFFER[0] = query result buffer (layout is defined by gfx10_sh_query_buffer_mem)
+ * BUFFER[1] = previous summary buffer
+ * BUFFER[2] = next summary buffer or user-supplied buffer
+ *
+ * CONST
+ * 0.x = config; the low 3 bits indicate the mode:
+ * 0: sum up counts
+ * 1: determine result availability and write it as a boolean
+ * 2: SO_OVERFLOW
+ * 3: SO_ANY_OVERFLOW
+ * the remaining bits form a bitfield:
+ * 8: write result as a 64-bit value
+ * 0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
+ * 0.z = chain bit field:
+ * 1: have previous summary buffer
+ * 2: write next summary buffer
+ * 0.w = result_count
+ */
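+/*
+ * Example of how the constants are filled by gfx10_sh_query_get_result_resource():
+ * a 64-bit PIPE_QUERY_PRIMITIVES_EMITTED result for stream 0 spanning two
+ * query buffers uses config = 8, offset = 16 (4 * (4 + stream)), and two
+ * dispatches: the first with chain = 2 (write the summary buffer), the
+ * second with chain = 1 (read it back), each with result_count set to the
+ * number of entries covered in that buffer.
+ */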
+void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
+{
+ /* TEMP[0].x = accumulated result so far
+ * TEMP[0].y = result missing
+ * TEMP[0].z = whether we're in overflow mode
+ */
+ static const char text_tmpl[] =
+ "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL BUFFER[0]\n"
+ "DCL BUFFER[1]\n"
+ "DCL BUFFER[2]\n"
+ "DCL CONST[0][0..0]\n"
+ "DCL TEMP[0..5]\n"
+ "IMM[0] UINT32 {0, 7, 0, 4294967295}\n"
+ "IMM[1] UINT32 {1, 2, 4, 8}\n"
+ "IMM[2] UINT32 {16, 32, 64, 128}\n"
+
+ /*
+ acc_result = 0;
+ acc_missing = 0;
+ if (chain & 1) {
+ acc_result = buffer[1][0];
+ acc_missing = buffer[1][1];
+ }
+ */
+ "MOV TEMP[0].xy, IMM[0].xxxx\n"
+ "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"
+ "UIF TEMP[5]\n"
+ "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ /*
+ is_overflow (TEMP[0].z) = (config & 7) >= 2;
+ result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count;
+ base_offset (TEMP[1].y) = 0;
+ for (;;) {
+ if (!result_remaining)
+ break;
+ result_remaining--;
+ */
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"
+
+ "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
+ "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"
+ "MOV TEMP[1].y, IMM[0].xxxx\n"
+
+ "BGNLOOP\n"
+ "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+ "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
+
+ /*
+ fence = buffer[0]@(base_offset + 32);
+ if (!fence) {
+ acc_missing = ~0u;
+ break;
+ }
+ */
+ "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n"
+ "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
+ "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "MOV TEMP[0].y, TEMP[5].xxxx\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ /*
+ stream_offset (TEMP[2].x) = base_offset + offset;
+
+ if (!(config & 7)) {
+ acc_result += buffer[0]@stream_offset;
+ }
+ */
+ "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"
+
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"
+ "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"
+ "ENDIF\n"
+
+ /*
+ if ((config & 7) >= 2) {
+ count (TEMP[2].y) = (config & 1) ? 4 : 1;
+ */
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"
+ "UIF TEMP[5]\n"
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"
+ "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"
+
+ /*
+ do {
+ generated = buffer[0]@stream_offset;
+ emitted = buffer[0]@(stream_offset + 16);
+ if (generated != emitted) {
+ acc_result = 1;
+ result_remaining = 0;
+ break;
+ }
+
+ stream_offset += 4;
+ } while (--count);
+ */
+ "BGNLOOP\n"
+ "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
+ "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n"
+ "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n"
+ "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n"
+ "UIF TEMP[5]\n"
+ "MOV TEMP[0].x, IMM[1].xxxx\n"
+ "MOV TEMP[1].y, IMM[0].xxxx\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"
+ "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+ "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n"
+ "ENDLOOP\n"
+ "ENDIF\n"
+
+ /*
+ base_offset += 64;
+ } // end outer loop
+ */
+ "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n"
+ "ENDLOOP\n"
+
+ /*
+ if (chain & 2) {
+ buffer[2][0] = acc_result;
+ buffer[2][1] = acc_missing;
+ } else {
+ */
+ "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"
+ "UIF TEMP[5]\n"
+ "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"
+ "ELSE\n"
+
+ /*
+ if ((config & 7) == 1) {
+ acc_result = acc_missing ? 0 : 1;
+ acc_missing = 0;
+ }
+ */
+ "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"
+ "UIF TEMP[5]\n"
+ "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"
+ "MOV TEMP[0].y, IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ /*
+ if (!acc_missing) {
+ buffer[2][0] = acc_result;
+ if (config & 8)
+ buffer[2][1] = 0;
+ }
+ */
+ "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+
+ "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"
+ "UIF TEMP[5]\n"
+ "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"
+ "ENDIF\n"
+ "ENDIF\n"
+ "ENDIF\n"
+
+ "END\n";
+
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {};
+
+ if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
+ return NULL;
+ }
+
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ return sctx->b.create_compute_state(&sctx->b, &state);
+}
struct si_atom spi_map;
struct si_atom scratch_state;
struct si_atom window_rectangles;
+ struct si_atom shader_query;
} s;
struct si_atom array[0];
};
SI_PS_IMAGE_COLORBUF0_FMASK,
SI_PS_IMAGE_COLORBUF0_FMASK_HI,
+ GFX10_GS_QUERY_BUF,
+
SI_NUM_RW_BUFFERS,
};