From: Nicolai Hähnle Date: Wed, 19 Sep 2018 12:53:35 +0000 (+0200) Subject: radeonsi/gfx10: implement streamout-related queries X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=792a638b032d16fbe6404f9d90c34b3e0f1fb0b5;p=mesa.git radeonsi/gfx10: implement streamout-related queries The NGG hardware pipeline doesn't track these statistics automatically, and in fact *cannot* track them automatically when API geometry shaders are involved, so we accumulate statistics in the shader using atomic adds. This implementation accumulates statistics via the memory system and the RW buffer descriptor setup. We could use GDS, but since these atomics aren't latency-sensitive, that basically just trades off L2$ bandwidth vs. export bus bandwidth. One single memory transaction per shader workgroup doesn't seem too bad. The result ring buffer in memory is needed either way to avoid pipeline stalls. The shader code contains the atomic unconditionally, though the GFX10_GS_QUERY_BUF is a null buffer when no queries are active. The atomic is simply discarded by the shader hardware in that case. Acked-by: Bas Nieuwenhuizen --- diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index 83cca397716..f25309736c9 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ b/src/gallium/drivers/radeonsi/Makefile.sources @@ -6,6 +6,7 @@ C_SOURCES := \ $(GENERATED_SOURCES) \ cik_sdma.c \ driinfo_radeonsi.h \ + gfx10_query.c \ gfx10_shader_ngg.c \ si_blit.c \ si_buffer.c \ diff --git a/src/gallium/drivers/radeonsi/gfx10_query.c b/src/gallium/drivers/radeonsi/gfx10_query.c new file mode 100644 index 00000000000..8584b2af505 --- /dev/null +++ b/src/gallium/drivers/radeonsi/gfx10_query.c @@ -0,0 +1,521 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +#include "si_pipe.h" +#include "si_query.h" +#include "util/u_memory.h" +#include "util/u_suballoc.h" +#include "sid.h" + +/** + * The query buffer is written to by ESGS NGG shaders with statistics about + * generated and (streamout-)emitted primitives. + * + * The context maintains a ring of these query buffers, and queries simply + * point into the ring, allowing an arbitrary number of queries to be active + * without additional GPU cost. 
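+ *
+ * Each query records a [begin, end) window into that ring. As a sketch
+ * (using the fields defined below; next_buffer() stands in for following
+ * the list links), reading a query back amounts to:
+ *
+ *    for (qbuf = query->first;; qbuf = next_buffer(qbuf)) {
+ *       begin = (qbuf == query->first) ? query->first_begin : 0;
+ *       end = (qbuf == query->last) ? query->last_end : qbuf->buf->b.b.width0;
+ *       ...accumulate the entries in [begin, end)...
+ *       if (qbuf == query->last)
+ *          break;
+ *    }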
+ */ +struct gfx10_sh_query_buffer { + struct list_head list; + struct si_resource *buf; + unsigned refcount; + + /* Offset into the buffer in bytes; points at the first un-emitted entry. */ + unsigned head; +}; + +/* Memory layout of the query buffer. Must be kept in sync with shaders + * (including QBO shaders) and should be aligned to cachelines. + * + * The somewhat awkward memory layout is for compatibility with the + * SET_PREDICATION packet, which also means that we're setting the high bit + * of all those values unconditionally. + */ +struct gfx10_sh_query_buffer_mem { + struct { + uint64_t generated_primitives_start_dummy; + uint64_t emitted_primitives_start_dummy; + uint64_t generated_primitives; + uint64_t emitted_primitives; + } stream[4]; + uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */ + uint32_t pad[31]; +}; + +/* Shader-based queries. */ +struct gfx10_sh_query { + struct si_query b; + + struct gfx10_sh_query_buffer *first; + struct gfx10_sh_query_buffer *last; + unsigned first_begin; + unsigned last_end; + + unsigned stream; +}; + +static void emit_shader_query(struct si_context *sctx) +{ + assert(!LIST_IS_EMPTY(&sctx->shader_query_buffers)); + + struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem); +} + +static void gfx10_release_query_buffers(struct si_context *sctx, + struct gfx10_sh_query_buffer *first, + struct gfx10_sh_query_buffer *last) +{ + while (first) { + struct gfx10_sh_query_buffer *qbuf = first; + if (first != last) + first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); + else + first = NULL; + + qbuf->refcount--; + if (qbuf->refcount) + continue; + + if (qbuf->list.next == &sctx->shader_query_buffers) + continue; /* keep the most recent buffer; it may not be full yet */ + if (qbuf->list.prev == &sctx->shader_query_buffers) + continue; /* keep the oldest buffer for recycling */ + + LIST_DEL(&qbuf->list); + si_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } +} + +static bool gfx10_alloc_query_buffer(struct si_context *sctx) +{ + if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) + return true; + + struct gfx10_sh_query_buffer *qbuf = NULL; + + if (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) { + qbuf = list_last_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0) + goto success; + + qbuf = list_first_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + if (!qbuf->refcount && + !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) && + sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) { + /* Can immediately re-use the oldest buffer */ + LIST_DEL(&qbuf->list); + } else { + qbuf = NULL; + } + } + + if (!qbuf) { + qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer); + if (unlikely(!qbuf)) + return false; + + struct si_screen *screen = sctx->screen; + unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem), + screen->info.min_alloc_size); + qbuf->buf = si_resource( + pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); + if (unlikely(!qbuf->buf)) { + FREE(qbuf); + return false; + } + } + + /* The buffer is currently unused by the GPU. Initialize it. + * + * We need to set the high bit of all the primitive counters for + * compatibility with the SET_PREDICATION packet. 
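	 *
	 * For example, a counter value of 5 is stored as ((uint64_t)1 << 63) | 5;
	 * readers strip the tag bit again, as with the mask constant in
	 * gfx10_sh_query_add_result:
	 *
	 *    uint64_t count = qmem->stream[s].generated_primitives &
	 *                     (((uint64_t)1 << 63) - 1);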
+ */ + uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL, + PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_UNSYNCHRONIZED); + assert(results); + + for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); + i < e; ++i) { + for (unsigned j = 0; j < 16; ++j) + results[32 * i + j] = (uint64_t)1 << 63; + results[32 * i + 16] = 0; + } + + LIST_ADDTAIL(&qbuf->list, &sctx->shader_query_buffers); + qbuf->head = 0; + qbuf->refcount = sctx->num_active_shader_queries; + +success:; + struct pipe_shader_buffer sbuf; + sbuf.buffer = &qbuf->buf->b.b; + sbuf.buffer_offset = qbuf->head; + sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem); + si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf); + + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query); + return true; +} + +static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery) +{ + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + gfx10_release_query_buffers(sctx, query->first, query->last); + FREE(query); +} + +static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery) +{ + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + + gfx10_release_query_buffers(sctx, query->first, query->last); + query->first = query->last = NULL; + + if (unlikely(!gfx10_alloc_query_buffer(sctx))) + return false; + + query->first = list_last_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + query->first_begin = query->first->head; + + sctx->num_active_shader_queries++; + query->first->refcount++; + + return true; +} + +static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery) +{ + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + + if (unlikely(!query->first)) + return false; /* earlier out of memory error */ + + query->last = list_last_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + query->last_end = query->last->head; + + /* Signal the fence of the previous chunk */ + if (query->last_end != 0) { + uint64_t fence_va = query->last->buf->gpu_address; + fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem); + fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence); + si_cp_release_mem(sctx, sctx->gfx_cs, + V_028A90_BOTTOM_OF_PIPE_TS, 0, + EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, + query->last->buf, fence_va, 0xffffffff, + PIPE_QUERY_GPU_FINISHED); + } + + sctx->num_active_shader_queries--; + + if (sctx->num_active_shader_queries > 0) { + gfx10_alloc_query_buffer(sctx); + } else { + si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL); + + /* If a query_begin is followed by a query_end without a draw + * in-between, we need to clear the atom to ensure that the + * next query_begin will re-initialize the shader buffer. 
*/
+		si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
+	}
+
+	return true;
+}
+
+static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
+				      struct gfx10_sh_query_buffer_mem *qmem,
+				      union pipe_query_result *result)
+{
+	static const uint64_t mask = ((uint64_t)1 << 63) - 1;
+
+	switch (query->b.type) {
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+		result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
+		break;
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+		result->u64 += qmem->stream[query->stream].generated_primitives & mask;
+		break;
+	case PIPE_QUERY_SO_STATISTICS:
+		result->so_statistics.num_primitives_written +=
+			qmem->stream[query->stream].emitted_primitives & mask;
+		result->so_statistics.primitives_storage_needed +=
+			qmem->stream[query->stream].generated_primitives & mask;
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		result->b |= qmem->stream[query->stream].emitted_primitives !=
+			     qmem->stream[query->stream].generated_primitives;
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+		for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+			result->b |= qmem->stream[stream].emitted_primitives !=
+				     qmem->stream[stream].generated_primitives;
+		}
+		break;
+	default:
+		assert(0);
+	}
+}
+
+static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
+				      bool wait, union pipe_query_result *result)
+{
+	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+	util_query_clear_result(result, query->b.type);
+
+	if (unlikely(!query->first))
+		return false; /* earlier out of memory error */
+	assert(query->last);
+
+	for (struct gfx10_sh_query_buffer *qbuf = query->last;;
+	     qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
+		unsigned usage = PIPE_TRANSFER_READ |
+				 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+		void *map;
+
+		if (rquery->b.flushed)
+			map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+		else
+			map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+		if (!map)
+			return false;
+
+		unsigned results_begin = 0;
+		unsigned results_end = qbuf->head;
+		if (qbuf == query->first)
+			results_begin = query->first_begin;
+		if (qbuf == query->last)
+			results_end = query->last_end;
+
+		while (results_begin != results_end) {
+			struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
+			results_begin += sizeof(*qmem);
+
+			gfx10_sh_query_add_result(query, qmem, result);
+		}
+
+		if (qbuf == query->first)
+			break;
+	}
+
+	return true;
+}
+
+static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
+					       struct si_query *rquery,
+					       bool wait,
+					       enum pipe_query_value_type result_type,
+					       int index,
+					       struct pipe_resource *resource,
+					       unsigned offset)
+{
+	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+	struct si_qbo_state saved_state = {};
+	struct pipe_resource *tmp_buffer = NULL;
+	unsigned tmp_buffer_offset = 0;
+
+	if (!sctx->sh_query_result_shader) {
+		sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
+		if (!sctx->sh_query_result_shader)
+			return;
+	}
+
+	if (query->first != query->last) {
+		u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
+				     &tmp_buffer_offset, &tmp_buffer);
+		if (!tmp_buffer)
+			return;
+	}
+
+	si_save_qbo_state(sctx, &saved_state);
+
+	/* Pre-fill the constants configuring the shader behavior.
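	 *
	 * For example, a 64-bit PIPE_QUERY_PRIMITIVES_EMITTED result for
	 * stream 0 ends up with:
	 *
	 *    consts.config = 0 | 8;                // sum counts, 64-bit result
	 *    consts.offset = sizeof(uint32_t) * 4; // stream[0] emitted counter
	 *
	 * per the switch below and the CONST[0] layout documented in
	 * gfx10_create_sh_query_result_cs().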
*/
+	struct {
+		uint32_t config;
+		uint32_t offset;
+		uint32_t chain;
+		uint32_t result_count;
+	} consts;
+	struct pipe_constant_buffer constant_buffer = {};
+
+	if (index >= 0) {
+		switch (query->b.type) {
+		case PIPE_QUERY_PRIMITIVES_GENERATED:
+			consts.offset = sizeof(uint32_t) * query->stream;
+			consts.config = 0;
+			break;
+		case PIPE_QUERY_PRIMITIVES_EMITTED:
+			consts.offset = sizeof(uint32_t) * (4 + query->stream);
+			consts.config = 0;
+			break;
+		case PIPE_QUERY_SO_STATISTICS:
+			consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
+			consts.config = 0;
+			break;
+		case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+			consts.offset = sizeof(uint32_t) * query->stream;
+			consts.config = 2;
+			break;
+		case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+			consts.offset = 0;
+			consts.config = 3;
+			break;
+		default: unreachable("bad query type");
+		}
+	} else {
+		/* Check result availability. */
+		consts.offset = 0;
+		consts.config = 1;
+	}
+
+	if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
+		consts.config |= 8;
+
+	constant_buffer.buffer_size = sizeof(consts);
+	constant_buffer.user_buffer = &consts;
+
+	/* Pre-fill the SSBOs and grid. */
+	struct pipe_shader_buffer ssbo[3];
+	struct pipe_grid_info grid = {};
+
+	ssbo[1].buffer = tmp_buffer;
+	ssbo[1].buffer_offset = tmp_buffer_offset;
+	ssbo[1].buffer_size = 16;
+
+	ssbo[2] = ssbo[1];
+
+	sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);
+
+	grid.block[0] = 1;
+	grid.block[1] = 1;
+	grid.block[2] = 1;
+	grid.grid[0] = 1;
+	grid.grid[1] = 1;
+	grid.grid[2] = 1;
+
+	struct gfx10_sh_query_buffer *qbuf = query->first;
+	for (;;) {
+		unsigned begin = qbuf == query->first ? query->first_begin : 0;
+		unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
+
+		/* end == 0 is only possible for an empty query->last chunk. The
+		 * dispatch below must still run so that the result and its
+		 * availability get written, but there is no entry whose fence
+		 * could be waited on.
+		 */
+
+		ssbo[0].buffer = &qbuf->buf->b.b;
+		ssbo[0].buffer_offset = begin;
+		ssbo[0].buffer_size = end - begin;
+
+		consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
+		consts.chain = 0;
+		if (qbuf != query->first)
+			consts.chain |= 1;
+		if (qbuf != query->last)
+			consts.chain |= 2;
+
+		if (qbuf == query->last) {
+			ssbo[2].buffer = resource;
+			ssbo[2].buffer_offset = offset;
+			ssbo[2].buffer_size = 8;
+		}
+
+		sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+		sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);
+
+		if (wait && end) {
+			uint64_t va;
+
+			/* Wait for result availability. Wait only for readiness
+			 * of the last entry, since the fence writes should be
+			 * serialized in the CP.
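			 *
			 * Concretely, with 256-byte gfx10_sh_query_buffer_mem
			 * entries and the fence dword at byte offset 128, a
			 * chunk ending at byte `end` waits on:
			 *
			 *    va = buf->gpu_address + end - 256 + 128;
			 *
			 * which is what the offsetof() arithmetic below computes.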
+ */ + va = qbuf->buf->gpu_address; + va += end - sizeof(struct gfx10_sh_query_buffer_mem); + va += offsetof(struct gfx10_sh_query_buffer_mem, fence); + + si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0); + } + + sctx->b.launch_grid(&sctx->b, &grid); + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + + if (qbuf == query->last) + break; + qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); + } + + si_restore_qbo_state(sctx, &saved_state); + pipe_resource_reference(&tmp_buffer, NULL); +} + +static const struct si_query_ops gfx10_sh_query_ops = { + .destroy = gfx10_sh_query_destroy, + .begin = gfx10_sh_query_begin, + .end = gfx10_sh_query_end, + .get_result = gfx10_sh_query_get_result, + .get_result_resource = gfx10_sh_query_get_result_resource, +}; + +struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, + enum pipe_query_type query_type, + unsigned index) +{ + struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query); + if (unlikely(!query)) + return NULL; + + query->b.ops = &gfx10_sh_query_ops; + query->b.type = query_type; + query->stream = index; + + return (struct pipe_query *)query; +} + +void gfx10_init_query(struct si_context *sctx) +{ + LIST_INITHEAD(&sctx->shader_query_buffers); + sctx->atoms.s.shader_query.emit = emit_shader_query; +} + +void gfx10_destroy_query(struct si_context *sctx) +{ + while (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) { + struct gfx10_sh_query_buffer *qbuf = + list_first_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + LIST_DEL(&qbuf->list); + + assert(!qbuf->refcount); + si_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } +} diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 87ca56b1fdf..c97d9009164 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -64,6 +64,15 @@ static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx) false); } +static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx) +{ + LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, + ctx->param_rw_buffers); + + return ac_build_load_to_sgpr(&ctx->ac, buf_ptr, + LLVMConstInt(ctx->i32, GFX10_GS_QUERY_BUF, false)); +} + /* Send GS Alloc Req message from the first wave of the group to SPI. * Message payload is: * - bits 0..10: vertices in group @@ -209,6 +218,27 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); + /* Update query buffer */ + tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5030); + tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5031); + { + LLVMValueRef args[] = { + ngg_get_prim_cnt(ctx), + ngg_get_query_buf(ctx), + LLVMConstInt(ctx->i32, 16, false), /* offset of stream[0].generated_primitives */ + ctx->i32_0, /* soffset */ + ctx->i32_0, /* cachepolicy */ + }; + + /* TODO: should this be 64-bit atomics? */ + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", + ctx->i32, args, 5, 0); + } + ac_build_endif(&ctx->ac, 5031); + ac_build_endif(&ctx->ac, 5030); + /* Export primitive data to the index buffer. 
Format is: * - bits 0..8: index 0 * - bit 9: edge flag 0 @@ -431,9 +461,34 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); LLVMBuildStore(builder, tmp, primflagptr); + tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); + tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); + LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]); + lp_build_endif(&if_state); } +void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx) +{ + /* Zero out the part of LDS scratch that is used to accumulate the + * per-stream generated primitive count. + */ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef scratchptr = ctx->gs_ngg_scratch; + LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMValueRef tmp; + + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->i32, 4, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5090); + { + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid); + LLVMBuildStore(builder, ctx->i32_0, ptr); + } + ac_build_endif(&ctx->ac, 5090); + + ac_build_s_barrier(&ctx->ac); +} + void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) { const struct si_shader_selector *sel = ctx->shader->selector; @@ -481,6 +536,26 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) ac_build_endloop(&ctx->ac, 5100); } + /* Accumulate generated primitives counts across the entire threadgroup. */ + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + LLVMValueRef numprims = + LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); + numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, 64); + + tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5105); + { + LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, + LLVMConstInt(ctx->i32, stream, false)), + numprims, LLVMAtomicOrderingMonotonic, false); + } + ac_build_endif(&ctx->ac, 5105); + } + lp_build_endif(&ctx->merged_wrap_if_state); ac_build_s_barrier(&ctx->ac); @@ -490,6 +565,33 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) /* TODO: streamout */ + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->i32, 4, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5110); + { + LLVMValueRef offset; + tmp = tid; + if (sel->so.num_outputs) + tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->i32, 3, false), ""); + offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 32, false), ""); + if (sel->so.num_outputs) { + tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->i32, 2, false), ""); + tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 8, false), ""); + offset = LLVMBuildAdd(builder, offset, tmp, ""); + } + + tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); + LLVMValueRef args[] = { + tmp, + ngg_get_query_buf(ctx), + offset, + LLVMConstInt(ctx->i32, 16, false), /* soffset */ + ctx->i32_0, /* cachepolicy */ + }; + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", + ctx->i32, args, 5, 0); + } + ac_build_endif(&ctx->ac, 5110); + /* TODO: culling */ /* Determine vertex liveness. 
*/ diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index 0ca065f34e0..a362c207776 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -21,6 +21,7 @@ files_libradeonsi = files( 'cik_sdma.c', 'driinfo_radeonsi.h', + 'gfx10_query.c', 'gfx10_shader_ngg.c', 'si_blit.c', 'si_buffer.c', diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index c2cee024982..91b474d4d8f 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -168,6 +168,9 @@ static void si_destroy_context(struct pipe_context *context) si_release_all_descriptors(sctx); + if (sctx->chip_class >= GFX10) + gfx10_destroy_query(sctx); + pipe_resource_reference(&sctx->esgs_ring, NULL); pipe_resource_reference(&sctx->gsvs_ring, NULL); pipe_resource_reference(&sctx->tess_rings, NULL); @@ -239,6 +242,8 @@ static void si_destroy_context(struct pipe_context *context) if (sctx->query_result_shader) sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader); + if (sctx->sh_query_result_shader) + sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader); if (sctx->gfx_cs) sctx->ws->cs_destroy(sctx->gfx_cs); @@ -516,6 +521,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, /* Initialize graphics-only context functions. */ if (sctx->has_graphics) { si_init_context_texture_functions(sctx); + if (sctx->chip_class >= GFX10) + gfx10_init_query(sctx); si_init_msaa_functions(sctx); si_init_shader_functions(sctx); si_init_state_functions(sctx); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index a351e5004b1..874b1bf4cd0 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -868,6 +868,7 @@ struct si_context { struct pipe_device_reset_callback device_reset_callback; struct u_log_context *log; void *query_result_shader; + void *sh_query_result_shader; void (*emit_cache_flush)(struct si_context *ctx); @@ -1178,6 +1179,10 @@ struct si_context { unsigned num_sdma_uploads; unsigned max_sdma_uploads; + /* Shader-based queries. */ + struct list_head shader_query_buffers; + unsigned num_active_shader_queries; + /* Statistics gathering for the DCC enablement heuristic. It can't be * in si_texture because si_texture can be shared by multiple * contexts. This is for back buffers only. 
We shouldn't get too many @@ -1439,6 +1444,11 @@ void *si_clear_render_target_shader(struct pipe_context *ctx); void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx); void *si_create_dcc_retile_cs(struct pipe_context *ctx); void *si_create_query_result_cs(struct si_context *sctx); +void *gfx10_create_sh_query_result_cs(struct si_context *sctx); + +/* gfx10_query.c */ +void gfx10_init_query(struct si_context *sctx); +void gfx10_destroy_query(struct si_context *sctx); /* si_test_dma.c */ void si_test_dma(struct si_screen *sscreen); diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index ae6498e1895..394bf7ff124 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -32,8 +32,6 @@ #include "util/u_suballoc.h" #include "amd/common/sid.h" -#define SI_MAX_STREAMS 4 - static const struct si_query_ops query_hw_ops; struct si_hw_query_params { @@ -1015,6 +1013,12 @@ static void si_emit_query_predication(struct si_context *ctx) if (!query) return; + if (ctx->chip_class == GFX10 && + (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) { + assert(!"not implemented"); + } + invert = ctx->render_cond_invert; flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; @@ -1096,6 +1100,14 @@ static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned que query_type != SI_QUERY_TIME_ELAPSED_SDMA)) return si_query_sw_create(query_type); + if (sscreen->info.chip_class >= GFX10 && + (query_type == PIPE_QUERY_PRIMITIVES_EMITTED || + query_type == PIPE_QUERY_PRIMITIVES_GENERATED || + query_type == PIPE_QUERY_SO_STATISTICS || + query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) + return gfx10_sh_query_create(sscreen, query_type, index); + return si_query_hw_create(sscreen, query_type, index); } diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index 82e5e25ed00..dc219f8551c 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -38,6 +38,8 @@ struct si_query_buffer; struct si_query_hw; struct si_resource; +#define SI_MAX_STREAMS 4 + enum { SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC, SI_QUERY_DECOMPRESS_CALLS, @@ -228,6 +230,12 @@ void si_query_hw_suspend(struct si_context *sctx, struct si_query *query); void si_query_hw_resume(struct si_context *sctx, struct si_query *query); +/* Shader-based queries */ +struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, + enum pipe_query_type query_type, + unsigned index); + + /* Performance counters */ struct si_perfcounters { unsigned num_groups; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 9abecdf1003..68506b7a92c 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -6082,6 +6082,8 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx) for (unsigned i = 0; i < 4; ++i) { ctx->gs_curprim_verts[i] = lp_build_alloca(&ctx->gallivm, ctx->ac.i32, ""); + ctx->gs_generated_prims[i] = + lp_build_alloca(&ctx->gallivm, ctx->ac.i32, ""); } LLVMTypeRef a8i32 = LLVMArrayType(ctx->i32, 8); @@ -6135,9 +6137,15 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx) if (ctx->type == PIPE_SHADER_TESS_CTRL || ctx->type == PIPE_SHADER_GEOMETRY) { + if 
(ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) { + gfx10_ngg_gs_emit_prologue(ctx); + nested_barrier = false; + } else { + nested_barrier = true; + } + /* Number of patches / primitives */ num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8); - nested_barrier = true; } else { /* Number of vertices */ num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8); diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 09efc91b9f5..7832f75ef65 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -214,6 +214,7 @@ struct si_shader_context { LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */ LLVMValueRef gs_next_vertex[4]; LLVMValueRef gs_curprim_verts[4]; + LLVMValueRef gs_generated_prims[4]; LLVMValueRef gs_ngg_emit; LLVMValueRef gs_ngg_scratch; LLVMValueRef postponed_kill; @@ -388,6 +389,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs); +void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx); void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx); void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c index b68fd2ff236..9f2f9d30216 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c @@ -660,3 +660,228 @@ void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx) return ctx->create_compute_state(ctx, &state); } + +/* Create the compute shader that is used to collect the results of gfx10+ + * shader queries. + * + * One compute grid with a single thread is launched for every query result + * buffer. The thread (optionally) reads a previous summary buffer, then + * accumulates data from the query result buffer, and writes the result either + * to a summary buffer to be consumed by the next grid invocation or to the + * user-supplied buffer. 
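+ *
+ * For example, a query spanning three ring chunks runs three grids
+ * back-to-back: the first with chain = 2 (write the next summary), the
+ * middle one with chain = 3 (read the previous summary, write the next),
+ * and the last with chain = 1 and BUFFER[2] pointing at the user-supplied
+ * buffer.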
+ * + * Data layout: + * + * BUFFER[0] = query result buffer (layout is defined by gfx10_sh_query_buffer_mem) + * BUFFER[1] = previous summary buffer + * BUFFER[2] = next summary buffer or user-supplied buffer + * + * CONST + * 0.x = config; the low 3 bits indicate the mode: + * 0: sum up counts + * 1: determine result availability and write it as a boolean + * 2: SO_OVERFLOW + * 3: SO_ANY_OVERFLOW + * the remaining bits form a bitfield: + * 8: write result as a 64-bit value + * 0.y = offset in bytes to counts or stream for SO_OVERFLOW mode + * 0.z = chain bit field: + * 1: have previous summary buffer + * 2: write next summary buffer + * 0.w = result_count + */ +void *gfx10_create_sh_query_result_cs(struct si_context *sctx) +{ + /* TEMP[0].x = accumulated result so far + * TEMP[0].y = result missing + * TEMP[0].z = whether we're in overflow mode + */ + static const char text_tmpl[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL BUFFER[0]\n" + "DCL BUFFER[1]\n" + "DCL BUFFER[2]\n" + "DCL CONST[0][0..0]\n" + "DCL TEMP[0..5]\n" + "IMM[0] UINT32 {0, 7, 0, 4294967295}\n" + "IMM[1] UINT32 {1, 2, 4, 8}\n" + "IMM[2] UINT32 {16, 32, 64, 128}\n" + + /* + acc_result = 0; + acc_missing = 0; + if (chain & 1) { + acc_result = buffer[1][0]; + acc_missing = buffer[1][1]; + } + */ + "MOV TEMP[0].xy, IMM[0].xxxx\n" + "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n" + "UIF TEMP[5]\n" + "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n" + "ENDIF\n" + + /* + is_overflow (TEMP[0].z) = (config & 7) >= 2; + result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count; + base_offset (TEMP[1].y) = 0; + for (;;) { + if (!result_remaining) + break; + result_remaining--; + */ + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n" + + "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n" + "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n" + "MOV TEMP[1].y, IMM[0].xxxx\n" + + "BGNLOOP\n" + "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n" + + /* + fence = buffer[0]@(base_offset + 32); + if (!fence) { + acc_missing = ~0u; + break; + } + */ + "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "MOV TEMP[0].y, TEMP[5].xxxx\n" + "BRK\n" + "ENDIF\n" + + /* + stream_offset (TEMP[2].x) = base_offset + offset; + + if (!(config & 7)) { + acc_result += buffer[0]@stream_offset; + } + */ + "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n" + + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n" + "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n" + "ENDIF\n" + + /* + if ((config & 7) >= 2) { + count (TEMP[2].y) = (config & 1) ? 
4 : 1; + */ + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n" + "UIF TEMP[5]\n" + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n" + "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n" + + /* + do { + generated = buffer[0]@stream_offset; + emitted = buffer[0]@(stream_offset + 16); + if (generated != emitted) { + acc_result = 1; + result_remaining = 0; + break; + } + + stream_offset += 4; + } while (--count); + */ + "BGNLOOP\n" + "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n" + "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n" + "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n" + "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n" + "UIF TEMP[5]\n" + "MOV TEMP[0].x, IMM[1].xxxx\n" + "MOV TEMP[1].y, IMM[0].xxxx\n" + "BRK\n" + "ENDIF\n" + + "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n" + "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n" + "ENDLOOP\n" + "ENDIF\n" + + /* + base_offset += 64; + } // end outer loop + */ + "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n" + "ENDLOOP\n" + + /* + if (chain & 2) { + buffer[2][0] = acc_result; + buffer[2][1] = acc_missing; + } else { + */ + "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n" + "ELSE\n" + + /* + if ((config & 7) == 1) { + acc_result = acc_missing ? 0 : 1; + acc_missing = 0; + } + */ + "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n" + "UIF TEMP[5]\n" + "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n" + "MOV TEMP[0].y, IMM[0].xxxx\n" + "ENDIF\n" + + /* + if (!acc_missing) { + buffer[2][0] = acc_result; + if (config & 8) + buffer[2][1] = 0; + } + */ + "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" + + "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {}; + + if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return sctx->b.create_compute_state(&sctx->b, &state); +} diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 678f87cd73d..757dd1bf5cd 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -228,6 +228,7 @@ union si_state_atoms { struct si_atom spi_map; struct si_atom scratch_state; struct si_atom window_rectangles; + struct si_atom shader_query; } s; struct si_atom array[0]; }; @@ -370,6 +371,8 @@ enum { SI_PS_IMAGE_COLORBUF0_FMASK, SI_PS_IMAGE_COLORBUF0_FMASK_HI, + GFX10_GS_QUERY_BUF, + SI_NUM_RW_BUFFERS, };
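As a sanity check on the memory layout that the CP fences, the NGG atomics
and the result shader above all rely on, the following standalone sketch
(not part of the patch; any C11 compiler with _Static_assert will do)
mirrors the gfx10_sh_query_buffer_mem declaration and verifies the three
byte offsets used throughout the series:

#include <stddef.h>
#include <stdint.h>

/* Mirror of the layout declared in gfx10_query.c above. */
struct gfx10_sh_query_buffer_mem {
	struct {
		uint64_t generated_primitives_start_dummy;
		uint64_t emitted_primitives_start_dummy;
		uint64_t generated_primitives;
		uint64_t emitted_primitives;
	} stream[4];
	uint32_t fence;
	uint32_t pad[31];
};

/* Ring entries advance in 256-byte steps (qbuf->head increments). */
_Static_assert(sizeof(struct gfx10_sh_query_buffer_mem) == 256,
	       "ring entries are assumed to have a 256-byte stride");

/* si_cp_release_mem/si_cp_wait_mem address the fence at entry + 128. */
_Static_assert(offsetof(struct gfx10_sh_query_buffer_mem, fence) == 128,
	       "fence dword expected at byte 128 of each entry");

/* The NGG epilogue atomic uses byte offset 16 for stream[0].generated. */
_Static_assert(offsetof(struct gfx10_sh_query_buffer_mem,
			stream[0].generated_primitives) == 16,
	       "shader atomics target stream[0].generated_primitives at byte 16");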