radeonsi: remove redundant no-signed-zero-fp-math LLVM attribute
diff --git a/src/gallium/drivers/radeonsi/gfx10_query.c b/src/gallium/drivers/radeonsi/gfx10_query.c
index c0a0bc8ce57fe70edfa7ccd9c8950f196b5dfdf2..aedf5090eed1b9ac7151c86d3fd543c51587155f 100644
--- a/src/gallium/drivers/radeonsi/gfx10_query.c
+++ b/src/gallium/drivers/radeonsi/gfx10_query.c
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include <stddef.h>
-
 #include "si_pipe.h"
 #include "si_query.h"
+#include "sid.h"
 #include "util/u_memory.h"
 #include "util/u_suballoc.h"
-#include "sid.h"
+
+#include <stddef.h>
 
 /**
  * The query buffer is written to by ESGS NGG shaders with statistics about
  * without additional GPU cost.
  */
 struct gfx10_sh_query_buffer {
-       struct list_head list;
-       struct si_resource *buf;
-       unsigned refcount;
+   struct list_head list;
+   struct si_resource *buf;
+   unsigned refcount;
 
-       /* Offset into the buffer in bytes; points at the first un-emitted entry. */
-       unsigned head;
+   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
+   unsigned head;
 };
 
 /* Memory layout of the query buffer. Must be kept in sync with shaders
@@ -55,469 +55,454 @@ struct gfx10_sh_query_buffer {
  * of all those values unconditionally.
  */
 struct gfx10_sh_query_buffer_mem {
-       struct {
-               uint64_t generated_primitives_start_dummy;
-               uint64_t emitted_primitives_start_dummy;
-               uint64_t generated_primitives;
-               uint64_t emitted_primitives;
-       } stream[4];
-       uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
-       uint32_t pad[31];
+   struct {
+      uint64_t generated_primitives_start_dummy;
+      uint64_t emitted_primitives_start_dummy;
+      uint64_t generated_primitives;
+      uint64_t emitted_primitives;
+   } stream[4];
+   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
+   uint32_t pad[31];
 };
 
 /* Shader-based queries. */
 struct gfx10_sh_query {
-       struct si_query b;
+   struct si_query b;
 
-       struct gfx10_sh_query_buffer *first;
-       struct gfx10_sh_query_buffer *last;
-       unsigned first_begin;
-       unsigned last_end;
+   struct gfx10_sh_query_buffer *first;
+   struct gfx10_sh_query_buffer *last;
+   unsigned first_begin;
+   unsigned last_end;
 
-       unsigned stream;
+   unsigned stream;
 };
 
 static void emit_shader_query(struct si_context *sctx)
 {
-       assert(!list_is_empty(&sctx->shader_query_buffers));
+   assert(!list_is_empty(&sctx->shader_query_buffers));
 
-       struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers,
-                                                            struct gfx10_sh_query_buffer, list);
-       qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
+   struct gfx10_sh_query_buffer *qbuf =
+      list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
 }
 
 static void gfx10_release_query_buffers(struct si_context *sctx,
-                                       struct gfx10_sh_query_buffer *first,
-                                       struct gfx10_sh_query_buffer *last)
+                                        struct gfx10_sh_query_buffer *first,
+                                        struct gfx10_sh_query_buffer *last)
 {
-       while (first) {
-               struct gfx10_sh_query_buffer *qbuf = first;
-               if (first != last)
-                       first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
-               else
-                       first = NULL;
-
-               qbuf->refcount--;
-               if (qbuf->refcount)
-                       continue;
-
-               if (qbuf->list.next == &sctx->shader_query_buffers)
-                       continue; /* keep the most recent buffer; it may not be full yet */
-               if (qbuf->list.prev == &sctx->shader_query_buffers)
-                       continue; /* keep the oldest buffer for recycling */
-
-               list_del(&qbuf->list);
-               si_resource_reference(&qbuf->buf, NULL);
-               FREE(qbuf);
-       }
+   while (first) {
+      struct gfx10_sh_query_buffer *qbuf = first;
+      if (first != last)
+         first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+      else
+         first = NULL;
+
+      qbuf->refcount--;
+      if (qbuf->refcount)
+         continue;
+
+      if (qbuf->list.next == &sctx->shader_query_buffers)
+         continue; /* keep the most recent buffer; it may not be full yet */
+      if (qbuf->list.prev == &sctx->shader_query_buffers)
+         continue; /* keep the oldest buffer for recycling */
+
+      list_del(&qbuf->list);
+      si_resource_reference(&qbuf->buf, NULL);
+      FREE(qbuf);
+   }
 }
 
 static bool gfx10_alloc_query_buffer(struct si_context *sctx)
 {
-       if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
-               return true;
-
-       struct gfx10_sh_query_buffer *qbuf = NULL;
-
-       if (!list_is_empty(&sctx->shader_query_buffers)) {
-               qbuf = list_last_entry(&sctx->shader_query_buffers,
-                                      struct gfx10_sh_query_buffer, list);
-               if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
-                       goto success;
-
-               qbuf = list_first_entry(&sctx->shader_query_buffers,
-                                       struct gfx10_sh_query_buffer, list);
-               if (!qbuf->refcount &&
-                   !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
-                   sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
-                       /* Can immediately re-use the oldest buffer */
-                       list_del(&qbuf->list);
-               } else {
-                       qbuf = NULL;
-               }
-       }
-
-       if (!qbuf) {
-               qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
-               if (unlikely(!qbuf))
-                       return false;
-
-               struct si_screen *screen = sctx->screen;
-               unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem),
-                                        screen->info.min_alloc_size);
-               qbuf->buf = si_resource(
-                       pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
-               if (unlikely(!qbuf->buf)) {
-                       FREE(qbuf);
-                       return false;
-               }
-       }
-
-       /* The buffer is currently unused by the GPU. Initialize it.
-        *
-        * We need to set the high bit of all the primitive counters for
-        * compatibility with the SET_PREDICATION packet.
-        */
-       uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
-                                                PIPE_TRANSFER_WRITE |
-                                                PIPE_TRANSFER_UNSYNCHRONIZED);
-       assert(results);
-
-       for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
-            i < e; ++i) {
-               for (unsigned j = 0; j < 16; ++j)
-                       results[32 * i + j] = (uint64_t)1 << 63;
-               results[32 * i + 16] = 0;
-       }
-
-       list_addtail(&qbuf->list, &sctx->shader_query_buffers);
-       qbuf->head = 0;
-       qbuf->refcount = sctx->num_active_shader_queries;
+   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
+      return true;
+
+   struct gfx10_sh_query_buffer *qbuf = NULL;
+
+   if (!list_is_empty(&sctx->shader_query_buffers)) {
+      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
+         goto success;
+
+      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+      if (!qbuf->refcount &&
+          !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
+          sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
+         /* Can immediately re-use the oldest buffer */
+         list_del(&qbuf->list);
+      } else {
+         qbuf = NULL;
+      }
+   }
+
+   if (!qbuf) {
+      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
+      if (unlikely(!qbuf))
+         return false;
+
+      struct si_screen *screen = sctx->screen;
+      unsigned buf_size =
+         MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
+      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
+      if (unlikely(!qbuf->buf)) {
+         FREE(qbuf);
+         return false;
+      }
+   }
+
+   /* The buffer is currently unused by the GPU. Initialize it.
+    *
+    * We need to set the high bit of all the primitive counters for
+    * compatibility with the SET_PREDICATION packet.
+    */
+   uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
+                                            PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);
+   assert(results);
+
+   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
+        ++i) {
+      for (unsigned j = 0; j < 16; ++j)
+         results[32 * i + j] = (uint64_t)1 << 63;
+      results[32 * i + 16] = 0;
+   }
+
+   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
+   qbuf->head = 0;
+   qbuf->refcount = sctx->num_active_shader_queries;
 
 success:;
-       struct pipe_shader_buffer sbuf;
-       sbuf.buffer = &qbuf->buf->b.b;
-       sbuf.buffer_offset = qbuf->head;
-       sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
-       si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
-       sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);
-
-       si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
-       return true;
+   struct pipe_shader_buffer sbuf;
+   sbuf.buffer = &qbuf->buf->b.b;
+   sbuf.buffer_offset = qbuf->head;
+   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
+   si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
+   sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);
+
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
+   return true;
 }
 
 static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
 {
-       struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
-       gfx10_release_query_buffers(sctx, query->first, query->last);
-       FREE(query);
+   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+   gfx10_release_query_buffers(sctx, query->first, query->last);
+   FREE(query);
 }
 
 static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
 {
-       struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
 
-       gfx10_release_query_buffers(sctx, query->first, query->last);
-       query->first = query->last = NULL;
+   gfx10_release_query_buffers(sctx, query->first, query->last);
+   query->first = query->last = NULL;
 
-       if (unlikely(!gfx10_alloc_query_buffer(sctx)))
-               return false;
+   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
+      return false;
 
-       query->first = list_last_entry(&sctx->shader_query_buffers,
-                                      struct gfx10_sh_query_buffer, list);
-       query->first_begin = query->first->head;
+   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+   query->first_begin = query->first->head;
 
-       sctx->num_active_shader_queries++;
-       query->first->refcount++;
+   sctx->num_active_shader_queries++;
+   query->first->refcount++;
 
-       return true;
+   return true;
 }
 
 static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
 {
-       struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
-
-       if (unlikely(!query->first))
-               return false; /* earlier out of memory error */
-
-       query->last = list_last_entry(&sctx->shader_query_buffers,
-                                     struct gfx10_sh_query_buffer, list);
-       query->last_end = query->last->head;
-
-       /* Signal the fence of the previous chunk */
-       if (query->last_end != 0) {
-               uint64_t fence_va = query->last->buf->gpu_address;
-               fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
-               fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
-               si_cp_release_mem(sctx, sctx->gfx_cs,
-                                 V_028A90_BOTTOM_OF_PIPE_TS, 0,
-                                 EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
-                                 EOP_DATA_SEL_VALUE_32BIT,
-                                 query->last->buf, fence_va, 0xffffffff,
-                                 PIPE_QUERY_GPU_FINISHED);
-       }
-
-       sctx->num_active_shader_queries--;
-
-       if (sctx->num_active_shader_queries > 0) {
-               gfx10_alloc_query_buffer(sctx);
-       } else {
-               si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
-               sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;
-
-               /* If a query_begin is followed by a query_end without a draw
-                * in-between, we need to clear the atom to ensure that the
-                * next query_begin will re-initialize the shader buffer. */
-               si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
-       }
-
-       return true;
+   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+   if (unlikely(!query->first))
+      return false; /* earlier out of memory error */
+
+   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+   query->last_end = query->last->head;
+
+   /* Signal the fence of the previous chunk */
+   if (query->last_end != 0) {
+      uint64_t fence_va = query->last->buf->gpu_address;
+      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
+      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
+                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
+                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
+   }
+
+   sctx->num_active_shader_queries--;
+
+   if (sctx->num_active_shader_queries > 0) {
+      gfx10_alloc_query_buffer(sctx);
+   } else {
+      si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
+      sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;
+
+      /* If a query_begin is followed by a query_end without a draw
+       * in-between, we need to clear the atom to ensure that the
+       * next query_begin will re-initialize the shader buffer. */
+      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
+   }
+
+   return true;
 }
 
 static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
-                                     struct gfx10_sh_query_buffer_mem *qmem,
-                                     union pipe_query_result *result)
+                                      struct gfx10_sh_query_buffer_mem *qmem,
+                                      union pipe_query_result *result)
 {
-       static const uint64_t mask = ((uint64_t)1 << 63) - 1;
-
-       switch (query->b.type) {
-       case PIPE_QUERY_PRIMITIVES_EMITTED:
-               result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
-               break;
-       case PIPE_QUERY_PRIMITIVES_GENERATED:
-               result->u64 += qmem->stream[query->stream].generated_primitives & mask;
-               break;
-       case PIPE_QUERY_SO_STATISTICS:
-               result->so_statistics.num_primitives_written +=
-                       qmem->stream[query->stream].emitted_primitives & mask;
-               result->so_statistics.primitives_storage_needed +=
-                       qmem->stream[query->stream].generated_primitives & mask;
-               break;
-       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-               result->b |= qmem->stream[query->stream].emitted_primitives !=
-                            qmem->stream[query->stream].generated_primitives;
-               break;
-       case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
-                       result->b |= qmem->stream[query->stream].emitted_primitives !=
-                                    qmem->stream[query->stream].generated_primitives;
-               }
-               break;
-       default:
-               assert(0);
-       }
+   static const uint64_t mask = ((uint64_t)1 << 63) - 1;
+
+   switch (query->b.type) {
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      result->so_statistics.num_primitives_written +=
+         qmem->stream[query->stream].emitted_primitives & mask;
+      result->so_statistics.primitives_storage_needed +=
+         qmem->stream[query->stream].generated_primitives & mask;
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      result->b |= qmem->stream[query->stream].emitted_primitives !=
+                   qmem->stream[query->stream].generated_primitives;
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+         result->b |= qmem->stream[query->stream].emitted_primitives !=
+                      qmem->stream[query->stream].generated_primitives;
+      }
+      break;
+   default:
+      assert(0);
+   }
 }
 
-static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
-                                     bool wait, union pipe_query_result *result)
+static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
+                                      union pipe_query_result *result)
 {
-       struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
 
-       util_query_clear_result(result, query->b.type);
+   util_query_clear_result(result, query->b.type);
 
-       if (unlikely(!query->first))
-               return false; /* earlier out of memory error */
-       assert(query->last);
+   if (unlikely(!query->first))
+      return false; /* earlier out of memory error */
+   assert(query->last);
 
-       for (struct gfx10_sh_query_buffer *qbuf = query->last;;
-            qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
-               unsigned usage = PIPE_TRANSFER_READ |
-                                (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
-               void *map;
+   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
+        qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
+      unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+      void *map;
 
-               if (rquery->b.flushed)
-                       map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
-               else
-                       map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+      if (rquery->b.flushed)
+         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+      else
+         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
 
-               if (!map)
-                       return false;
+      if (!map)
+         return false;
 
-               unsigned results_begin = 0;
-               unsigned results_end = qbuf->head;
-               if (qbuf == query->first)
-                       results_begin = query->first_begin;
-               if (qbuf == query->last)
-                       results_end = query->last_end;
+      unsigned results_begin = 0;
+      unsigned results_end = qbuf->head;
+      if (qbuf == query->first)
+         results_begin = query->first_begin;
+      if (qbuf == query->last)
+         results_end = query->last_end;
 
-               while (results_begin != results_end) {
-                       struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
-                       results_begin += sizeof(*qmem);
+      while (results_begin != results_end) {
+         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
+         results_begin += sizeof(*qmem);
 
-                       gfx10_sh_query_add_result(query, qmem, result);
-               }
+         gfx10_sh_query_add_result(query, qmem, result);
+      }
 
-               if (qbuf == query->first)
-                       break;
-       }
+      if (qbuf == query->first)
+         break;
+   }
 
-       return true;
+   return true;
 }
 
-static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
-                                              struct si_query *rquery,
-                                              bool wait,
-                                              enum pipe_query_value_type result_type,
-                                              int index,
-                                              struct pipe_resource *resource,
-                                              unsigned offset)
+static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
+                                               bool wait, enum pipe_query_value_type result_type,
+                                               int index, struct pipe_resource *resource,
+                                               unsigned offset)
 {
-       struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
-       struct si_qbo_state saved_state = {};
-       struct pipe_resource *tmp_buffer = NULL;
-       unsigned tmp_buffer_offset = 0;
-
-       if (!sctx->sh_query_result_shader) {
-               sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
-               if (!sctx->sh_query_result_shader)
-                       return;
-       }
-
-       if (query->first != query->last) {
-               u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
-                                    &tmp_buffer_offset, &tmp_buffer);
-               if (!tmp_buffer)
-                       return;
-       }
-
-       si_save_qbo_state(sctx, &saved_state);
-
-       /* Pre-fill the constants configuring the shader behavior. */
-       struct {
-               uint32_t config;
-               uint32_t offset;
-               uint32_t chain;
-               uint32_t result_count;
-       } consts;
-       struct pipe_constant_buffer constant_buffer = {};
-
-       if (index >= 0) {
-               switch (query->b.type) {
-               case PIPE_QUERY_PRIMITIVES_GENERATED:
-                       consts.offset = sizeof(uint32_t) * query->stream;
-                       consts.config = 0;
-                       break;
-               case PIPE_QUERY_PRIMITIVES_EMITTED:
-                       consts.offset = sizeof(uint32_t) * (4 + query->stream);
-                       consts.config = 0;
-                       break;
-               case PIPE_QUERY_SO_STATISTICS:
-                       consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
-                       consts.config = 0;
-                       break;
-               case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-                       consts.offset = sizeof(uint32_t) * query->stream;
-                       consts.config = 2;
-                       break;
-               case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-                       consts.offset = 0;
-                       consts.config = 3;
-                       break;
-               default: unreachable("bad query type");
-               }
-       } else {
-               /* Check result availability. */
-               consts.offset = 0;
-               consts.config = 1;
-       }
-
-       if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
-               consts.config |= 8;
-
-       constant_buffer.buffer_size = sizeof(consts);
-       constant_buffer.user_buffer = &consts;
-
-       /* Pre-fill the SSBOs and grid. */
-       struct pipe_shader_buffer ssbo[3];
-       struct pipe_grid_info grid = {};
-
-       ssbo[1].buffer = tmp_buffer;
-       ssbo[1].buffer_offset = tmp_buffer_offset;
-       ssbo[1].buffer_size = 16;
-
-       ssbo[2] = ssbo[1];
-
-       sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);
-
-       grid.block[0] = 1;
-       grid.block[1] = 1;
-       grid.block[2] = 1;
-       grid.grid[0] = 1;
-       grid.grid[1] = 1;
-       grid.grid[2] = 1;
-
-       struct gfx10_sh_query_buffer *qbuf = query->first;
-       for (;;) {
-               unsigned begin = qbuf == query->first ? query->first_begin : 0;
-               unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
-               if (!end)
-                       continue;
-
-               ssbo[0].buffer = &qbuf->buf->b.b;
-               ssbo[0].buffer_offset = begin;
-               ssbo[0].buffer_size = end - begin;
-
-               consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
-               consts.chain = 0;
-               if (qbuf != query->first)
-                       consts.chain |= 1;
-               if (qbuf != query->last)
-                       consts.chain |= 2;
-
-               if (qbuf == query->last) {
-                       ssbo[2].buffer = resource;
-                       ssbo[2].buffer_offset = offset;
-                       ssbo[2].buffer_size = 8;
-               }
-
-               sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
-               sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);
-
-               if (wait) {
-                       uint64_t va;
-
-                       /* Wait for result availability. Wait only for readiness
-                        * of the last entry, since the fence writes should be
-                        * serialized in the CP.
-                        */
-                       va = qbuf->buf->gpu_address;
-                       va += end - sizeof(struct gfx10_sh_query_buffer_mem);
-                       va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
-
-                       si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
-               }
-
-               sctx->b.launch_grid(&sctx->b, &grid);
-               sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-
-               if (qbuf == query->last)
-                       break;
-               qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
-       }
-
-       si_restore_qbo_state(sctx, &saved_state);
-       pipe_resource_reference(&tmp_buffer, NULL);
+   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+   struct si_qbo_state saved_state = {};
+   struct pipe_resource *tmp_buffer = NULL;
+   unsigned tmp_buffer_offset = 0;
+
+   if (!sctx->sh_query_result_shader) {
+      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
+      if (!sctx->sh_query_result_shader)
+         return;
+   }
+
+   if (query->first != query->last) {
+      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
+      if (!tmp_buffer)
+         return;
+   }
+
+   si_save_qbo_state(sctx, &saved_state);
+
+   /* Pre-fill the constants configuring the shader behavior. */
+   struct {
+      uint32_t config;
+      uint32_t offset;
+      uint32_t chain;
+      uint32_t result_count;
+   } consts;
+   struct pipe_constant_buffer constant_buffer = {};
+
+   if (index >= 0) {
+      switch (query->b.type) {
+      case PIPE_QUERY_PRIMITIVES_GENERATED:
+         consts.offset = sizeof(uint32_t) * query->stream;
+         consts.config = 0;
+         break;
+      case PIPE_QUERY_PRIMITIVES_EMITTED:
+         consts.offset = sizeof(uint32_t) * (4 + query->stream);
+         consts.config = 0;
+         break;
+      case PIPE_QUERY_SO_STATISTICS:
+         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
+         consts.config = 0;
+         break;
+      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+         consts.offset = sizeof(uint32_t) * query->stream;
+         consts.config = 2;
+         break;
+      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+         consts.offset = 0;
+         consts.config = 3;
+         break;
+      default:
+         unreachable("bad query type");
+      }
+   } else {
+      /* Check result availability. */
+      consts.offset = 0;
+      consts.config = 1;
+   }
+
+   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
+      consts.config |= 8;
+
+   constant_buffer.buffer_size = sizeof(consts);
+   constant_buffer.user_buffer = &consts;
+
+   /* Pre-fill the SSBOs and grid. */
+   struct pipe_shader_buffer ssbo[3];
+   struct pipe_grid_info grid = {};
+
+   ssbo[1].buffer = tmp_buffer;
+   ssbo[1].buffer_offset = tmp_buffer_offset;
+   ssbo[1].buffer_size = 16;
+
+   ssbo[2] = ssbo[1];
+
+   sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);
+
+   grid.block[0] = 1;
+   grid.block[1] = 1;
+   grid.block[2] = 1;
+   grid.grid[0] = 1;
+   grid.grid[1] = 1;
+   grid.grid[2] = 1;
+
+   struct gfx10_sh_query_buffer *qbuf = query->first;
+   for (;;) {
+      unsigned begin = qbuf == query->first ? query->first_begin : 0;
+      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
+      if (!end)
+         continue;
+
+      ssbo[0].buffer = &qbuf->buf->b.b;
+      ssbo[0].buffer_offset = begin;
+      ssbo[0].buffer_size = end - begin;
+
+      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
+      consts.chain = 0;
+      if (qbuf != query->first)
+         consts.chain |= 1;
+      if (qbuf != query->last)
+         consts.chain |= 2;
+
+      if (qbuf == query->last) {
+         ssbo[2].buffer = resource;
+         ssbo[2].buffer_offset = offset;
+         ssbo[2].buffer_size = 8;
+      }
+
+      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);
+
+      if (wait) {
+         uint64_t va;
+
+         /* Wait for result availability. Wait only for readiness
+          * of the last entry, since the fence writes should be
+          * serialized in the CP.
+          */
+         va = qbuf->buf->gpu_address;
+         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
+         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+
+         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
+      }
+
+      sctx->b.launch_grid(&sctx->b, &grid);
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+      if (qbuf == query->last)
+         break;
+      qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+   }
+
+   si_restore_qbo_state(sctx, &saved_state);
+   pipe_resource_reference(&tmp_buffer, NULL);
 }
 
 static const struct si_query_ops gfx10_sh_query_ops = {
-       .destroy = gfx10_sh_query_destroy,
-       .begin = gfx10_sh_query_begin,
-       .end = gfx10_sh_query_end,
-       .get_result = gfx10_sh_query_get_result,
-       .get_result_resource = gfx10_sh_query_get_result_resource,
+   .destroy = gfx10_sh_query_destroy,
+   .begin = gfx10_sh_query_begin,
+   .end = gfx10_sh_query_end,
+   .get_result = gfx10_sh_query_get_result,
+   .get_result_resource = gfx10_sh_query_get_result_resource,
 };
 
-struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
-                                        enum pipe_query_type query_type,
-                                        unsigned index)
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
+                                         unsigned index)
 {
-       struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
-       if (unlikely(!query))
-               return NULL;
+   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
+   if (unlikely(!query))
+      return NULL;
 
-       query->b.ops = &gfx10_sh_query_ops;
-       query->b.type = query_type;
-       query->stream = index;
+   query->b.ops = &gfx10_sh_query_ops;
+   query->b.type = query_type;
+   query->stream = index;
 
-       return (struct pipe_query *)query;
+   return (struct pipe_query *)query;
 }
 
 void gfx10_init_query(struct si_context *sctx)
 {
-       list_inithead(&sctx->shader_query_buffers);
-       sctx->atoms.s.shader_query.emit = emit_shader_query;
+   list_inithead(&sctx->shader_query_buffers);
+   sctx->atoms.s.shader_query.emit = emit_shader_query;
 }
 
 void gfx10_destroy_query(struct si_context *sctx)
 {
-       while (!list_is_empty(&sctx->shader_query_buffers)) {
-               struct gfx10_sh_query_buffer *qbuf =
-                       list_first_entry(&sctx->shader_query_buffers,
-                                        struct gfx10_sh_query_buffer, list);
-               list_del(&qbuf->list);
-
-               assert(!qbuf->refcount);
-               si_resource_reference(&qbuf->buf, NULL);
-               FREE(qbuf);
-       }
+   while (!list_is_empty(&sctx->shader_query_buffers)) {
+      struct gfx10_sh_query_buffer *qbuf =
+         list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+      list_del(&qbuf->list);
+
+      assert(!qbuf->refcount);
+      si_resource_reference(&qbuf->buf, NULL);
+      FREE(qbuf);
+   }
 }