+#include "util/u_memory.h"
+
+static bool r600_is_timer_query(unsigned type)
+{
+	return type == PIPE_QUERY_TIME_ELAPSED ||
+	       type == PIPE_QUERY_TIMESTAMP ||
+	       type == PIPE_QUERY_TIMESTAMP_DISJOINT;
+}
+
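+/* GPU_FINISHED and TIMESTAMP are sampled only once, at end_query time,
+ * so they never emit a begin-of-query packet. */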
+static bool r600_query_needs_begin(unsigned type)
+{
+	return type != PIPE_QUERY_GPU_FINISHED &&
+	       type != PIPE_QUERY_TIMESTAMP;
+}
+
+static struct r600_resource *r600_new_query_buffer(struct r600_context *ctx, unsigned type)
+{
+	unsigned j, i, num_results, buf_size = 4096;
+	uint32_t *results;
+
+	/* Non-GPU queries. */
+	switch (type) {
+	case R600_QUERY_DRAW_CALLS:
+	case R600_QUERY_REQUESTED_VRAM:
+	case R600_QUERY_REQUESTED_GTT:
+	case R600_QUERY_BUFFER_WAIT_TIME:
+		return NULL;
+	}
+
+	/* Queries are normally read by the CPU after
+	 * being written by the GPU, hence staging is probably a good
+	 * usage pattern.
+	 */
+	struct r600_resource *buf = (struct r600_resource*)
+		pipe_buffer_create(&ctx->screen->b.b, PIPE_BIND_CUSTOM,
+				   PIPE_USAGE_STAGING, buf_size);
+
+	switch (type) {
+	case PIPE_QUERY_OCCLUSION_COUNTER:
+	case PIPE_QUERY_OCCLUSION_PREDICATE:
+		results = r600_buffer_mmap_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE);
+		memset(results, 0, buf_size);
+
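+		/* Each result slot holds a (begin, end) pair of 64-bit
+		 * counters per DB, i.e. 16 bytes per backend.  Bit 31 of
+		 * the high dword of each counter is set by the GPU once
+		 * the value has been written. */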
+		/* Set top bits for unused backends. */
+		num_results = buf_size / (16 * ctx->max_db);
+		for (j = 0; j < num_results; j++) {
+			for (i = 0; i < ctx->max_db; i++) {
+				if (!(ctx->backend_mask & (1<<i))) {
+					results[(i * 4)+1] = 0x80000000;
+					results[(i * 4)+3] = 0x80000000;
+				}
+			}
+			results += 4 * ctx->max_db;
+		}
+		ctx->b.ws->buffer_unmap(buf->cs_buf);
+		break;
+	case PIPE_QUERY_TIME_ELAPSED:
+	case PIPE_QUERY_TIMESTAMP:
+		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+	case PIPE_QUERY_PIPELINE_STATISTICS:
+		results = r600_buffer_mmap_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE);
+		memset(results, 0, buf_size);
+		ctx->b.ws->buffer_unmap(buf->cs_buf);
+		break;
+	default:
+		assert(0);
+	}
+	return buf;
+}
+
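+/* Keep a running count of active occlusion queries so the DB counters
+ * are enabled only while at least one of them is in flight. */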
+static void r600_update_occlusion_query_state(struct r600_context *rctx,
+					      unsigned type, int diff)
+{
+	if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
+	    type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+		bool enable;
+
+		rctx->num_occlusion_queries += diff;
+		assert(rctx->num_occlusion_queries >= 0);
+
+		enable = rctx->num_occlusion_queries != 0;
+
+		if (rctx->db_misc_state.occlusion_query_enabled != enable) {
+			rctx->db_misc_state.occlusion_query_enabled = enable;
+			rctx->db_misc_state.atom.dirty = true;
+		}
+	}
+}
+
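+/* Emit the packets that start sampling a query, allocating a fresh
+ * result buffer first when the current one is full. */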
+static void r600_emit_query_begin(struct r600_context *ctx, struct r600_query *query)
+{
+	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	uint64_t va;
+
+	r600_update_occlusion_query_state(ctx, query->type, 1);
+	/* Reserve space for both the begin and the matching end packets. */
+	r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);
+
+	/* Get a new query buffer if needed. */
+	if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
+		struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
+		*qbuf = query->buffer;
+		query->buffer.buf = r600_new_query_buffer(ctx, query->type);
+		query->buffer.results_end = 0;
+		query->buffer.previous = qbuf;
+	}
+
+	/* emit begin query */
+	va = r600_resource_va(&ctx->screen->b.b, (void*)query->buffer.buf);
+	va += query->buffer.results_end;
+
+	switch (query->type) {
+	case PIPE_QUERY_OCCLUSION_COUNTER:
+	case PIPE_QUERY_OCCLUSION_PREDICATE:
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
+		cs->buf[cs->cdw++] = va;
+		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
+		cs->buf[cs->cdw++] = va;
+		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+		break;
+	case PIPE_QUERY_TIME_ELAPSED:
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
+		cs->buf[cs->cdw++] = va;
+		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF); /* DATA_SEL=3: write a 64-bit timestamp */
+		cs->buf[cs->cdw++] = 0;
+		cs->buf[cs->cdw++] = 0;
+		break;
+	case PIPE_QUERY_PIPELINE_STATISTICS:
+		if (!ctx->num_pipelinestat_queries) {
+			cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+			cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) | EVENT_INDEX(0);
+		}
+		ctx->num_pipelinestat_queries++;
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2);
+		cs->buf[cs->cdw++] = va;
+		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+		break;
+	default:
+		assert(0);
+	}
+	/* The NOP packet carries the relocation for the query buffer. */
+	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+	cs->buf[cs->cdw++] = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE);
+
+	if (!r600_is_timer_query(query->type)) {
+		ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw;
+	}
+}
+
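+/* Emit the packets that stop sampling a query.  For queries without a
+ * begin (e.g. TIMESTAMP), this is the only packet that is emitted. */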
+static void r600_emit_query_end(struct r600_context *ctx, struct r600_query *query)
+{
+	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	uint64_t va;
+
+	/* Queries that have a begin already reserved this space in r600_emit_query_begin. */
+	if (!r600_query_needs_begin(query->type)) {
+		r600_need_cs_space(ctx, query->num_cs_dw, FALSE);
+	}
+
+	va = r600_resource_va(&ctx->screen->b.b, (void*)query->buffer.buf);
+	/* emit end query */
+	switch (query->type) {
+	case PIPE_QUERY_OCCLUSION_COUNTER:
+	case PIPE_QUERY_OCCLUSION_PREDICATE:
+		/* The end counters are the second 64-bit value of each per-DB pair. */
+		va += query->buffer.results_end + 8;
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
+		cs->buf[cs->cdw++] = va;
+		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		/* The end results are stored in the second half of the slot. */
+		va += query->buffer.results_end + query->result_size/2;
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
+		cs->buf[cs->cdw++] = va;
+		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+		break;
+	case PIPE_QUERY_TIME_ELAPSED:
+		va += query->buffer.results_end + query->result_size/2;
+		/* fall through */
+	case PIPE_QUERY_TIMESTAMP:
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
+		cs->buf[cs->cdw++] = va;
+		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
+		cs->buf[cs->cdw++] = 0;
+		cs->buf[cs->cdw++] = 0;
+		break;
+	case PIPE_QUERY_PIPELINE_STATISTICS:
+		assert(ctx->num_pipelinestat_queries > 0);
+		ctx->num_pipelinestat_queries--;
+		if (!ctx->num_pipelinestat_queries) {
+			cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+			cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_STOP) | EVENT_INDEX(0);
+		}
+		va += query->buffer.results_end + query->result_size/2;
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2);
+		cs->buf[cs->cdw++] = va;
+		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+		break;
+	default:
+		assert(0);
+	}
+	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+	cs->buf[cs->cdw++] = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE);
+
+	query->buffer.results_end += query->result_size;
+
+	if (r600_query_needs_begin(query->type)) {
+		if (!r600_is_timer_query(query->type)) {
+			ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw;
+		}
+	}
+
+	r600_update_occlusion_query_state(ctx, query->type, -1);
+}
+
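+/* Program SET_PREDICATION: either clear it, or emit one predication
+ * packet per result block, setting the CONTINUE bit on every packet
+ * after the first so the hardware accumulates the result across all
+ * blocks. */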
+static void r600_emit_query_predication(struct r600_context *ctx, struct r600_query *query,
+					int operation, bool flag_wait)
+{
+	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+
+	if (operation == PREDICATION_OP_CLEAR) {
+		r600_need_cs_space(ctx, 3, FALSE);
+
+		cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
+		cs->buf[cs->cdw++] = 0;
+		cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
+	} else {
+		struct r600_query_buffer *qbuf;
+		unsigned count;
+		uint32_t op;
+
+		/* Find how many results there are. */
+		count = 0;
+		for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+			count += qbuf->results_end / query->result_size;
+		}
+
+		r600_need_cs_space(ctx, 5 * count, TRUE);
+
+		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
+		     (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
+
+		/* emit predicate packets for all data blocks */
+		for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+			unsigned results_base = 0;
+			uint64_t va = r600_resource_va(&ctx->screen->b.b, &qbuf->buf->b.b);
+
+			while (results_base < qbuf->results_end) {
+				cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
+				cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
+				cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
+				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+				cs->buf[cs->cdw++] = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx, qbuf->buf, RADEON_USAGE_READ);
+				results_base += query->result_size;
+
+				/* set CONTINUE bit for all packets except the first */
+				op |= PREDICATION_CONTINUE;
+			}
+		}
+	}
+}