r600g: move streamout state to drivers/radeon
[mesa.git] / src / gallium / drivers / r600 / r600_query.c
index 782ad26e3b26a275e285f98fbf6484be737c5f9a..457c9ad39038c6a9ad19ef6005aa0f0407eaa8ae 100644 (file)
@@ -41,12 +41,22 @@ static struct r600_resource *r600_new_query_buffer(struct r600_context *ctx, uns
 {
        unsigned j, i, num_results, buf_size = 4096;
        uint32_t *results;
+
+       /* Non-GPU queries. */
+       switch (type) {
+       case R600_QUERY_DRAW_CALLS:
+       case R600_QUERY_REQUESTED_VRAM:
+       case R600_QUERY_REQUESTED_GTT:
+       case R600_QUERY_BUFFER_WAIT_TIME:
+               return NULL;
+       }
+
        /* Queries are normally read by the CPU after
         * being written by the gpu, hence staging is probably a good
         * usage pattern.
         */
        struct r600_resource *buf = (struct r600_resource*)
-               pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM,
+               pipe_buffer_create(&ctx->screen->b.b, PIPE_BIND_CUSTOM,
                                   PIPE_USAGE_STAGING, buf_size);
 
        switch (type) {
@@ -66,7 +76,7 @@ static struct r600_resource *r600_new_query_buffer(struct r600_context *ctx, uns
                        }
                        results += 4 * ctx->max_db;
                }
-               ctx->ws->buffer_unmap(buf->cs_buf);
+               ctx->b.ws->buffer_unmap(buf->cs_buf);
                break;
        case PIPE_QUERY_TIME_ELAPSED:
        case PIPE_QUERY_TIMESTAMP:
@@ -75,9 +85,10 @@ static struct r600_resource *r600_new_query_buffer(struct r600_context *ctx, uns
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+       case PIPE_QUERY_PIPELINE_STATISTICS:
                results = r600_buffer_mmap_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE);
                memset(results, 0, buf_size);
-               ctx->ws->buffer_unmap(buf->cs_buf);
+               ctx->b.ws->buffer_unmap(buf->cs_buf);
                break;
        default:
                assert(0);
@@ -106,7 +117,7 @@ static void r600_update_occlusion_query_state(struct r600_context *rctx,
 
 static void r600_emit_query_begin(struct r600_context *ctx, struct r600_query *query)
 {
-       struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+       struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
        uint64_t va;
 
        r600_update_occlusion_query_state(ctx, query->type, 1);
@@ -122,7 +133,7 @@ static void r600_emit_query_begin(struct r600_context *ctx, struct r600_query *q
        }
 
        /* emit begin query */
-       va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer.buf);
+       va = r600_resource_va(&ctx->screen->b.b, (void*)query->buffer.buf);
        va += query->buffer.results_end;
 
        switch (query->type) {
@@ -150,11 +161,22 @@ static void r600_emit_query_begin(struct r600_context *ctx, struct r600_query *q
                cs->buf[cs->cdw++] = 0;
                cs->buf[cs->cdw++] = 0;
                break;
+       case PIPE_QUERY_PIPELINE_STATISTICS:
+               if (!ctx->num_pipelinestat_queries) {
+                       cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+                       cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) | EVENT_INDEX(0);
+               }
+               ctx->num_pipelinestat_queries++;
+               cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+               cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2);
+               cs->buf[cs->cdw++] = va;
+               cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+               break;
        default:
                assert(0);
        }
        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
-       cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE);
+       cs->buf[cs->cdw++] = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE);
 
        if (!r600_is_timer_query(query->type)) {
                ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw;
@@ -163,7 +185,7 @@ static void r600_emit_query_begin(struct r600_context *ctx, struct r600_query *q
 
 static void r600_emit_query_end(struct r600_context *ctx, struct r600_query *query)
 {
-       struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+       struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
        uint64_t va;
 
        /* The queries which need begin already called this in begin_query. */
@@ -171,7 +193,7 @@ static void r600_emit_query_end(struct r600_context *ctx, struct r600_query *que
                r600_need_cs_space(ctx, query->num_cs_dw, FALSE);
        }
 
-       va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer.buf);
+       va = r600_resource_va(&ctx->screen->b.b, (void*)query->buffer.buf);
        /* emit end query */
        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
@@ -203,11 +225,24 @@ static void r600_emit_query_end(struct r600_context *ctx, struct r600_query *que
                cs->buf[cs->cdw++] = 0;
                cs->buf[cs->cdw++] = 0;
                break;
+       case PIPE_QUERY_PIPELINE_STATISTICS:
+               assert(ctx->num_pipelinestat_queries > 0);
+               ctx->num_pipelinestat_queries--;
+               if (!ctx->num_pipelinestat_queries) {
+                       cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+                       cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_STOP) | EVENT_INDEX(0);
+               }
+               va += query->buffer.results_end + query->result_size/2;
+               cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+               cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2);
+               cs->buf[cs->cdw++] = va;
+               cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+               break;
        default:
                assert(0);
        }
        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
-       cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE);
+       cs->buf[cs->cdw++] = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE);
 
        query->buffer.results_end += query->result_size;
 
@@ -223,7 +258,7 @@ static void r600_emit_query_end(struct r600_context *ctx, struct r600_query *que
 static void r600_emit_query_predication(struct r600_context *ctx, struct r600_query *query,
                                        int operation, bool flag_wait)
 {
-       struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+       struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
 
        if (operation == PREDICATION_OP_CLEAR) {
                r600_need_cs_space(ctx, 3, FALSE);
@@ -250,14 +285,14 @@ static void r600_emit_query_predication(struct r600_context *ctx, struct r600_qu
                /* emit predicate packets for all data blocks */
                for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
                        unsigned results_base = 0;
-                       uint64_t va = r600_resource_va(&ctx->screen->screen, &qbuf->buf->b.b);
+                       uint64_t va = r600_resource_va(&ctx->screen->b.b, &qbuf->buf->b.b);
 
                        while (results_base < qbuf->results_end) {
                                cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
                                cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
                                cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
                                cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
-                               cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ);
+                               cs->buf[cs->cdw++] = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx, qbuf->buf, RADEON_USAGE_READ);
                                results_base += query->result_size;
 
                                /* set CONTINUE bit for all packets except the first */
@@ -270,8 +305,8 @@ static void r600_emit_query_predication(struct r600_context *ctx, struct r600_qu
 static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type)
 {
        struct r600_context *rctx = (struct r600_context *)ctx;
-
        struct r600_query *query;
+       bool skip_allocation = false;
 
        query = CALLOC_STRUCT(r600_query);
        if (query == NULL)
@@ -301,16 +336,30 @@ static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned q
                query->result_size = 32;
                query->num_cs_dw = 6;
                break;
+       case PIPE_QUERY_PIPELINE_STATISTICS:
+               /* 11 values on EG, 8 on R600. */
+               query->result_size = (rctx->b.chip_class >= EVERGREEN ? 11 : 8) * 16;
+               query->num_cs_dw = 8;
+               break;
+       /* Non-GPU queries. */
+       case R600_QUERY_DRAW_CALLS:
+       case R600_QUERY_REQUESTED_VRAM:
+       case R600_QUERY_REQUESTED_GTT:
+       case R600_QUERY_BUFFER_WAIT_TIME:
+               skip_allocation = true;
+               break;
        default:
                assert(0);
                FREE(query);
                return NULL;
        }
 
-       query->buffer.buf = r600_new_query_buffer(rctx, query_type);
-       if (!query->buffer.buf) {
-               FREE(query);
-               return NULL;
+       if (!skip_allocation) {
+               query->buffer.buf = r600_new_query_buffer(rctx, query_type);
+               if (!query->buffer.buf) {
+                       FREE(query);
+                       return NULL;
+               }
        }
        return (struct pipe_query*)query;
 }
@@ -343,6 +392,20 @@ static void r600_begin_query(struct pipe_context *ctx, struct pipe_query *query)
                return;
        }
 
+       /* Non-GPU queries. */
+       switch (rquery->type) {
+       case R600_QUERY_DRAW_CALLS:
+               rquery->begin_result = rctx->num_draw_calls;
+               return;
+       case R600_QUERY_REQUESTED_VRAM:
+       case R600_QUERY_REQUESTED_GTT:
+               rquery->begin_result = 0;
+               return;
+       case R600_QUERY_BUFFER_WAIT_TIME:
+               rquery->begin_result = rctx->b.ws->query_value(rctx->b.ws, RADEON_BUFFER_WAIT_TIME_NS);
+               return;
+       }
+
        /* Discard the old query buffers. */
        while (prev) {
                struct r600_query_buffer *qbuf = prev;
@@ -353,7 +416,7 @@ static void r600_begin_query(struct pipe_context *ctx, struct pipe_query *query)
 
        /* Obtain a new buffer if the current one can't be mapped without a stall. */
        if (r600_rings_is_buffer_referenced(rctx, rquery->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) ||
-           rctx->ws->buffer_is_busy(rquery->buffer.buf->buf, RADEON_USAGE_READWRITE)) {
+           rctx->b.ws->buffer_is_busy(rquery->buffer.buf->buf, RADEON_USAGE_READWRITE)) {
                pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL);
                rquery->buffer.buf = r600_new_query_buffer(rctx, rquery->type);
        }
@@ -373,6 +436,22 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
        struct r600_context *rctx = (struct r600_context *)ctx;
        struct r600_query *rquery = (struct r600_query *)query;
 
+       /* Non-GPU queries. */
+       switch (rquery->type) {
+       case R600_QUERY_DRAW_CALLS:
+               rquery->end_result = rctx->num_draw_calls;
+               return;
+       case R600_QUERY_REQUESTED_VRAM:
+               rquery->end_result = rctx->b.ws->query_value(rctx->b.ws, RADEON_REQUESTED_VRAM_MEMORY);
+               return;
+       case R600_QUERY_REQUESTED_GTT:
+               rquery->end_result = rctx->b.ws->query_value(rctx->b.ws, RADEON_REQUESTED_GTT_MEMORY);
+               return;
+       case R600_QUERY_BUFFER_WAIT_TIME:
+               rquery->end_result = rctx->b.ws->query_value(rctx->b.ws, RADEON_BUFFER_WAIT_TIME_NS);
+               return;
+       }
+
        r600_emit_query_end(rctx, rquery);
 
        if (r600_query_needs_begin(rquery->type) && !r600_is_timer_query(rquery->type)) {
@@ -407,6 +486,16 @@ static boolean r600_get_query_buffer_result(struct r600_context *ctx,
        unsigned results_base = 0;
        char *map;
 
+       /* Non-GPU queries. */
+       switch (query->type) {
+       case R600_QUERY_DRAW_CALLS:
+       case R600_QUERY_REQUESTED_VRAM:
+       case R600_QUERY_REQUESTED_GTT:
+       case R600_QUERY_BUFFER_WAIT_TIME:
+               result->u64 = query->end_result - query->begin_result;
+               return TRUE;
+       }
+
        map = r600_buffer_mmap_sync_with_rings(ctx, qbuf->buf,
                                                PIPE_TRANSFER_READ |
                                                (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
@@ -481,11 +570,76 @@ static boolean r600_get_query_buffer_result(struct r600_context *ctx,
                        results_base += query->result_size;
                }
                break;
+       case PIPE_QUERY_PIPELINE_STATISTICS:
+               if (ctx->b.chip_class >= EVERGREEN) {
+                       while (results_base != qbuf->results_end) {
+                               result->pipeline_statistics.ps_invocations +=
+                                       r600_query_read_result(map + results_base, 0, 22, false);
+                               result->pipeline_statistics.c_primitives +=
+                                       r600_query_read_result(map + results_base, 2, 24, false);
+                               result->pipeline_statistics.c_invocations +=
+                                       r600_query_read_result(map + results_base, 4, 26, false);
+                               result->pipeline_statistics.vs_invocations +=
+                                       r600_query_read_result(map + results_base, 6, 28, false);
+                               result->pipeline_statistics.gs_invocations +=
+                                       r600_query_read_result(map + results_base, 8, 30, false);
+                               result->pipeline_statistics.gs_primitives +=
+                                       r600_query_read_result(map + results_base, 10, 32, false);
+                               result->pipeline_statistics.ia_primitives +=
+                                       r600_query_read_result(map + results_base, 12, 34, false);
+                               result->pipeline_statistics.ia_vertices +=
+                                       r600_query_read_result(map + results_base, 14, 36, false);
+                               result->pipeline_statistics.hs_invocations +=
+                                       r600_query_read_result(map + results_base, 16, 38, false);
+                               result->pipeline_statistics.ds_invocations +=
+                                       r600_query_read_result(map + results_base, 18, 40, false);
+                               result->pipeline_statistics.cs_invocations +=
+                                       r600_query_read_result(map + results_base, 20, 42, false);
+                               results_base += query->result_size;
+                       }
+               } else {
+                       while (results_base != qbuf->results_end) {
+                               result->pipeline_statistics.ps_invocations +=
+                                       r600_query_read_result(map + results_base, 0, 16, false);
+                               result->pipeline_statistics.c_primitives +=
+                                       r600_query_read_result(map + results_base, 2, 18, false);
+                               result->pipeline_statistics.c_invocations +=
+                                       r600_query_read_result(map + results_base, 4, 20, false);
+                               result->pipeline_statistics.vs_invocations +=
+                                       r600_query_read_result(map + results_base, 6, 22, false);
+                               result->pipeline_statistics.gs_invocations +=
+                                       r600_query_read_result(map + results_base, 8, 24, false);
+                               result->pipeline_statistics.gs_primitives +=
+                                       r600_query_read_result(map + results_base, 10, 26, false);
+                               result->pipeline_statistics.ia_primitives +=
+                                       r600_query_read_result(map + results_base, 12, 28, false);
+                               result->pipeline_statistics.ia_vertices +=
+                                       r600_query_read_result(map + results_base, 14, 30, false);
+                               results_base += query->result_size;
+                       }
+               }
+#if 0 /* for testing */
+               printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
+                      "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
+                      "Clipper prims=%llu, PS=%llu, CS=%llu\n",
+                      result->pipeline_statistics.ia_vertices,
+                      result->pipeline_statistics.ia_primitives,
+                      result->pipeline_statistics.vs_invocations,
+                      result->pipeline_statistics.hs_invocations,
+                      result->pipeline_statistics.ds_invocations,
+                      result->pipeline_statistics.gs_invocations,
+                      result->pipeline_statistics.gs_primitives,
+                      result->pipeline_statistics.c_invocations,
+                      result->pipeline_statistics.c_primitives,
+                      result->pipeline_statistics.ps_invocations,
+                      result->pipeline_statistics.cs_invocations);
+#endif
+               break;
        default:
                assert(0);
        }
 
-       ctx->ws->buffer_unmap(qbuf->buf->cs_buf);
+       ctx->b.ws->buffer_unmap(qbuf->buf->cs_buf);
        return TRUE;
 }
 
@@ -508,13 +662,14 @@ static boolean r600_get_query_result(struct pipe_context *ctx,
        /* Convert the time to expected units. */
        if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
            rquery->type == PIPE_QUERY_TIMESTAMP) {
-               result->u64 = (1000000 * result->u64) / rctx->screen->info.r600_clock_crystal_freq;
+               result->u64 = (1000000 * result->u64) / rctx->screen->b.info.r600_clock_crystal_freq;
        }
        return TRUE;
 }
 
 static void r600_render_condition(struct pipe_context *ctx,
                                  struct pipe_query *query,
+                                 boolean condition,
                                  uint mode)
 {
        struct r600_context *rctx = (struct r600_context *)ctx;
@@ -522,6 +677,7 @@ static void r600_render_condition(struct pipe_context *ctx,
        bool wait_flag = false;
 
        rctx->current_render_cond = query;
+       rctx->current_render_cond_cond = condition;
        rctx->current_render_cond_mode = mode;
 
        if (query == NULL) {
@@ -578,12 +734,12 @@ void r600_resume_nontimer_queries(struct r600_context *ctx)
 
 void r600_init_query_functions(struct r600_context *rctx)
 {
-       rctx->context.create_query = r600_create_query;
-       rctx->context.destroy_query = r600_destroy_query;
-       rctx->context.begin_query = r600_begin_query;
-       rctx->context.end_query = r600_end_query;
-       rctx->context.get_query_result = r600_get_query_result;
-
-       if (rctx->screen->info.r600_num_backends > 0)
-           rctx->context.render_condition = r600_render_condition;
+       rctx->b.b.create_query = r600_create_query; /* install the query callbacks of this file on rctx->b.b */
+       rctx->b.b.destroy_query = r600_destroy_query;
+       rctx->b.b.begin_query = r600_begin_query;
+       rctx->b.b.end_query = r600_end_query;
+       rctx->b.b.get_query_result = r600_get_query_result;
+
+       if (rctx->screen->b.info.r600_num_backends > 0) /* render_condition is left unset when no backends are reported */
+           rctx->b.b.render_condition = r600_render_condition;
 }