radeonsi: add SI_QUERY_TIME_ELAPSED_SDMA for measuring SDMA performance
[mesa.git] / src / gallium / drivers / radeonsi / si_query.c
index f768b531139ea0a19c8d16efd52eb724671de4fe..93efbd4ef4a1445a2fa9de89a3dd9cee2d3b4dac 100644 (file)
@@ -648,6 +648,11 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen,
                query->result_size += 16; /* for the fence + alignment */
                query->num_cs_dw_end = 6 + si_gfx_write_fence_dwords(sscreen);
                break;
+       case SI_QUERY_TIME_ELAPSED_SDMA:
+               /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
+               query->result_size = 64;
+               query->num_cs_dw_end = 0;
+               break;
        case PIPE_QUERY_TIME_ELAPSED:
                query->result_size = 24;
                query->num_cs_dw_end = 8 + si_gfx_write_fence_dwords(sscreen);
@@ -747,6 +752,9 @@ static void si_query_hw_do_emit_start(struct si_context *sctx,
        struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
        switch (query->b.type) {
+       case SI_QUERY_TIME_ELAPSED_SDMA:
+               si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address);
+               return;
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
        case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
@@ -802,7 +810,8 @@ static void si_query_hw_emit_start(struct si_context *sctx,
        si_update_occlusion_query_state(sctx, query->b.type, 1);
        si_update_prims_generated_query_state(sctx, query->b.type, 1);
 
-       si_need_gfx_cs_space(sctx);
+       if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
+               si_need_gfx_cs_space(sctx);
 
        /* Get a new query buffer if needed. */
        if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
@@ -832,6 +841,9 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx,
        uint64_t fence_va = 0;
 
        switch (query->b.type) {
+       case SI_QUERY_TIME_ELAPSED_SDMA:
+               si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address);
+               return;
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
        case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
@@ -1022,7 +1034,8 @@ static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned que
 
        if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
            query_type == PIPE_QUERY_GPU_FINISHED ||
-           query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
+           (query_type >= PIPE_QUERY_DRIVER_SPECIFIC &&
+            query_type != SI_QUERY_TIME_ELAPSED_SDMA))
                return si_query_sw_create(query_type);
 
        return si_query_hw_create(sscreen, query_type, index);
@@ -1238,6 +1251,9 @@ static void si_query_hw_add_result(struct si_screen *sscreen,
        case PIPE_QUERY_TIME_ELAPSED:
                result->u64 += si_query_read_result(buffer, 0, 2, false);
                break;
+       case SI_QUERY_TIME_ELAPSED_SDMA:
+               result->u64 += si_query_read_result(buffer, 0, 32/4, false);
+               break;
        case PIPE_QUERY_TIMESTAMP:
                result->u64 = *(uint64_t*)buffer;
                break;
@@ -1382,6 +1398,7 @@ bool si_query_hw_get_result(struct si_context *sctx,
 
        /* Convert the time to expected units. */
        if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
+           rquery->type == SI_QUERY_TIME_ELAPSED_SDMA ||
            rquery->type == PIPE_QUERY_TIMESTAMP) {
                result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
        }