From c359880d8be41628128ea699676643f4dd426047 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 24 Jul 2018 13:14:29 -0400 Subject: [PATCH] radeonsi: add SI_QUERY_TIME_ELAPSED_SDMA for measuring SDMA performance --- src/amd/common/sid.h | 4 ++++ src/gallium/drivers/radeonsi/si_dma_cs.c | 29 ++++++++++++++++++++++++ src/gallium/drivers/radeonsi/si_pipe.h | 2 ++ src/gallium/drivers/radeonsi/si_query.c | 21 +++++++++++++++-- src/gallium/drivers/radeonsi/si_query.h | 1 + 5 files changed, 55 insertions(+), 2 deletions(-) diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h index d9c4a1a7414..d696c01d4dd 100644 --- a/src/amd/common/sid.h +++ b/src/amd/common/sid.h @@ -9140,6 +9140,10 @@ #define CIK_SDMA_PACKET_TRAP 0x6 #define CIK_SDMA_PACKET_SEMAPHORE 0x7 #define CIK_SDMA_PACKET_CONSTANT_FILL 0xb +#define CIK_SDMA_OPCODE_TIMESTAMP 0xd +#define SDMA_TS_SUB_OPCODE_SET_LOCAL_TIMESTAMP 0x0 +#define SDMA_TS_SUB_OPCODE_GET_LOCAL_TIMESTAMP 0x1 +#define SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP 0x2 #define CIK_SDMA_PACKET_SRBM_WRITE 0xe #define CIK_SDMA_COPY_MAX_SIZE 0x3fffe0 diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c index 3bb769309e3..7db9570af3c 100644 --- a/src/gallium/drivers/radeonsi/si_dma_cs.c +++ b/src/gallium/drivers/radeonsi/si_dma_cs.c @@ -23,6 +23,7 @@ */ #include "si_pipe.h" +#include "sid.h" static void si_dma_emit_wait_idle(struct si_context *sctx) { @@ -35,6 +36,34 @@ static void si_dma_emit_wait_idle(struct si_context *sctx) radeon_emit(cs, 0xf0000000); /* NOP */ } +void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst, + uint64_t offset) +{ /* Emits a GET_GLOBAL_TIMESTAMP packet on sctx->dma_cs whose destination address is dst->gpu_address + offset. */ + struct radeon_cmdbuf *cs = sctx->dma_cs; + uint64_t va = dst->gpu_address + offset; /* address the timestamp packet targets */ + + if (sctx->chip_class == SI) { + unreachable("SI DMA doesn't support the timestamp packet."); + return; + } + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the
GPU when mapping + * that range. */ + util_range_add(&dst->valid_buffer_range, offset, offset + 8); /* covers [offset, offset + 8) */ + + assert(va % 8 == 0); /* target address must be 8-byte aligned */ + + si_need_dma_space(sctx, 4, dst, NULL); /* 3 dwords are emitted below; the 4th is presumably for si_dma_emit_wait_idle -- confirm */ + si_dma_emit_wait_idle(sctx); + + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, + SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, + 0)); + radeon_emit(cs, va); /* address, low dword (assumes radeon_emit takes 32 bits -- confirm) */ + radeon_emit(cs, va >> 32); /* address, bits 32 and up */ +} + void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src) { diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 95489f09612..4c3f13b84e2 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1155,6 +1155,8 @@ bool si_replace_shader(unsigned num, struct ac_shader_binary *binary); void si_init_dma_functions(struct si_context *sctx); /* si_dma_cs.c */ +void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst, + uint64_t offset); void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src); void si_flush_dma_cs(struct si_context *ctx, unsigned flags, diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index f768b531139..93efbd4ef4a 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -648,6 +648,11 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, query->result_size += 16; /* for the fence + alignment */ query->num_cs_dw_end = 6 + si_gfx_write_fence_dwords(sscreen); break; + case SI_QUERY_TIME_ELAPSED_SDMA: + /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32.
*/ + query->result_size = 64; + query->num_cs_dw_end = 0; + break; case PIPE_QUERY_TIME_ELAPSED: query->result_size = 24; query->num_cs_dw_end = 8 + si_gfx_write_fence_dwords(sscreen); @@ -747,6 +752,9 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct radeon_cmdbuf *cs = sctx->gfx_cs; switch (query->b.type) { + case SI_QUERY_TIME_ELAPSED_SDMA: + si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address); + return; case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: @@ -802,7 +810,8 @@ static void si_query_hw_emit_start(struct si_context *sctx, si_update_occlusion_query_state(sctx, query->b.type, 1); si_update_prims_generated_query_state(sctx, query->b.type, 1); - si_need_gfx_cs_space(sctx); + if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA) + si_need_gfx_cs_space(sctx); /* Get a new query buffer if needed. */ if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) { @@ -832,6 +841,9 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, uint64_t fence_va = 0; switch (query->b.type) { + case SI_QUERY_TIME_ELAPSED_SDMA: + si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address); + return; case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: @@ -1022,7 +1034,8 @@ static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned que if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED || - query_type >= PIPE_QUERY_DRIVER_SPECIFIC) + (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && + query_type != SI_QUERY_TIME_ELAPSED_SDMA)) return si_query_sw_create(query_type); return si_query_hw_create(sscreen, query_type, index); @@ -1238,6 +1251,9 @@ static void si_query_hw_add_result(struct si_screen *sscreen, case PIPE_QUERY_TIME_ELAPSED: result->u64 += si_query_read_result(buffer, 0, 2, false); break; + case 
SI_QUERY_TIME_ELAPSED_SDMA: + result->u64 += si_query_read_result(buffer, 0, 32/4, false); + break; case PIPE_QUERY_TIMESTAMP: result->u64 = *(uint64_t*)buffer; break; @@ -1382,6 +1398,7 @@ bool si_query_hw_get_result(struct si_context *sctx, /* Convert the time to expected units. */ if (rquery->type == PIPE_QUERY_TIME_ELAPSED || + rquery->type == SI_QUERY_TIME_ELAPSED_SDMA || rquery->type == PIPE_QUERY_TIMESTAMP) { result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq; } diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index 3f60208e2f8..bc3eb397bc5 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -109,6 +109,7 @@ enum { SI_QUERY_GPIN_NUM_RB, SI_QUERY_GPIN_NUM_SPI, SI_QUERY_GPIN_NUM_SE, + SI_QUERY_TIME_ELAPSED_SDMA, SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100, }; -- 2.30.2