X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_perfcounter.c;h=974ac430c530a6a48a12eedb830f6d07a35025f7;hb=c046551e60342616a0a216bf1fb54b92b9d7313f;hp=69e149c76b618d448cec2bda4af71c619b72bbe8;hpb=e2b9329f17eaf94c0cb6cc9f9bad907500fedeba;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c index 69e149c76b6..974ac430c53 100644 --- a/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -126,7 +126,7 @@ static const unsigned si_pc_shader_type_bits[] = { /* Max counters per HW block */ #define SI_QUERY_MAX_COUNTERS 16 -#define SI_PC_SHADERS_WINDOWING (1 << 31) +#define SI_PC_SHADERS_WINDOWING (1u << 31) struct si_query_group { struct si_query_group *next; @@ -146,7 +146,11 @@ struct si_query_counter { }; struct si_query_pc { - struct si_query_hw b; + struct si_query b; + struct si_query_buffer buffer; + + /* Size of the results in memory, in bytes. */ + unsigned result_size; unsigned shaders; unsigned num_counters; @@ -663,20 +667,13 @@ static void si_pc_emit_select(struct si_context *sctx, } static void si_pc_emit_start(struct si_context *sctx, - struct r600_resource *buffer, uint64_t va) + struct si_resource *buffer, uint64_t va) { struct radeon_cmdbuf *cs = sctx->gfx_cs; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer, - RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); - - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | - COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM)); - radeon_emit(cs, 1); /* immediate */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + si_cp_copy_data(sctx, sctx->gfx_cs, + COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address, + COPY_DATA_IMM, NULL, 1); radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET)); @@ -689,15 +686,15 @@ static void si_pc_emit_start(struct si_context *sctx, /* Note: The buffer was already added in si_pc_emit_start, so we don't have to * do it again in here. 
 */
 static void si_pc_emit_stop(struct si_context *sctx,
-			    struct r600_resource *buffer, uint64_t va)
+			    struct si_resource *buffer, uint64_t va)
 {
 	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
-	si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+	si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
 			  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
 			  EOP_DATA_SEL_VALUE_32BIT,
 			  buffer, va, 0, SI_NOT_QUERY);
-	si_cp_wait_mem(sctx, va, 0, 0xffffffff, 0);
+	si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);
 
 	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 	radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
@@ -728,7 +725,7 @@ static void si_pc_emit_read(struct si_context *sctx,
 
 		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
 		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
-				COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) |
+				COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
 				COPY_DATA_COUNT_SEL); /* 64 bits */
 		radeon_emit(cs, reg >> 2);
 		radeon_emit(cs, 0); /* unused */
@@ -741,7 +738,7 @@ static void si_pc_emit_read(struct si_context *sctx,
 	for (idx = 0; idx < count; ++idx) {
 		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
 		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
-				COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) |
+				COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
 				COPY_DATA_COUNT_SEL);
 		radeon_emit(cs, 0); /* immediate */
 		radeon_emit(cs, 0);
@@ -752,10 +749,10 @@
 	}
 }
 
-static void si_pc_query_destroy(struct si_screen *sscreen,
-				struct si_query *rquery)
+static void si_pc_query_destroy(struct si_context *sctx,
+				struct si_query *squery)
 {
-	struct si_query_pc *query = (struct si_query_pc *)rquery;
+	struct si_query_pc *query = (struct si_query_pc *)squery;
 
 	while (query->groups) {
 		struct si_query_group *group = query->groups;
@@ -765,30 +762,27 @@ static void si_pc_query_destroy(struct si_screen *sscreen,
 
 	FREE(query->counters);
 
-	si_query_hw_destroy(sscreen, rquery);
-}
-
-static bool si_pc_query_prepare_buffer(struct si_screen *screen,
-				       struct si_query_hw *hwquery,
-				       struct r600_resource *buffer)
-{
-	/* no-op */
-	return true;
+	si_query_buffer_destroy(sctx->screen, &query->buffer);
+	FREE(query);
 }
 
-static void si_pc_query_emit_start(struct si_context *sctx,
+static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
+/*
 				   struct si_query_hw *hwquery,
-				   struct r600_resource *buffer, uint64_t va)
+				   struct si_resource *buffer, uint64_t va)*/
 {
-	struct si_query_pc *query = (struct si_query_pc *)hwquery;
-	struct si_query_group *group;
+	struct si_query_pc *query = (struct si_query_pc *)squery;
 	int current_se = -1;
 	int current_instance = -1;
 
+	if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
+		return;
+	si_need_gfx_cs_space(sctx);
+
 	if (query->shaders)
 		si_pc_emit_shaders(sctx, query->shaders);
 
-	for (group = query->groups; group; group = group->next) {
+	for (struct si_query_group *group = query->groups; group; group = group->next) {
 		struct si_pc_block *block = group->block;
 
 		if (group->se != current_se || group->instance != current_instance) {
@@ -803,19 +797,23 @@ static void si_pc_query_emit_start(struct si_context *sctx,
 	if (current_se != -1 || current_instance != -1)
 		si_pc_emit_instance(sctx, -1, -1);
 
-	si_pc_emit_start(sctx, buffer, va);
+	uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
+	si_pc_emit_start(sctx, query->buffer.buf, va);
 }
 
-static void si_pc_query_emit_stop(struct si_context *sctx,
-				  struct si_query_hw *hwquery,
-				  struct r600_resource *buffer, uint64_t va)
+static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
 {
-	struct si_query_pc *query = (struct si_query_pc *)hwquery;
-	struct si_query_group *group;
+	struct si_query_pc *query = (struct si_query_pc *)squery;
 
-	si_pc_emit_stop(sctx, buffer, va);
+	if (!query->buffer.buf)
+		return;
 
-	for (group = query->groups; group; group = group->next) {
+	uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
+	query->buffer.results_end += query->result_size;
+
+	si_pc_emit_stop(sctx, query->buffer.buf, va);
+
+	for (struct si_query_group *group = query->groups; group; group = group->next) {
 		struct si_pc_block *block = group->block;
 		unsigned se = group->se >= 0 ? group->se : 0;
 		unsigned se_end = se + 1;
@@ -837,20 +835,36 @@ static void si_pc_query_emit_stop(struct si_context *sctx,
 	si_pc_emit_instance(sctx, -1, -1);
 }
 
-static void si_pc_query_clear_result(struct si_query_hw *hwquery,
-				     union pipe_query_result *result)
+static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
 {
-	struct si_query_pc *query = (struct si_query_pc *)hwquery;
+	struct si_query_pc *query = (struct si_query_pc *)squery;
 
-	memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
+	si_query_buffer_reset(ctx, &query->buffer);
+
+	list_addtail(&query->b.active_list, &ctx->active_queries);
+	ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
+
+	si_pc_query_resume(ctx, squery);
+
+	return true;
 }
 
-static void si_pc_query_add_result(struct si_screen *screen,
-				   struct si_query_hw *hwquery,
+static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
+{
+	struct si_query_pc *query = (struct si_query_pc *)squery;
+
+	si_pc_query_suspend(ctx, squery);
+
+	list_del(&squery->active_list);
+	ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;
+
+	return query->buffer.buf != NULL;
+}
+
+static void si_pc_query_add_result(struct si_query_pc *query,
 				   void *buffer,
 				   union pipe_query_result *result)
 {
-	struct si_query_pc *query = (struct si_query_pc *)hwquery;
 	uint64_t *results = buffer;
 	unsigned i, j;
 
@@ -864,19 +878,44 @@ static void si_pc_query_add_result(struct si_screen *screen,
 	}
 }
 
-static struct si_query_ops batch_query_ops = {
+static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery,
+				   bool wait, union pipe_query_result *result)
+{
+	struct si_query_pc *query = (struct si_query_pc *)squery;
+
+	memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
+
+	for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+		unsigned usage = PIPE_TRANSFER_READ |
+				 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+		unsigned results_base = 0;
+		void *map;
+
+		if (squery->b.flushed)
+			map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+		else
+			map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+		if (!map)
+			return false;
+
+		while (results_base != qbuf->results_end) {
+			si_pc_query_add_result(query, map + results_base, result);
+			results_base += query->result_size;
+		}
+	}
+
+	return true;
+}
+
+static const struct si_query_ops batch_query_ops = {
 	.destroy = si_pc_query_destroy,
-	.begin = si_query_hw_begin,
-	.end = si_query_hw_end,
-	.get_result = si_query_hw_get_result
-};
+	.begin = si_pc_query_begin,
+	.end = si_pc_query_end,
+	.get_result = si_pc_query_get_result,
 
-static struct si_query_hw_ops batch_query_hw_ops = {
-	.prepare_buffer = si_pc_query_prepare_buffer,
-	.emit_start = si_pc_query_emit_start,
-	.emit_stop = si_pc_query_emit_stop,
-	.clear_result = si_pc_query_clear_result,
-	.add_result = si_pc_query_add_result,
+	.suspend = si_pc_query_suspend,
+	.resume = si_pc_query_resume,
 };
 
 static struct si_query_group *get_group_state(struct si_screen *screen,
@@ -966,8 +1005,7 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
 	if (!query)
 		return NULL;
 
-	query->b.b.ops = &batch_query_ops;
-	query->b.ops = &batch_query_hw_ops;
+	query->b.ops = &batch_query_ops;
 
 	query->num_counters = num_queries;
 
@@ -1001,8 +1039,8 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
 	}
 
 	/* Compute result bases and CS size per group */
-	query->b.num_cs_dw_end = pc->num_stop_cs_dwords;
-	query->b.num_cs_dw_end += pc->num_instance_cs_dwords;
+	query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
+	query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
 
 	i = 0;
 	for (group = query->groups; group; group = group->next) {
@@ -1016,12 +1054,12 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
 		instances *= block->num_instances;
 
 		group->result_base = i;
-		query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
+		query->result_size += sizeof(uint64_t) * instances * group->num_counters;
 		i += instances * group->num_counters;
 
 		read_dw = 6 * group->num_counters;
-		query->b.num_cs_dw_end += instances * read_dw;
-		query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords;
+		query->b.num_cs_dw_suspend += instances * read_dw;
+		query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
 	}
 
 	if (query->shaders) {
@@ -1059,13 +1097,10 @@ struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
 		counter->qwords *= block->num_instances;
 	}
 
-	if (!si_query_hw_init(screen, &query->b))
-		goto error;
-
 	return (struct pipe_query *)query;
 
 error:
-	si_pc_query_destroy(screen, &query->b.b);
+	si_pc_query_destroy((struct si_context *)ctx, &query->b);
 	return NULL;
 }
 
@@ -1249,11 +1284,11 @@ void si_init_perfcounters(struct si_screen *screen)
 	unsigned i;
 
 	switch (screen->info.chip_class) {
-	case CIK:
+	case GFX7:
 		blocks = groups_CIK;
 		num_blocks = ARRAY_SIZE(groups_CIK);
 		break;
-	case VI:
+	case GFX8:
 		blocks = groups_VI;
 		num_blocks = ARRAY_SIZE(groups_VI);
 		break;
@@ -1261,13 +1296,13 @@
 		blocks = groups_gfx9;
 		num_blocks = ARRAY_SIZE(groups_gfx9);
 		break;
-	case SI:
+	case GFX6:
 	default:
 		return; /* not implemented */
 	}
 
 	if (screen->info.max_sh_per_se != 1) {
-		/* This should not happen on non-SI chips. */
+		/* This should not happen on non-GFX6 chips. */
 		fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
 			"supported (inaccurate performance counters)\n",
 			screen->info.max_sh_per_se);
 	}
 
@@ -1291,7 +1326,7 @@
 	for (i = 0; i < num_blocks; ++i) {
 		struct si_pc_block *block = &pc->blocks[i];
 		block->b = &blocks[i];
-		block->num_instances = block->b->instances;
+		block->num_instances = MAX2(1, block->b->instances);
 
 		if (!strcmp(block->b->b->name, "CB") ||
 		    !strcmp(block->b->b->name, "DB"))
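
The net effect of the diff is that the perfcounter batch query no longer goes through the si_query_hw_ops hook table; it fills in a plain si_query_ops vtable whose begin/end entry points call suspend/resume directly and append fixed-size result slices into the query's own si_query_buffer. The sketch below is only an illustration of that ops-vtable pattern; it is self-contained and every "demo_" identifier is made up for the example rather than taken from the mesa tree.

/* All identifiers below are illustrative ("demo_*"), not mesa code. */
#include <stdbool.h>
#include <stdio.h>

struct demo_query;

struct demo_query_ops {
	bool (*begin)(struct demo_query *q);
	bool (*end)(struct demo_query *q);
	void (*suspend)(struct demo_query *q); /* stop counting, keep results */
	void (*resume)(struct demo_query *q);  /* reserve space, start counting */
};

struct demo_query {
	const struct demo_query_ops *ops;
	unsigned result_size;  /* bytes appended per suspend */
	unsigned results_end;  /* bytes of results accumulated so far */
};

static void demo_resume(struct demo_query *q)
{
	printf("resume: start counters, next results at offset %u\n",
	       q->results_end);
}

static void demo_suspend(struct demo_query *q)
{
	/* The driver would read the counters back here; we just advance the cursor. */
	q->results_end += q->result_size;
	printf("suspend: stop counters, %u result bytes total\n", q->results_end);
}

static bool demo_begin(struct demo_query *q)
{
	q->results_end = 0;     /* analogous to resetting the query buffer */
	q->ops->resume(q);
	return true;
}

static bool demo_end(struct demo_query *q)
{
	q->ops->suspend(q);     /* end is just a final suspend */
	return true;
}

static const struct demo_query_ops demo_ops = {
	.begin = demo_begin,
	.end = demo_end,
	.suspend = demo_suspend,
	.resume = demo_resume,
};

int main(void)
{
	struct demo_query q = { .ops = &demo_ops, .result_size = 64 };

	q.ops->begin(&q);    /* begin resumes the query */
	q.ops->suspend(&q);  /* e.g. around a command-stream flush */
	q.ops->resume(&q);
	q.ops->end(&q);      /* end suspends it one last time */
	return 0;
}

The driver version follows the same shape, except that resume/suspend emit CP packets (si_pc_emit_start/si_pc_emit_stop) and the result bytes accumulate in the GPU-visible query->buffer rather than in a counter in host memory.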