From ad1782cfb5eaa633582c8a7d026690878ab54064 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Nicolai=20H=C3=A4hnle?=
Date: Wed, 4 May 2016 13:53:45 -0500
Subject: [PATCH] radeonsi: expose performance counters as 64 bit
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This is useful for shader-related counters, since they tend to quickly
exceed 32 bits.

Reviewed-by: Marek Olšák
---
Review note (not part of the commit message):
r600_pc_query_add_result now reads each sample into a uint64_t
temporary. With the uint32_t temporary, the upper 32 bits of every
64-bit sample were dropped before being added to the u64 accumulator.
The From hash and the index blob ids below are still those of v1.

 src/gallium/drivers/radeon/r600_perfcounter.c | 24 +++++++++++----------
 src/gallium/drivers/radeonsi/si_perfcounter.c | 13 ++++++-----
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_perfcounter.c b/src/gallium/drivers/radeon/r600_perfcounter.c
index 9ab17d9e04c..af9a692d150 100644
--- a/src/gallium/drivers/radeon/r600_perfcounter.c
+++ b/src/gallium/drivers/radeon/r600_perfcounter.c
@@ -84,8 +84,8 @@ struct r600_pc_group {
 
 struct r600_pc_counter {
 	unsigned base;
-	unsigned dwords;
-	unsigned stride;
+	unsigned qwords;
+	unsigned stride; /* in uint64s */
 };
 
 #define R600_PC_SHADERS_WINDOWING (1 << 31)
@@ -172,7 +172,7 @@ static void r600_pc_query_emit_stop(struct r600_common_context *ctx,
 				pc->emit_read(ctx, block,
 					      group->num_counters, group->selectors,
 					      buffer, va);
-				va += 4 * group->num_counters;
+				va += sizeof(uint64_t) * group->num_counters;
 			} while (group->instance < 0 && ++instance < block->num_instances);
 		} while (++se < se_end);
 	}
@@ -194,15 +194,15 @@ static void r600_pc_query_add_result(struct r600_common_context *ctx,
 				     union pipe_query_result *result)
 {
 	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
-	uint32_t *results = buffer;
+	uint64_t *results = buffer;
 	unsigned i, j;
 
 	for (i = 0; i < query->num_counters; ++i) {
 		struct r600_pc_counter *counter = &query->counters[i];
 
-		for (j = 0; j < counter->dwords; ++j) {
-			uint32_t value = results[counter->base + j * counter->stride];
-			result->batch[i].u32 += value;
+		for (j = 0; j < counter->qwords; ++j) {
+			uint64_t value = results[counter->base + j * counter->stride];
+			result->batch[i].u64 += value;
 		}
 	}
 }
@@ -361,7 +361,7 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
 			instances *= block->num_instances;
 
 		group->result_base = i;
-		query->b.result_size += 4 * instances * group->num_counters;
+		query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
 		i += instances * group->num_counters;
 
 		pc->get_size(block, group->num_counters, group->selectors,
@@ -401,11 +401,11 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
 		counter->base = group->result_base + j;
 		counter->stride = group->num_counters;
 
-		counter->dwords = 1;
+		counter->qwords = 1;
 		if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
-			counter->dwords = screen->info.max_se;
+			counter->qwords = screen->info.max_se;
 		if (group->instance < 0)
-			counter->dwords *= block->num_instances;
+			counter->qwords *= block->num_instances;
 	}
 
 	if (!r600_query_hw_init(rctx, &query->b))
@@ -535,7 +535,7 @@ int r600_get_perfcounter_info(struct r600_common_screen *screen,
 	info->name = block->selector_names + sub * block->selector_name_stride;
 	info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index;
 	info->max_value.u64 = 0;
-	info->type = PIPE_DRIVER_QUERY_TYPE_UINT;
+	info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
 	info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
 	info->group_id = base_gid + sub / block->num_selectors;
 	info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index 04da197e70a..96007a523af 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -208,6 +208,7 @@ static struct si_pc_block_base cik_PA_SC = {
 	.layout = SI_PC_MULTI_ALTERNATE,
 };
 
+/* According to docs, PA_SU counters are only 48 bits wide. */
 static struct si_pc_block_base cik_PA_SU = {
 	.name = "PA_SU",
 	.num_counters = 4,
@@ -651,24 +652,26 @@ static void si_pc_emit_read(struct r600_common_context *ctx,
 
 			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
 			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
-					COPY_DATA_DST_SEL(COPY_DATA_MEM));
+					COPY_DATA_DST_SEL(COPY_DATA_MEM) |
+					COPY_DATA_COUNT_SEL); /* 64 bits */
 			radeon_emit(cs, reg >> 2);
 			radeon_emit(cs, 0); /* unused */
 			radeon_emit(cs, va);
 			radeon_emit(cs, va >> 32);
-			va += 4;
+			va += sizeof(uint64_t);
 			reg += reg_delta;
 		}
 	} else {
 		for (idx = 0; idx < count; ++idx) {
 			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
 			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
-					COPY_DATA_DST_SEL(COPY_DATA_MEM));
+					COPY_DATA_DST_SEL(COPY_DATA_MEM) |
+					COPY_DATA_COUNT_SEL);
 			radeon_emit(cs, 0); /* immediate */
-			radeon_emit(cs, 0); /* unused */
+			radeon_emit(cs, 0);
 			radeon_emit(cs, va);
 			radeon_emit(cs, va >> 32);
-			va += 4;
+			va += sizeof(uint64_t);
 		}
 	}
 }
-- 
2.30.2