/* Max counters per HW block */
#define SI_QUERY_MAX_COUNTERS 16
-#define SI_PC_SHADERS_WINDOWING (1 << 31)
+#define SI_PC_SHADERS_WINDOWING (1u << 31)
/* Node of a singly linked list of counter groups. Lists of these are walked
 * with `group = group->next` until a NULL link is reached.
 * Only the link member is visible in this excerpt; code elsewhere in the
 * file also reads members such as block, se, instance, result_base and
 * num_counters from a group. */
struct si_query_group {
struct si_query_group *next;
};
/* State of one performance-counter batch query. */
struct si_query_pc {
- struct si_query_hw b;
/* Base query object: carries the ops table pointer, the active_list link
 * and num_cs_dw_suspend used by the begin/end code in this file. */
+ struct si_query b;
/* Result storage; earlier buffers are reachable through buffer.previous. */
+ struct si_query_buffer buffer;
+
+ /* Size of the results in memory, in bytes. */
+ unsigned result_size;
/* When non-zero, passed to si_pc_emit_shaders() on resume. */
unsigned shaders;
/* Number of result->batch[] entries zeroed at the start of
 * si_pc_query_get_result(); set from num_queries at creation. */
unsigned num_counters;
}
static void si_pc_emit_start(struct si_context *sctx,
- struct r600_resource *buffer, uint64_t va)
+ struct si_resource *buffer, uint64_t va)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer,
- RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
-
- radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
- radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
- COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM));
- radeon_emit(cs, 1); /* immediate */
- radeon_emit(cs, 0); /* unused */
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
+ si_cp_copy_data(sctx, sctx->gfx_cs,
+ COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
+ COPY_DATA_IMM, NULL, 1);
radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
/* Note: The buffer was already added in si_pc_emit_start, so it does not
 * need to be added again here. */
static void si_pc_emit_stop(struct si_context *sctx,
- struct r600_resource *buffer, uint64_t va)
+ struct si_resource *buffer, uint64_t va)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
- si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+ si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
EOP_DATA_SEL_VALUE_32BIT,
buffer, va, 0, SI_NOT_QUERY);
- si_cp_wait_mem(sctx, va, 0, 0xffffffff, 0);
+ si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
- COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) |
+ COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
COPY_DATA_COUNT_SEL); /* 64 bits */
radeon_emit(cs, reg >> 2);
radeon_emit(cs, 0); /* unused */
for (idx = 0; idx < count; ++idx) {
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
- COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) |
+ COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
COPY_DATA_COUNT_SEL);
radeon_emit(cs, 0); /* immediate */
radeon_emit(cs, 0);
}
}
-static void si_pc_query_destroy(struct si_screen *sscreen,
- struct si_query *rquery)
+static void si_pc_query_destroy(struct si_context *sctx,
+ struct si_query *squery)
{
- struct si_query_pc *query = (struct si_query_pc *)rquery;
+ struct si_query_pc *query = (struct si_query_pc *)squery;
while (query->groups) {
struct si_query_group *group = query->groups;
FREE(query->counters);
- si_query_hw_destroy(sscreen, rquery);
-}
-
-static bool si_pc_query_prepare_buffer(struct si_screen *screen,
- struct si_query_hw *hwquery,
- struct r600_resource *buffer)
-{
- /* no-op */
- return true;
+ si_query_buffer_destroy(sctx->screen, &query->buffer);
+ FREE(query);
}
-static void si_pc_query_emit_start(struct si_context *sctx,
+static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
+/*
struct si_query_hw *hwquery,
- struct r600_resource *buffer, uint64_t va)
+ struct si_resource *buffer, uint64_t va)*/
{
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
- struct si_query_group *group;
+ struct si_query_pc *query = (struct si_query_pc *)squery;
int current_se = -1;
int current_instance = -1;
+ if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
+ return;
+ si_need_gfx_cs_space(sctx);
+
if (query->shaders)
si_pc_emit_shaders(sctx, query->shaders);
- for (group = query->groups; group; group = group->next) {
+ for (struct si_query_group *group = query->groups; group; group = group->next) {
struct si_pc_block *block = group->block;
if (group->se != current_se || group->instance != current_instance) {
if (current_se != -1 || current_instance != -1)
si_pc_emit_instance(sctx, -1, -1);
- si_pc_emit_start(sctx, buffer, va);
+ uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
+ si_pc_emit_start(sctx, query->buffer.buf, va);
}
-static void si_pc_query_emit_stop(struct si_context *sctx,
- struct si_query_hw *hwquery,
- struct r600_resource *buffer, uint64_t va)
+static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
- struct si_query_group *group;
+ struct si_query_pc *query = (struct si_query_pc *)squery;
- si_pc_emit_stop(sctx, buffer, va);
+ if (!query->buffer.buf)
+ return;
- for (group = query->groups; group; group = group->next) {
+ uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
+ query->buffer.results_end += query->result_size;
+
+ si_pc_emit_stop(sctx, query->buffer.buf, va);
+
+ for (struct si_query_group *group = query->groups; group; group = group->next) {
struct si_pc_block *block = group->block;
unsigned se = group->se >= 0 ? group->se : 0;
unsigned se_end = se + 1;
si_pc_emit_instance(sctx, -1, -1);
}
-static void si_pc_query_clear_result(struct si_query_hw *hwquery,
- union pipe_query_result *result)
/* Begin a perfcounter query.
 *
 * Resets the query's result buffer, appends the query to the context's
 * active-query list, adds the query's suspend cost (num_cs_dw_suspend) to
 * the context-wide total, and then calls si_pc_query_resume().
 *
 * Always returns true.
 * NOTE(review): si_pc_query_resume() returns early when the result buffer
 * cannot be allocated; that failure is not reported here and only shows up
 * as a false return from si_pc_query_end() - confirm this is intended. */
+static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
{
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
+ struct si_query_pc *query = (struct si_query_pc *)squery;
- memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
+ si_query_buffer_reset(ctx, &query->buffer);
+
+ list_addtail(&query->b.active_list, &ctx->active_queries);
+ ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
+
+ si_pc_query_resume(ctx, squery);
+
+ return true;
}
-static void si_pc_query_add_result(struct si_screen *screen,
- struct si_query_hw *hwquery,
/* End a perfcounter query.
 *
 * Calls si_pc_query_suspend(), unlinks the query from the active-query
 * list and subtracts its suspend cost from the context-wide total. This
 * undoes the bookkeeping done in si_pc_query_begin().
 *
 * Returns false when the query has no result buffer (buffer.buf is NULL),
 * true otherwise. */
+static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
+{
+ struct si_query_pc *query = (struct si_query_pc *)squery;
+
+ si_pc_query_suspend(ctx, squery);
+
+ LIST_DEL(&squery->active_list);
+ ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;
+
+ return query->buffer.buf != NULL;
+}
+
+static void si_pc_query_add_result(struct si_query_pc *query,
void *buffer,
union pipe_query_result *result)
{
- struct si_query_pc *query = (struct si_query_pc *)hwquery;
uint64_t *results = buffer;
unsigned i, j;
}
}
-static struct si_query_ops batch_query_ops = {
/* Read back and accumulate the results of a perfcounter query.
 *
 * Zeroes the first query->num_counters entries of result->batch, then walks
 * the buffer chain (query->buffer, then each ->previous). Each buffer is
 * mapped for reading and si_pc_query_add_result() is called once per
 * result_size-sized slot, from offset 0 up to that buffer's results_end.
 *
 * wait: when false, PIPE_TRANSFER_DONTBLOCK is added to the map usage.
 *
 * Returns false as soon as a buffer cannot be mapped; in that case `result`
 * holds only what was accumulated from the buffers processed so far.
 * Returns true once every buffer in the chain has been processed.
 *
 * NOTE(review): qbuf->buf is dereferenced without a NULL check, while
 * si_pc_query_end() can return with buffer.buf == NULL - confirm callers
 * never request results for such a query. */
+static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery,
+ bool wait, union pipe_query_result *result)
+{
+ struct si_query_pc *query = (struct si_query_pc *)squery;
+
+ memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
+
+ for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+ unsigned usage = PIPE_TRANSFER_READ |
+ (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+ unsigned results_base = 0;
+ void *map;
+
/* Two mapping paths, selected by the query's `flushed` flag: the winsys
 * buffer_map directly, or si_buffer_map_sync_with_rings(). */
+ if (squery->b.flushed)
+ map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+ else
+ map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+ if (!map)
+ return false;
+
/* The loop ends only on exact equality, so it relies on results_end being
 * a whole multiple of result_size (suspend advances it in result_size
 * steps) - TODO confirm the reset path starts it at 0.
 * `map + results_base` is arithmetic on void *, a compiler extension. */
+ while (results_base != qbuf->results_end) {
+ si_pc_query_add_result(query, map + results_base, result);
+ results_base += query->result_size;
+ }
+ }
+
+ return true;
+}
+
/* Operations table for perfcounter batch queries. It is stored into
 * query->b.ops when a batch query is created. The former separate
 * si_query_hw_ops table is removed; its emit_start/emit_stop roles are
 * taken over by the resume/suspend entries below. */
+static const struct si_query_ops batch_query_ops = {
.destroy = si_pc_query_destroy,
- .begin = si_query_hw_begin,
- .end = si_query_hw_end,
- .get_result = si_query_hw_get_result
-};
+ .begin = si_pc_query_begin,
+ .end = si_pc_query_end,
+ .get_result = si_pc_query_get_result,
-static struct si_query_hw_ops batch_query_hw_ops = {
- .prepare_buffer = si_pc_query_prepare_buffer,
- .emit_start = si_pc_query_emit_start,
- .emit_stop = si_pc_query_emit_stop,
- .clear_result = si_pc_query_clear_result,
- .add_result = si_pc_query_add_result,
/* suspend/resume are also called directly by si_pc_query_end() and
 * si_pc_query_begin(). */
+ .suspend = si_pc_query_suspend,
+ .resume = si_pc_query_resume,
};
static struct si_query_group *get_group_state(struct si_screen *screen,
if (!query)
return NULL;
- query->b.b.ops = &batch_query_ops;
- query->b.ops = &batch_query_hw_ops;
+ query->b.ops = &batch_query_ops;
query->num_counters = num_queries;
}
/* Compute result bases and CS size per group */
- query->b.num_cs_dw_end = pc->num_stop_cs_dwords;
- query->b.num_cs_dw_end += pc->num_instance_cs_dwords;
+ query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
+ query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
i = 0;
for (group = query->groups; group; group = group->next) {
instances *= block->num_instances;
group->result_base = i;
- query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
+ query->result_size += sizeof(uint64_t) * instances * group->num_counters;
i += instances * group->num_counters;
read_dw = 6 * group->num_counters;
- query->b.num_cs_dw_end += instances * read_dw;
- query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords;
+ query->b.num_cs_dw_suspend += instances * read_dw;
+ query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
}
if (query->shaders) {
counter->qwords *= block->num_instances;
}
- if (!si_query_hw_init(screen, &query->b))
- goto error;
-
return (struct pipe_query *)query;
error:
- si_pc_query_destroy(screen, &query->b.b);
+ si_pc_query_destroy((struct si_context *)ctx, &query->b);
return NULL;
}
unsigned i;
switch (screen->info.chip_class) {
- case CIK:
+ case GFX7:
blocks = groups_CIK;
num_blocks = ARRAY_SIZE(groups_CIK);
break;
- case VI:
+ case GFX8:
blocks = groups_VI;
num_blocks = ARRAY_SIZE(groups_VI);
break;
blocks = groups_gfx9;
num_blocks = ARRAY_SIZE(groups_gfx9);
break;
- case SI:
+ case GFX6:
default:
return; /* not implemented */
}
if (screen->info.max_sh_per_se != 1) {
- /* This should not happen on non-SI chips. */
+ /* This should not happen on non-GFX6 chips. */
fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
"supported (inaccurate performance counters)\n",
screen->info.max_sh_per_se);
for (i = 0; i < num_blocks; ++i) {
struct si_pc_block *block = &pc->blocks[i];
block->b = &blocks[i];
- block->num_instances = block->b->instances;
+ block->num_instances = MAX2(1, block->b->instances);
if (!strcmp(block->b->b->name, "CB") ||
!strcmp(block->b->b->name, "DB"))