radeonsi: expose performance counters as 64 bit
author Nicolai Hähnle <nicolai.haehnle@amd.com>
Wed, 4 May 2016 18:53:45 +0000 (13:53 -0500)
committer Nicolai Hähnle <nicolai.haehnle@amd.com>
Mon, 9 May 2016 16:52:46 +0000 (11:52 -0500)
This is useful for shader-related counters, since they tend to quickly
exceed 32 bits.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
src/gallium/drivers/radeon/r600_perfcounter.c
src/gallium/drivers/radeonsi/si_perfcounter.c

index 9ab17d9e04cb916c8da3695716a002cf548795e9..af9a692d15061295627a4fe67259176dc3142e30 100644 (file)
@@ -84,8 +84,8 @@ struct r600_pc_group {
 
 struct r600_pc_counter {
        unsigned base;
-       unsigned dwords;
-       unsigned stride;
+       unsigned qwords;
+       unsigned stride; /* in uint64s */
 };
 
 #define R600_PC_SHADERS_WINDOWING (1 << 31)
@@ -172,7 +172,7 @@ static void r600_pc_query_emit_stop(struct r600_common_context *ctx,
                                pc->emit_read(ctx, block,
                                              group->num_counters, group->selectors,
                                              buffer, va);
-                               va += 4 * group->num_counters;
+                               va += sizeof(uint64_t) * group->num_counters;
                        } while (group->instance < 0 && ++instance < block->num_instances);
                } while (++se < se_end);
        }
@@ -194,15 +194,15 @@ static void r600_pc_query_add_result(struct r600_common_context *ctx,
                                     union pipe_query_result *result)
 {
        struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
-       uint32_t *results = buffer;
+       uint64_t *results = buffer;
        unsigned i, j;
 
        for (i = 0; i < query->num_counters; ++i) {
                struct r600_pc_counter *counter = &query->counters[i];
 
-               for (j = 0; j < counter->dwords; ++j) {
-                       uint32_t value = results[counter->base + j * counter->stride];
-                       result->batch[i].u32 += value;
+               for (j = 0; j < counter->qwords; ++j) {
+                       uint64_t value = results[counter->base + j * counter->stride];
+                       result->batch[i].u64 += value;
                }
        }
 }
@@ -361,7 +361,7 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
                        instances *= block->num_instances;
 
                group->result_base = i;
-               query->b.result_size += 4 * instances * group->num_counters;
+               query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
                i += instances * group->num_counters;
 
                pc->get_size(block, group->num_counters, group->selectors,
@@ -401,11 +401,11 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
                counter->base = group->result_base + j;
                counter->stride = group->num_counters;
 
-               counter->dwords = 1;
+               counter->qwords = 1;
                if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
-                       counter->dwords = screen->info.max_se;
+                       counter->qwords = screen->info.max_se;
                if (group->instance < 0)
-                       counter->dwords *= block->num_instances;
+                       counter->qwords *= block->num_instances;
        }
 
        if (!r600_query_hw_init(rctx, &query->b))
@@ -535,7 +535,7 @@ int r600_get_perfcounter_info(struct r600_common_screen *screen,
        info->name = block->selector_names + sub * block->selector_name_stride;
        info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index;
        info->max_value.u64 = 0;
-       info->type = PIPE_DRIVER_QUERY_TYPE_UINT;
+       info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
        info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
        info->group_id = base_gid + sub / block->num_selectors;
        info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
index 04da197e70a301fadbca38785ee493ce87014b45..96007a523af6b2e2b1c19db86ecddda12a4c891f 100644 (file)
@@ -208,6 +208,7 @@ static struct si_pc_block_base cik_PA_SC = {
        .layout = SI_PC_MULTI_ALTERNATE,
 };
 
+/* According to docs, PA_SU counters are only 48 bits wide. */
 static struct si_pc_block_base cik_PA_SU = {
        .name = "PA_SU",
        .num_counters = 4,
@@ -651,24 +652,26 @@ static void si_pc_emit_read(struct r600_common_context *ctx,
 
                        radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
                        radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
-                                       COPY_DATA_DST_SEL(COPY_DATA_MEM));
+                                       COPY_DATA_DST_SEL(COPY_DATA_MEM) |
+                                       COPY_DATA_COUNT_SEL); /* 64 bits */
                        radeon_emit(cs, reg >> 2);
                        radeon_emit(cs, 0); /* unused */
                        radeon_emit(cs, va);
                        radeon_emit(cs, va >> 32);
-                       va += 4;
+                       va += sizeof(uint64_t);
                        reg += reg_delta;
                }
        } else {
                for (idx = 0; idx < count; ++idx) {
                        radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
                        radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
-                                       COPY_DATA_DST_SEL(COPY_DATA_MEM));
+                                       COPY_DATA_DST_SEL(COPY_DATA_MEM) |
+                                       COPY_DATA_COUNT_SEL);
                        radeon_emit(cs, 0); /* immediate */
-                       radeon_emit(cs, 0); /* unused */
+                       radeon_emit(cs, 0);
                        radeon_emit(cs, va);
                        radeon_emit(cs, va >> 32);
-                       va += 4;
+                       va += sizeof(uint64_t);
                }
        }
 }