freedreno/a5xx: performance counters
authorRob Clark <robdclark@gmail.com>
Wed, 18 Jul 2018 13:42:29 +0000 (09:42 -0400)
committerRob Clark <robdclark@gmail.com>
Wed, 18 Jul 2018 14:19:03 +0000 (10:19 -0400)
AMD_performance_monitor support

Signed-off-by: Rob Clark <robdclark@gmail.com>
src/gallium/drivers/freedreno/Makefile.sources
src/gallium/drivers/freedreno/a5xx/fd5_perfcntr.c [new file with mode: 0644]
src/gallium/drivers/freedreno/a5xx/fd5_query.c
src/gallium/drivers/freedreno/a5xx/fd5_screen.c
src/gallium/drivers/freedreno/freedreno_screen.c
src/gallium/drivers/freedreno/freedreno_util.h
src/gallium/drivers/freedreno/meson.build

index f3b1abd998abe53853665be8843c4cc6011ff56d..328cbdfbf94911e6fadbc1327e3eeae14486c005 100644 (file)
@@ -145,6 +145,7 @@ a5xx_SOURCES := \
        a5xx/fd5_gmem.h \
        a5xx/fd5_image.c \
        a5xx/fd5_image.h \
+       a5xx/fd5_perfcntr.c \
        a5xx/fd5_program.c \
        a5xx/fd5_program.h \
        a5xx/fd5_query.c \
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_perfcntr.c b/src/gallium/drivers/freedreno/a5xx/fd5_perfcntr.c
new file mode 100644 (file)
index 0000000..cf5571d
--- /dev/null
@@ -0,0 +1,766 @@
+/*
+ * Copyright (C) 2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef FD5_PERFCNTR_H_
+#define FD5_PERFCNTR_H_
+
+#include "freedreno_perfcntr.h"
+#include "fd5_format.h"
+
+#define REG(_x) REG_A5XX_ ## _x
+
+#define COUNTER(_sel, _lo, _hi) {  \
+       .select_reg = REG(_sel),       \
+       .counter_reg_lo = REG(_lo),    \
+       .counter_reg_hi = REG(_hi),    \
+}
+
+#define COUNTER2(_sel, _lo, _hi, _en, _clr) { \
+       .select_reg     = REG(_sel),  \
+       .counter_reg_lo = REG(_lo),   \
+       .counter_reg_hi = REG(_hi),   \
+       .enable         = REG(_en),   \
+       .clear          = REG(_clr),  \
+}
+
+#define COUNTABLE(_selector, _query_type, _result_type) {            \
+       .name        = #_selector,                                       \
+       .selector    = _selector,                                        \
+       .query_type  = PIPE_DRIVER_QUERY_TYPE_ ## _query_type,           \
+       .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_ ## _result_type,   \
+}
+
+#define GROUP(_name, _counters, _countables) {   \
+       .name           = _name,                     \
+       .num_counters   = ARRAY_SIZE(_counters),     \
+       .counters       = _counters,                 \
+       .num_countables = ARRAY_SIZE(_countables),   \
+       .countables     = _countables,               \
+}
+
+/* CP counter group: select registers and their 64-bit LO/HI readback pairs.
+ * All countables below are raw 64-bit event counts (UINT64/AVERAGE). */
+static const struct fd_perfcntr_counter cp_counters[] = {
+//RESERVED: for kernel
+//     COUNTER(CP_PERFCTR_CP_SEL_0, RBBM_PERFCTR_CP_0_LO, RBBM_PERFCTR_CP_0_HI),
+       COUNTER(CP_PERFCTR_CP_SEL_1, RBBM_PERFCTR_CP_1_LO, RBBM_PERFCTR_CP_1_HI),
+       COUNTER(CP_PERFCTR_CP_SEL_2, RBBM_PERFCTR_CP_2_LO, RBBM_PERFCTR_CP_2_HI),
+       COUNTER(CP_PERFCTR_CP_SEL_3, RBBM_PERFCTR_CP_3_LO, RBBM_PERFCTR_CP_3_HI),
+       COUNTER(CP_PERFCTR_CP_SEL_4, RBBM_PERFCTR_CP_4_LO, RBBM_PERFCTR_CP_4_HI),
+       COUNTER(CP_PERFCTR_CP_SEL_5, RBBM_PERFCTR_CP_5_LO, RBBM_PERFCTR_CP_5_HI),
+       COUNTER(CP_PERFCTR_CP_SEL_6, RBBM_PERFCTR_CP_6_LO, RBBM_PERFCTR_CP_6_HI),
+       COUNTER(CP_PERFCTR_CP_SEL_7, RBBM_PERFCTR_CP_7_LO, RBBM_PERFCTR_CP_7_HI),
+};
+
+/* Events selectable on the CP counters; names come verbatim from the
+ * generated register headers via the COUNTABLE() stringification. */
+static const struct fd_perfcntr_countable cp_countables[] = {
+       COUNTABLE(PERF_CP_ALWAYS_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_BUSY_GFX_CORE_IDLE, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_PFP_IDLE, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_PFP_BUSY_WORKING, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_PFP_STALL_CYCLES_ANY, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_PFP_STARVE_CYCLES_ANY, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_PFP_ICACHE_MISS, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_PFP_ICACHE_HIT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_PFP_MATCH_PM4_PKT_PROFILE, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_ME_BUSY_WORKING, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_ME_IDLE, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_ME_STARVE_CYCLES_ANY, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_ME_FIFO_EMPTY_PFP_IDLE, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_ME_FIFO_EMPTY_PFP_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_ME_FIFO_FULL_ME_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_ME_FIFO_FULL_ME_NON_WORKING, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_ME_STALL_CYCLES_ANY, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_ME_ICACHE_MISS, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_ME_ICACHE_HIT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_NUM_PREEMPTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_PREEMPTION_REACTION_DELAY, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_PREEMPTION_SWITCH_OUT_TIME, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_PREEMPTION_SWITCH_IN_TIME, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_DEAD_DRAWS_IN_BIN_RENDER, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_PREDICATED_DRAWS_KILLED, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_MODE_SWITCH, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_ZPASS_DONE, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_CONTEXT_DONE, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_CACHE_FLUSH, UINT64, AVERAGE),
+       COUNTABLE(PERF_CP_LONG_PREEMPTIONS, UINT64, AVERAGE),
+};
+
+/* CCU counter group (selects live in the RB register block). */
+static const struct fd_perfcntr_counter ccu_counters[] = {
+       COUNTER(RB_PERFCTR_CCU_SEL_0, RBBM_PERFCTR_CCU_0_LO, RBBM_PERFCTR_CCU_0_HI),
+       COUNTER(RB_PERFCTR_CCU_SEL_1, RBBM_PERFCTR_CCU_1_LO, RBBM_PERFCTR_CCU_1_HI),
+       COUNTER(RB_PERFCTR_CCU_SEL_2, RBBM_PERFCTR_CCU_2_LO, RBBM_PERFCTR_CCU_2_HI),
+       COUNTER(RB_PERFCTR_CCU_SEL_3, RBBM_PERFCTR_CCU_3_LO, RBBM_PERFCTR_CCU_3_HI),
+};
+
+/* CCU events (depth/color block traffic, GMEM access, 2D path). */
+static const struct fd_perfcntr_countable ccu_countables[] = {
+       COUNTABLE(PERF_CCU_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_STALL_CYCLES_RB_DEPTH_RETURN, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_STALL_CYCLES_RB_COLOR_RETURN, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_STARVE_CYCLES_FLAG_RETURN, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_DEPTH_BLOCKS, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_COLOR_BLOCKS, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_DEPTH_BLOCK_HIT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_COLOR_BLOCK_HIT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_PARTIAL_BLOCK_READ, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_GMEM_READ, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_GMEM_WRITE, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_DEPTH_READ_FLAG0_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_DEPTH_READ_FLAG1_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_DEPTH_READ_FLAG2_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_DEPTH_READ_FLAG3_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_DEPTH_READ_FLAG4_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_COLOR_READ_FLAG0_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_COLOR_READ_FLAG1_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_COLOR_READ_FLAG2_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_COLOR_READ_FLAG3_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_COLOR_READ_FLAG4_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_2D_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_2D_RD_REQ, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_2D_WR_REQ, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_2D_REORDER_STARVE_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_CCU_2D_PIXELS, UINT64, AVERAGE),
+};
+
+/* TSE counter group (selects live in the GRAS register block). */
+static const struct fd_perfcntr_counter tse_counters[] = {
+       COUNTER(GRAS_PERFCTR_TSE_SEL_0, RBBM_PERFCTR_TSE_0_LO, RBBM_PERFCTR_TSE_0_HI),
+       COUNTER(GRAS_PERFCTR_TSE_SEL_1, RBBM_PERFCTR_TSE_1_LO, RBBM_PERFCTR_TSE_1_HI),
+       COUNTER(GRAS_PERFCTR_TSE_SEL_2, RBBM_PERFCTR_TSE_2_LO, RBBM_PERFCTR_TSE_2_HI),
+       COUNTER(GRAS_PERFCTR_TSE_SEL_3, RBBM_PERFCTR_TSE_3_LO, RBBM_PERFCTR_TSE_3_HI),
+};
+
+/* TSE events (primitive setup/clipping statistics). */
+static const struct fd_perfcntr_countable tse_countables[] = {
+       COUNTABLE(PERF_TSE_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_CLIPPING_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_STALL_CYCLES_RAS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_STALL_CYCLES_LRZ_BARYPLANE, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_STALL_CYCLES_LRZ_ZPLANE, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_STARVE_CYCLES_PC, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_INPUT_PRIM, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_INPUT_NULL_PRIM, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_TRIVAL_REJ_PRIM, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_CLIPPED_PRIM, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_ZERO_AREA_PRIM, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_FACENESS_CULLED_PRIM, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_ZERO_PIXEL_PRIM, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_OUTPUT_NULL_PRIM, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_OUTPUT_VISIBLE_PRIM, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_CINVOCATION, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_CPRIMITIVES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TSE_2D_INPUT_PRIM, UINT64, AVERAGE),
+       // sic: "CLCLES" spelling matches the generated enum — do not fix here
+       COUNTABLE(PERF_TSE_2D_ALIVE_CLCLES, UINT64, AVERAGE),
+};
+
+/* RAS counter group (selects live in the GRAS register block). */
+static const struct fd_perfcntr_counter ras_counters[] = {
+       COUNTER(GRAS_PERFCTR_RAS_SEL_0, RBBM_PERFCTR_RAS_0_LO, RBBM_PERFCTR_RAS_0_HI),
+       COUNTER(GRAS_PERFCTR_RAS_SEL_1, RBBM_PERFCTR_RAS_1_LO, RBBM_PERFCTR_RAS_1_HI),
+       COUNTER(GRAS_PERFCTR_RAS_SEL_2, RBBM_PERFCTR_RAS_2_LO, RBBM_PERFCTR_RAS_2_HI),
+       COUNTER(GRAS_PERFCTR_RAS_SEL_3, RBBM_PERFCTR_RAS_3_LO, RBBM_PERFCTR_RAS_3_HI),
+};
+
+/* RAS events (rasterizer tile/coverage statistics). */
+static const struct fd_perfcntr_countable ras_countables[] = {
+       COUNTABLE(PERF_RAS_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_RAS_SUPERTILE_ACTIVE_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_RAS_STALL_CYCLES_LRZ, UINT64, AVERAGE),
+       COUNTABLE(PERF_RAS_STARVE_CYCLES_TSE, UINT64, AVERAGE),
+       COUNTABLE(PERF_RAS_SUPER_TILES, UINT64, AVERAGE),
+       COUNTABLE(PERF_RAS_8X4_TILES, UINT64, AVERAGE),
+       COUNTABLE(PERF_RAS_MASKGEN_ACTIVE, UINT64, AVERAGE),
+       COUNTABLE(PERF_RAS_FULLY_COVERED_SUPER_TILES, UINT64, AVERAGE),
+       COUNTABLE(PERF_RAS_FULLY_COVERED_8X4_TILES, UINT64, AVERAGE),
+       // sic: "INVISILBE" spelling matches the generated enum — do not fix here
+       COUNTABLE(PERF_RAS_PRIM_KILLED_INVISILBE, UINT64, AVERAGE),
+};
+
+/* LRZ counter group (selects live in the GRAS register block). */
+static const struct fd_perfcntr_counter lrz_counters[] = {
+       COUNTER(GRAS_PERFCTR_LRZ_SEL_0, RBBM_PERFCTR_LRZ_0_LO, RBBM_PERFCTR_LRZ_0_HI),
+       COUNTER(GRAS_PERFCTR_LRZ_SEL_1, RBBM_PERFCTR_LRZ_1_LO, RBBM_PERFCTR_LRZ_1_HI),
+       COUNTER(GRAS_PERFCTR_LRZ_SEL_2, RBBM_PERFCTR_LRZ_2_LO, RBBM_PERFCTR_LRZ_2_HI),
+       COUNTER(GRAS_PERFCTR_LRZ_SEL_3, RBBM_PERFCTR_LRZ_3_LO, RBBM_PERFCTR_LRZ_3_HI),
+};
+
+/* LRZ events (low-resolution Z read/write and rejection statistics). */
+static const struct fd_perfcntr_countable lrz_countables[] = {
+       COUNTABLE(PERF_LRZ_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_STARVE_CYCLES_RAS, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_STALL_CYCLES_RB, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_STALL_CYCLES_VSC, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_STALL_CYCLES_VPC, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_STALL_CYCLES_FLAG_PREFETCH, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_LRZ_READ, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_LRZ_WRITE, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_READ_LATENCY, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_MERGE_CACHE_UPDATING, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_PRIM_KILLED_BY_MASKGEN, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_PRIM_KILLED_BY_LRZ, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_VISIBLE_PRIM_AFTER_LRZ, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_FULL_8X8_TILES, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_PARTIAL_8X8_TILES, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_TILE_KILLED, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_TOTAL_PIXEL, UINT64, AVERAGE),
+       COUNTABLE(PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ, UINT64, AVERAGE),
+};
+
+/* HLSQ counter group. */
+static const struct fd_perfcntr_counter hlsq_counters[] = {
+       COUNTER(HLSQ_PERFCTR_HLSQ_SEL_0, RBBM_PERFCTR_HLSQ_0_LO, RBBM_PERFCTR_HLSQ_0_HI),
+       COUNTER(HLSQ_PERFCTR_HLSQ_SEL_1, RBBM_PERFCTR_HLSQ_1_LO, RBBM_PERFCTR_HLSQ_1_HI),
+       COUNTER(HLSQ_PERFCTR_HLSQ_SEL_2, RBBM_PERFCTR_HLSQ_2_LO, RBBM_PERFCTR_HLSQ_2_HI),
+       COUNTER(HLSQ_PERFCTR_HLSQ_SEL_3, RBBM_PERFCTR_HLSQ_3_LO, RBBM_PERFCTR_HLSQ_3_HI),
+       COUNTER(HLSQ_PERFCTR_HLSQ_SEL_4, RBBM_PERFCTR_HLSQ_4_LO, RBBM_PERFCTR_HLSQ_4_HI),
+       COUNTER(HLSQ_PERFCTR_HLSQ_SEL_5, RBBM_PERFCTR_HLSQ_5_LO, RBBM_PERFCTR_HLSQ_5_HI),
+       COUNTER(HLSQ_PERFCTR_HLSQ_SEL_6, RBBM_PERFCTR_HLSQ_6_LO, RBBM_PERFCTR_HLSQ_6_HI),
+       COUNTER(HLSQ_PERFCTR_HLSQ_SEL_7, RBBM_PERFCTR_HLSQ_7_LO, RBBM_PERFCTR_HLSQ_7_HI),
+};
+
+/* HLSQ events (shader wave/state-copy statistics). */
+static const struct fd_perfcntr_countable hlsq_countables[] = {
+       COUNTABLE(PERF_HLSQ_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_STALL_CYCLES_SP_STATE, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_STALL_CYCLES_SP_FS_STAGE, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_UCHE_LATENCY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_UCHE_LATENCY_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_FS_STAGE_32_WAVES, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_FS_STAGE_64_WAVES, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_QUADS, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_SP_STATE_COPY_TRANS_FS_STAGE, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_SP_STATE_COPY_TRANS_VS_STAGE, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_TP_STATE_COPY_TRANS_FS_STAGE, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_TP_STATE_COPY_TRANS_VS_STAGE, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_CS_INVOCATIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_HLSQ_COMPUTE_DRAWCALLS, UINT64, AVERAGE),
+};
+
+/* PC counter group. */
+static const struct fd_perfcntr_counter pc_counters[] = {
+       COUNTER(PC_PERFCTR_PC_SEL_0, RBBM_PERFCTR_PC_0_LO, RBBM_PERFCTR_PC_0_HI),
+       COUNTER(PC_PERFCTR_PC_SEL_1, RBBM_PERFCTR_PC_1_LO, RBBM_PERFCTR_PC_1_HI),
+       COUNTER(PC_PERFCTR_PC_SEL_2, RBBM_PERFCTR_PC_2_LO, RBBM_PERFCTR_PC_2_HI),
+       COUNTER(PC_PERFCTR_PC_SEL_3, RBBM_PERFCTR_PC_3_LO, RBBM_PERFCTR_PC_3_HI),
+       COUNTER(PC_PERFCTR_PC_SEL_4, RBBM_PERFCTR_PC_4_LO, RBBM_PERFCTR_PC_4_HI),
+       COUNTER(PC_PERFCTR_PC_SEL_5, RBBM_PERFCTR_PC_5_LO, RBBM_PERFCTR_PC_5_HI),
+       COUNTER(PC_PERFCTR_PC_SEL_6, RBBM_PERFCTR_PC_6_LO, RBBM_PERFCTR_PC_6_HI),
+       COUNTER(PC_PERFCTR_PC_SEL_7, RBBM_PERFCTR_PC_7_LO, RBBM_PERFCTR_PC_7_HI),
+};
+
+/* PC events; note the trailing PERF_TESS_* entries are selected through the
+ * PC counters as well — presumably TESS shares the PC counter hardware
+ * (matches the selector enum layout; confirm against the register docs). */
+static const struct fd_perfcntr_countable pc_countables[] = {
+       COUNTABLE(PERF_PC_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_WORKING_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STALL_CYCLES_VFD, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STALL_CYCLES_TSE, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STALL_CYCLES_VPC, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STALL_CYCLES_TESS, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STALL_CYCLES_TSE_ONLY, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STALL_CYCLES_VPC_ONLY, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_PASS1_TF_STALL_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_INDEX, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_TESS_FACTOR, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_VIZ_STREAM, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_POSITION, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_STARVE_CYCLES_DI, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_VIS_STREAMS_LOADED, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_INSTANCES, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_VPC_PRIMITIVES, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_DEAD_PRIM, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_LIVE_PRIM, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_VERTEX_HITS, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_IA_VERTICES, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_IA_PRIMITIVES, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_GS_PRIMITIVES, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_HS_INVOCATIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_DS_INVOCATIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_VS_INVOCATIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_GS_INVOCATIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_DS_PRIMITIVES, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_VPC_POS_DATA_TRANSACTION, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_3D_DRAWCALLS, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_2D_DRAWCALLS, UINT64, AVERAGE),
+       COUNTABLE(PERF_PC_NON_DRAWCALL_GLOBAL_EVENTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TESS_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TESS_WORKING_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TESS_STALL_CYCLES_PC, UINT64, AVERAGE),
+       COUNTABLE(PERF_TESS_STARVE_CYCLES_PC, UINT64, AVERAGE),
+};
+
+/* RB counter group. */
+static const struct fd_perfcntr_counter rb_counters[] = {
+       COUNTER(RB_PERFCTR_RB_SEL_0, RBBM_PERFCTR_RB_0_LO, RBBM_PERFCTR_RB_0_HI),
+       COUNTER(RB_PERFCTR_RB_SEL_1, RBBM_PERFCTR_RB_1_LO, RBBM_PERFCTR_RB_1_HI),
+       COUNTER(RB_PERFCTR_RB_SEL_2, RBBM_PERFCTR_RB_2_LO, RBBM_PERFCTR_RB_2_HI),
+       COUNTER(RB_PERFCTR_RB_SEL_3, RBBM_PERFCTR_RB_3_LO, RBBM_PERFCTR_RB_3_HI),
+       COUNTER(RB_PERFCTR_RB_SEL_4, RBBM_PERFCTR_RB_4_LO, RBBM_PERFCTR_RB_4_HI),
+       COUNTER(RB_PERFCTR_RB_SEL_5, RBBM_PERFCTR_RB_5_LO, RBBM_PERFCTR_RB_5_HI),
+       COUNTER(RB_PERFCTR_RB_SEL_6, RBBM_PERFCTR_RB_6_LO, RBBM_PERFCTR_RB_6_HI),
+       COUNTER(RB_PERFCTR_RB_SEL_7, RBBM_PERFCTR_RB_7_LO, RBBM_PERFCTR_RB_7_HI),
+};
+
+/* RB events (depth/stencil/color pass statistics, blend, 2D path). */
+static const struct fd_perfcntr_countable rb_countables[] = {
+       COUNTABLE(PERF_RB_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_STALL_CYCLES_CCU, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_STALL_CYCLES_HLSQ, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_STALL_CYCLES_FIFO0_FULL, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_STALL_CYCLES_FIFO1_FULL, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_STALL_CYCLES_FIFO2_FULL, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_STARVE_CYCLES_SP, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_STARVE_CYCLES_LRZ_TILE, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_STARVE_CYCLES_CCU, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_STARVE_CYCLES_Z_PLANE, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_STARVE_CYCLES_BARY_PLANE, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_Z_WORKLOAD, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_HLSQ_ACTIVE, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_Z_READ, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_Z_WRITE, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_C_READ, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_C_WRITE, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_TOTAL_PASS, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_Z_PASS, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_Z_FAIL, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_S_FAIL, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_BLENDED_FXP_COMPONENTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_BLENDED_FP16_COMPONENTS, UINT64, AVERAGE),
+       // sic: reserved selector slot; name comes from the generated enum
+       COUNTABLE(RB_RESERVED, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_2D_ALIVE_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_2D_STALL_CYCLES_A2D, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_2D_STARVE_CYCLES_SRC, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_2D_STARVE_CYCLES_SP, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_2D_STARVE_CYCLES_DST, UINT64, AVERAGE),
+       COUNTABLE(PERF_RB_2D_VALID_PIXELS, UINT64, AVERAGE),
+};
+
+/* RBBM counter group; counter 0 is reserved for kernel use, same as CP/SP. */
+static const struct fd_perfcntr_counter rbbm_counters[] = {
+//RESERVED: for kernel
+//     COUNTER(RBBM_PERFCTR_RBBM_SEL_0, RBBM_PERFCTR_RBBM_0_LO, RBBM_PERFCTR_RBBM_0_HI),
+       COUNTER(RBBM_PERFCTR_RBBM_SEL_1, RBBM_PERFCTR_RBBM_1_LO, RBBM_PERFCTR_RBBM_1_HI),
+       COUNTER(RBBM_PERFCTR_RBBM_SEL_2, RBBM_PERFCTR_RBBM_2_LO, RBBM_PERFCTR_RBBM_2_HI),
+       COUNTER(RBBM_PERFCTR_RBBM_SEL_3, RBBM_PERFCTR_RBBM_3_LO, RBBM_PERFCTR_RBBM_3_HI),
+};
+
+/* RBBM events (top-level per-unit busy status counts). */
+static const struct fd_perfcntr_countable rbbm_countables[] = {
+       COUNTABLE(PERF_RBBM_ALWAYS_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_ALWAYS_ON, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_TSE_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_RAS_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_PC_DCALL_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_PC_VSD_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_STATUS_MASKED, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_COM_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_DCOM_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_VBIF_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_VSC_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_TESS_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_UCHE_BUSY, UINT64, AVERAGE),
+       COUNTABLE(PERF_RBBM_HLSQ_BUSY, UINT64, AVERAGE),
+};
+
+/* SP counter group; counter 0 is reserved for kernel use. */
+static const struct fd_perfcntr_counter sp_counters[] = {
+//RESERVED: for kernel
+//     COUNTER(SP_PERFCTR_SP_SEL_0,  RBBM_PERFCTR_SP_0_LO,  RBBM_PERFCTR_SP_0_HI),
+       COUNTER(SP_PERFCTR_SP_SEL_1,  RBBM_PERFCTR_SP_1_LO,  RBBM_PERFCTR_SP_1_HI),
+       COUNTER(SP_PERFCTR_SP_SEL_2,  RBBM_PERFCTR_SP_2_LO,  RBBM_PERFCTR_SP_2_HI),
+       COUNTER(SP_PERFCTR_SP_SEL_3,  RBBM_PERFCTR_SP_3_LO,  RBBM_PERFCTR_SP_3_HI),
+       COUNTER(SP_PERFCTR_SP_SEL_4,  RBBM_PERFCTR_SP_4_LO,  RBBM_PERFCTR_SP_4_HI),
+       COUNTER(SP_PERFCTR_SP_SEL_5,  RBBM_PERFCTR_SP_5_LO,  RBBM_PERFCTR_SP_5_HI),
+       COUNTER(SP_PERFCTR_SP_SEL_6,  RBBM_PERFCTR_SP_6_LO,  RBBM_PERFCTR_SP_6_HI),
+       COUNTER(SP_PERFCTR_SP_SEL_7,  RBBM_PERFCTR_SP_7_LO,  RBBM_PERFCTR_SP_7_HI),
+       COUNTER(SP_PERFCTR_SP_SEL_8,  RBBM_PERFCTR_SP_8_LO,  RBBM_PERFCTR_SP_8_HI),
+       COUNTER(SP_PERFCTR_SP_SEL_9,  RBBM_PERFCTR_SP_9_LO,  RBBM_PERFCTR_SP_9_HI),
+       COUNTER(SP_PERFCTR_SP_SEL_10, RBBM_PERFCTR_SP_10_LO, RBBM_PERFCTR_SP_10_HI),
+       COUNTER(SP_PERFCTR_SP_SEL_11, RBBM_PERFCTR_SP_11_LO, RBBM_PERFCTR_SP_11_HI),
+};
+
+/* SP events (shader core wave/instruction/memory-op statistics). */
+static const struct fd_perfcntr_countable sp_countables[] = {
+       COUNTABLE(PERF_SP_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_ALU_WORKING_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_EFU_WORKING_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_STALL_CYCLES_VPC, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_STALL_CYCLES_TP, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_STALL_CYCLES_RB, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_SCHEDULER_NON_WORKING, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_CONTEXTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_CONTEXT_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_FS_STAGE_WAVE_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_FS_STAGE_WAVE_SAMPLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_VS_STAGE_WAVE_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_VS_STAGE_WAVE_SAMPLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_FS_STAGE_DURATION_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_VS_STAGE_DURATION_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_CTRL_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_LOAD_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_EMIT_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_NOP_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_WAIT_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_FETCH_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_IDLE_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_END_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_LONG_SYNC_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_SHORT_SYNC_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_WAVE_JOIN_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_LM_LOAD_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_LM_STORE_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_LM_ATOMICS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_GM_LOAD_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_GM_STORE_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_GM_ATOMICS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_VS_STAGE_CFLOW_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_VS_STAGE_HALF_ALU_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_FS_STAGE_TEX_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_FS_STAGE_CFLOW_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_FS_STAGE_BARY_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_VS_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_FS_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_ADDR_LOCK_COUNT, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_UCHE_READ_TRANS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_UCHE_WRITE_TRANS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_EXPORT_VPC_TRANS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_EXPORT_RB_TRANS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_PIXELS_KILLED, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_ICL1_REQUESTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_ICL1_MISSES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_ICL0_REQUESTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_ICL0_MISSES, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_HS_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_DS_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_GS_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_CS_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_GPR_READ, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_GPR_WRITE, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_LM_CH0_REQUESTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_LM_CH1_REQUESTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_SP_LM_BANK_CONFLICTS, UINT64, AVERAGE),
+};
+
+/* TP counter group (selects live in the TPL1 register block). */
+static const struct fd_perfcntr_counter tp_counters[] = {
+       COUNTER(TPL1_PERFCTR_TP_SEL_0, RBBM_PERFCTR_TP_0_LO, RBBM_PERFCTR_TP_0_HI),
+       COUNTER(TPL1_PERFCTR_TP_SEL_1, RBBM_PERFCTR_TP_1_LO, RBBM_PERFCTR_TP_1_HI),
+       COUNTER(TPL1_PERFCTR_TP_SEL_2, RBBM_PERFCTR_TP_2_LO, RBBM_PERFCTR_TP_2_HI),
+       COUNTER(TPL1_PERFCTR_TP_SEL_3, RBBM_PERFCTR_TP_3_LO, RBBM_PERFCTR_TP_3_HI),
+       COUNTER(TPL1_PERFCTR_TP_SEL_4, RBBM_PERFCTR_TP_4_LO, RBBM_PERFCTR_TP_4_HI),
+       COUNTER(TPL1_PERFCTR_TP_SEL_5, RBBM_PERFCTR_TP_5_LO, RBBM_PERFCTR_TP_5_HI),
+       COUNTER(TPL1_PERFCTR_TP_SEL_6, RBBM_PERFCTR_TP_6_LO, RBBM_PERFCTR_TP_6_HI),
+       COUNTER(TPL1_PERFCTR_TP_SEL_7, RBBM_PERFCTR_TP_7_LO, RBBM_PERFCTR_TP_7_HI),
+};
+
+/* TP events (texture fetch/filtering and cache statistics). */
+static const struct fd_perfcntr_countable tp_countables[] = {
+       COUNTABLE(PERF_TP_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_LATENCY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_LATENCY_TRANS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_FLAG_CACHE_REQUEST_SAMPLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_FLAG_CACHE_REQUEST_LATENCY, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_L1_CACHELINE_REQUESTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_L1_CACHELINE_MISSES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_SP_TP_TRANS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_TP_SP_TRANS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_OUTPUT_PIXELS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_FILTER_WORKLOAD_16BIT, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_FILTER_WORKLOAD_32BIT, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_QUADS_RECEIVED, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_QUADS_OFFSET, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_QUADS_SHADOW, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_QUADS_ARRAY, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_QUADS_GRADIENT, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_QUADS_1D, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_QUADS_2D, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_QUADS_BUFFER, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_QUADS_3D, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_QUADS_CUBE, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_STATE_CACHE_REQUESTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_STATE_CACHE_MISSES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_DIVERGENT_QUADS_RECEIVED, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_BINDLESS_STATE_CACHE_REQUESTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_BINDLESS_STATE_CACHE_MISSES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_PRT_NON_RESIDENT_EVENTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_OUTPUT_PIXELS_POINT, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_OUTPUT_PIXELS_BILINEAR, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_OUTPUT_PIXELS_MIP, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_OUTPUT_PIXELS_ANISO, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_OUTPUT_PIXELS_ZERO_LOD, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_FLAG_CACHE_REQUESTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_FLAG_CACHE_MISSES, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_L1_5_L2_REQUESTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS_POINT, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS_BILINEAR, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_2D_FILTER_WORKLOAD_16BIT, UINT64, AVERAGE),
+       COUNTABLE(PERF_TP_2D_FILTER_WORKLOAD_32BIT, UINT64, AVERAGE),
+};
+
+/* UCHE counter group. */
+static const struct fd_perfcntr_counter uche_counters[] = {
+       COUNTER(UCHE_PERFCTR_UCHE_SEL_0, RBBM_PERFCTR_UCHE_0_LO, RBBM_PERFCTR_UCHE_0_HI),
+       COUNTER(UCHE_PERFCTR_UCHE_SEL_1, RBBM_PERFCTR_UCHE_1_LO, RBBM_PERFCTR_UCHE_1_HI),
+       COUNTER(UCHE_PERFCTR_UCHE_SEL_2, RBBM_PERFCTR_UCHE_2_LO, RBBM_PERFCTR_UCHE_2_HI),
+       COUNTER(UCHE_PERFCTR_UCHE_SEL_3, RBBM_PERFCTR_UCHE_3_LO, RBBM_PERFCTR_UCHE_3_HI),
+       COUNTER(UCHE_PERFCTR_UCHE_SEL_4, RBBM_PERFCTR_UCHE_4_LO, RBBM_PERFCTR_UCHE_4_HI),
+       COUNTER(UCHE_PERFCTR_UCHE_SEL_5, RBBM_PERFCTR_UCHE_5_LO, RBBM_PERFCTR_UCHE_5_HI),
+       COUNTER(UCHE_PERFCTR_UCHE_SEL_6, RBBM_PERFCTR_UCHE_6_LO, RBBM_PERFCTR_UCHE_6_HI),
+       COUNTER(UCHE_PERFCTR_UCHE_SEL_7, RBBM_PERFCTR_UCHE_7_LO, RBBM_PERFCTR_UCHE_7_HI),
+};
+
+/* UCHE events (cache traffic broken down by requesting client and bank). */
+static const struct fd_perfcntr_countable uche_countables[] = {
+       COUNTABLE(PERF_UCHE_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_STALL_CYCLES_VBIF, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_VBIF_LATENCY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_VBIF_LATENCY_SAMPLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_TP, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_VFD, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_HLSQ, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_LRZ, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_SP, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_READ_REQUESTS_TP, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_READ_REQUESTS_VFD, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_READ_REQUESTS_HLSQ, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_READ_REQUESTS_LRZ, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_READ_REQUESTS_SP, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_WRITE_REQUESTS_LRZ, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_WRITE_REQUESTS_SP, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_WRITE_REQUESTS_VPC, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_WRITE_REQUESTS_VSC, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_EVICTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_BANK_REQ0, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_BANK_REQ1, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_BANK_REQ2, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_BANK_REQ3, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_BANK_REQ4, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_BANK_REQ5, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_BANK_REQ6, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_BANK_REQ7, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_CH0, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_CH1, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_GMEM_READ_BEATS, UINT64, AVERAGE),
+       COUNTABLE(PERF_UCHE_FLAG_COUNT, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter vfd_counters[] = {
+       COUNTER(VFD_PERFCTR_VFD_SEL_0, RBBM_PERFCTR_VFD_0_LO, RBBM_PERFCTR_VFD_0_HI),
+       COUNTER(VFD_PERFCTR_VFD_SEL_1, RBBM_PERFCTR_VFD_1_LO, RBBM_PERFCTR_VFD_1_HI),
+       COUNTER(VFD_PERFCTR_VFD_SEL_2, RBBM_PERFCTR_VFD_2_LO, RBBM_PERFCTR_VFD_2_HI),
+       COUNTER(VFD_PERFCTR_VFD_SEL_3, RBBM_PERFCTR_VFD_3_LO, RBBM_PERFCTR_VFD_3_HI),
+       COUNTER(VFD_PERFCTR_VFD_SEL_4, RBBM_PERFCTR_VFD_4_LO, RBBM_PERFCTR_VFD_4_HI),
+       COUNTER(VFD_PERFCTR_VFD_SEL_5, RBBM_PERFCTR_VFD_5_LO, RBBM_PERFCTR_VFD_5_HI),
+       COUNTER(VFD_PERFCTR_VFD_SEL_6, RBBM_PERFCTR_VFD_6_LO, RBBM_PERFCTR_VFD_6_HI),
+       COUNTER(VFD_PERFCTR_VFD_SEL_7, RBBM_PERFCTR_VFD_7_LO, RBBM_PERFCTR_VFD_7_HI),
+};
+
+static const struct fd_perfcntr_countable vfd_countables[] = {
+       COUNTABLE(PERF_VFD_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_STALL_CYCLES_VPC_ALLOC, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_STALL_CYCLES_MISS_VB, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_STALL_CYCLES_MISS_Q, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_STALL_CYCLES_SP_INFO, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_STALL_CYCLES_SP_ATTR, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_STALL_CYCLES_VFDP_VB, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_STALL_CYCLES_VFDP_Q, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_DECODER_PACKER_STALL, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_STARVE_CYCLES_UCHE, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_RBUFFER_FULL, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_ATTR_INFO_FIFO_FULL, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_DECODED_ATTRIBUTE_BYTES, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_NUM_ATTRIBUTES, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_INSTRUCTIONS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_UPPER_SHADER_FIBERS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_LOWER_SHADER_FIBERS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_MODE_0_FIBERS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_MODE_1_FIBERS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_MODE_2_FIBERS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_MODE_3_FIBERS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_MODE_4_FIBERS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_TOTAL_VERTICES, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_NUM_ATTR_MISS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFD_1_BURST_REQ, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD_INDEX, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD_PROG, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFDP_STARVE_CYCLES_PC, UINT64, AVERAGE),
+       COUNTABLE(PERF_VFDP_VS_STAGE_32_WAVES, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter vpc_counters[] = {
+       COUNTER(VPC_PERFCTR_VPC_SEL_0, RBBM_PERFCTR_VPC_0_LO, RBBM_PERFCTR_VPC_0_HI),
+       COUNTER(VPC_PERFCTR_VPC_SEL_1, RBBM_PERFCTR_VPC_1_LO, RBBM_PERFCTR_VPC_1_HI),
+       COUNTER(VPC_PERFCTR_VPC_SEL_2, RBBM_PERFCTR_VPC_2_LO, RBBM_PERFCTR_VPC_2_HI),
+       COUNTER(VPC_PERFCTR_VPC_SEL_3, RBBM_PERFCTR_VPC_3_LO, RBBM_PERFCTR_VPC_3_HI),
+};
+
+static const struct fd_perfcntr_countable vpc_countables[] = {
+       COUNTABLE(PERF_VPC_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_WORKING_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_STALL_CYCLES_VFD_WACK, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_STALL_CYCLES_HLSQ_PRIM_ALLOC, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_STALL_CYCLES_PC, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_STALL_CYCLES_SP_LM, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_POS_EXPORT_STALL_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_STARVE_CYCLES_SP, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_STARVE_CYCLES_LRZ, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_PC_PRIMITIVES, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_SP_COMPONENTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_SP_LM_PRIMITIVES, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_SP_LM_COMPONENTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_SP_LM_DWORDS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_STREAMOUT_COMPONENTS, UINT64, AVERAGE),
+       COUNTABLE(PERF_VPC_GRANT_PHASES, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter vsc_counters[] = {
+       COUNTER(VSC_PERFCTR_VSC_SEL_0, RBBM_PERFCTR_VSC_0_LO, RBBM_PERFCTR_VSC_0_HI),
+       COUNTER(VSC_PERFCTR_VSC_SEL_1, RBBM_PERFCTR_VSC_1_LO, RBBM_PERFCTR_VSC_1_HI),
+};
+
+static const struct fd_perfcntr_countable vsc_countables[] = {
+       COUNTABLE(PERF_VSC_BUSY_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_VSC_WORKING_CYCLES, UINT64, AVERAGE),
+       COUNTABLE(PERF_VSC_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+       COUNTABLE(PERF_VSC_EOT_NUM, UINT64, AVERAGE),
+};
+
+/* VBIF counters are probably not too useful for userspace, and they make
+ * frameretrace take many more passes to collect all the metrics, so
+ * for now let's hide them.
+ */
+#if 0
+/* VBIF counters break the pattern a bit, with enable and clear regs: */
+static const struct fd_perfcntr_counter vbif_counters[] = {
+       COUNTER2(VBIF_PERF_CNT_SEL0, VBIF_PERF_CNT_LOW0, VBIF_PERF_CNT_HIGH0, VBIF_PERF_CNT_EN0, VBIF_PERF_CNT_CLR0),
+       COUNTER2(VBIF_PERF_CNT_SEL1, VBIF_PERF_CNT_LOW1, VBIF_PERF_CNT_HIGH1, VBIF_PERF_CNT_EN1, VBIF_PERF_CNT_CLR1),
+       COUNTER2(VBIF_PERF_CNT_SEL2, VBIF_PERF_CNT_LOW2, VBIF_PERF_CNT_HIGH2, VBIF_PERF_CNT_EN2, VBIF_PERF_CNT_CLR2),
+       COUNTER2(VBIF_PERF_CNT_SEL3, VBIF_PERF_CNT_LOW3, VBIF_PERF_CNT_HIGH3, VBIF_PERF_CNT_EN3, VBIF_PERF_CNT_CLR3),
+};
+
+static const struct fd_perfcntr_countable vbif_countables[] = {
+       COUNTABLE(AXI_READ_REQUESTS_ID_0, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_1, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_2, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_3, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_4, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_5, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_6, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_7, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_8, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_9, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_10, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_11, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_12, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_13, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_14, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_ID_15, UINT64, AVERAGE),
+       COUNTABLE(AXI0_READ_REQUESTS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI1_READ_REQUESTS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI2_READ_REQUESTS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI3_READ_REQUESTS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_REQUESTS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_0, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_1, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_2, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_3, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_4, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_5, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_6, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_7, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_8, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_9, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_10, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_11, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_12, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_13, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_14, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_ID_15, UINT64, AVERAGE),
+       COUNTABLE(AXI0_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI1_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI2_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI3_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI_TOTAL_REQUESTS, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_0, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_1, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_2, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_3, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_4, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_5, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_6, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_7, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_8, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_9, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_10, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_11, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_12, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_13, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_14, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_ID_15, UINT64, AVERAGE),
+       COUNTABLE(AXI0_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI1_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI2_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI3_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_0, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_1, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_2, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_3, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_4, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_5, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_6, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_7, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_8, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_9, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_10, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_11, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_12, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_13, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_14, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_ID_15, UINT64, AVERAGE),
+       COUNTABLE(AXI0_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI1_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI2_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI3_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+       COUNTABLE(AXI_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+};
+#endif
+
+const struct fd_perfcntr_group a5xx_perfcntr_groups[] = {
+       GROUP("CP", cp_counters, cp_countables),
+       GROUP("CCU", ccu_counters, ccu_countables),
+       GROUP("TSE", tse_counters, tse_countables),
+       GROUP("RAS", ras_counters, ras_countables),
+       GROUP("LRZ", lrz_counters, lrz_countables),
+       GROUP("HLSQ", hlsq_counters, hlsq_countables),
+       GROUP("PC", pc_counters, pc_countables),
+       GROUP("RB", rb_counters, rb_countables),
+       GROUP("RBBM", rbbm_counters, rbbm_countables),
+       GROUP("SP", sp_counters, sp_countables),
+       GROUP("TP", tp_counters, tp_countables),
+       GROUP("UCHE", uche_counters, uche_countables),
+       GROUP("VFD", vfd_counters, vfd_countables),
+       GROUP("VPC", vpc_counters, vpc_countables),
+       GROUP("VSC", vsc_counters, vsc_countables),
+//     GROUP("VBIF", vbif_counters, vbif_countables),
+};
+
+const unsigned a5xx_num_perfcntr_groups = ARRAY_SIZE(a5xx_perfcntr_groups);
+
+#endif /* FD5_PERFCNTR_H_ */
index 946076235bb43e184a8072f838c13f724443dd27..b438c7a5634570a604b6b118de2c363953278516 100644 (file)
@@ -39,11 +39,17 @@ struct PACKED fd5_query_sample {
        uint64_t stop;
 };
 
-#define query_sample(aq, field)                 \
+/* offset of a single field of an array of fd5_query_sample: */
+#define query_sample_idx(aq, idx, field)        \
        fd_resource((aq)->prsc)->bo,                \
+       (idx * sizeof(struct fd5_query_sample)) +   \
        offsetof(struct fd5_query_sample, field),   \
        0, 0
 
+/* offset of a single field of fd5_query_sample: */
+#define query_sample(aq, field)                 \
+       query_sample_idx(aq, 0, field)
+
 /*
  * Occlusion Query:
  *
@@ -246,6 +252,201 @@ static const struct fd_acc_sample_provider timestamp = {
                .result = timestamp_accumulate_result,
 };
 
+/*
+ * Performance Counter (batch) queries:
+ *
+ * Only one of these is active at a time, per design of the gallium
+ * batch_query API.  One perfcntr query tracks N query_types, each of
+ * which has a 'fd_batch_query_entry' that maps it back to the
+ * associated group and counter.
+ */
+
+struct fd_batch_query_entry {
+       uint8_t gid;        /* group-id */
+       uint8_t cid;        /* countable-id within the group */
+};
+
+struct fd_batch_query_data {
+       struct fd_screen *screen;
+       unsigned num_query_entries;
+       struct fd_batch_query_entry query_entries[];
+};
+
+static void
+perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch)
+{
+       struct fd_batch_query_data *data = aq->query_data;
+       struct fd_screen *screen = data->screen;
+       struct fd_ringbuffer *ring = batch->draw;
+
+       unsigned counters_per_group[screen->num_perfcntr_groups];
+       memset(counters_per_group, 0, sizeof(counters_per_group));
+
+       fd_wfi(batch, ring);
+
+       /* configure performance counters for the requested queries: */
+       for (unsigned i = 0; i < data->num_query_entries; i++) {
+               struct fd_batch_query_entry *entry = &data->query_entries[i];
+               const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
+               unsigned counter_idx = counters_per_group[entry->gid]++;
+
+               debug_assert(counter_idx < g->num_counters);
+
+               OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1);
+               OUT_RING(ring, g->countables[entry->cid].selector);
+       }
+
+       memset(counters_per_group, 0, sizeof(counters_per_group));
+
+       /* and snapshot the start values */
+       for (unsigned i = 0; i < data->num_query_entries; i++) {
+               struct fd_batch_query_entry *entry = &data->query_entries[i];
+               const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
+               unsigned counter_idx = counters_per_group[entry->gid]++;
+               const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];
+
+               OUT_PKT7(ring, CP_REG_TO_MEM, 3);
+               OUT_RING(ring, CP_REG_TO_MEM_0_64B |
+                       CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
+               OUT_RELOCW(ring, query_sample_idx(aq, i, start));
+       }
+}
+
+static void
+perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch)
+{
+       struct fd_batch_query_data *data = aq->query_data;
+       struct fd_screen *screen = data->screen;
+       struct fd_ringbuffer *ring = batch->draw;
+
+       unsigned counters_per_group[screen->num_perfcntr_groups];
+       memset(counters_per_group, 0, sizeof(counters_per_group));
+
+       fd_wfi(batch, ring);
+
+       /* TODO do we need to bother to turn anything off? */
+
+       /* snapshot the end values: */
+       for (unsigned i = 0; i < data->num_query_entries; i++) {
+               struct fd_batch_query_entry *entry = &data->query_entries[i];
+               const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
+               unsigned counter_idx = counters_per_group[entry->gid]++;
+               const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];
+
+               OUT_PKT7(ring, CP_REG_TO_MEM, 3);
+               OUT_RING(ring, CP_REG_TO_MEM_0_64B |
+                       CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
+               OUT_RELOCW(ring, query_sample_idx(aq, i, stop));
+       }
+
+       /* and compute the result: */
+       for (unsigned i = 0; i < data->num_query_entries; i++) {
+               /* result += stop - start: */
+               OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
+               OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
+                               CP_MEM_TO_MEM_0_NEG_C);
+               OUT_RELOCW(ring, query_sample_idx(aq, i, result));     /* dst */
+               OUT_RELOC(ring, query_sample_idx(aq, i, result));      /* srcA */
+               OUT_RELOC(ring, query_sample_idx(aq, i, stop));        /* srcB */
+               OUT_RELOC(ring, query_sample_idx(aq, i, start));       /* srcC */
+       }
+}
+
+static void
+perfcntr_accumulate_result(struct fd_acc_query *aq, void *buf,
+               union pipe_query_result *result)
+{
+       struct fd_batch_query_data *data = aq->query_data;
+       struct fd5_query_sample *sp = buf;
+
+       for (unsigned i = 0; i < data->num_query_entries; i++) {
+               result->batch[i].u64 = sp[i].result;
+       }
+}
+
+static const struct fd_acc_sample_provider perfcntr = {
+               .query_type = FD_QUERY_FIRST_PERFCNTR,
+               .active = FD_STAGE_DRAW | FD_STAGE_CLEAR,
+               .resume = perfcntr_resume,
+               .pause = perfcntr_pause,
+               .result = perfcntr_accumulate_result,
+};
+
+static struct pipe_query *
+fd5_create_batch_query(struct pipe_context *pctx,
+               unsigned num_queries, unsigned *query_types)
+{
+       struct fd_context *ctx = fd_context(pctx);
+       struct fd_screen *screen = ctx->screen;
+       struct fd_query *q;
+       struct fd_acc_query *aq;
+       struct fd_batch_query_data *data;
+
+       data = CALLOC_VARIANT_LENGTH_STRUCT(fd_batch_query_data,
+                       num_queries * sizeof(data->query_entries[0]));
+
+       data->screen = screen;
+       data->num_query_entries = num_queries;
+
+       /* validate the requested query_types and ensure we don't try
+        * to request more query_types of a given group than we have
+        * counters:
+        */
+       unsigned counters_per_group[screen->num_perfcntr_groups];
+       memset(counters_per_group, 0, sizeof(counters_per_group));
+
+       for (unsigned i = 0; i < num_queries; i++) {
+               unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;
+
+               /* verify valid query_type, ie. is it actually a perfcntr? */
+               if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) ||
+                               (idx >= screen->num_perfcntr_queries)) {
+                       debug_printf("invalid batch query query_type: %u\n", query_types[i]);
+                       goto error;
+               }
+
+               struct fd_batch_query_entry *entry = &data->query_entries[i];
+               struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx];
+
+               entry->gid = pq->group_id;
+
+               /* the perfcntr_queries[] table flattens all the countables
+                * for each group in series, ie:
+                *
+                *   (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ...
+                *
+                * So to find the countable index just step back through the
+                * table to find the first entry with the same group-id.
+                */
+               while (pq > screen->perfcntr_queries) {
+                       pq--;
+                       if (pq->group_id == entry->gid)
+                               entry->cid++;
+               }
+
+               if (counters_per_group[entry->gid] >=
+                               screen->perfcntr_groups[entry->gid].num_counters) {
+                       debug_printf("too many counters for group %u\n", entry->gid);
+                       goto error;
+               }
+
+               counters_per_group[entry->gid]++;
+       }
+
+       q = fd_acc_create_query2(ctx, 0, &perfcntr);
+       aq = fd_acc_query(q);
+
+       /* sample buffer size is based on # of queries: */
+       aq->size = num_queries * sizeof(struct fd5_query_sample);
+       aq->query_data = data;
+
+       return (struct pipe_query *)q;
+
+error:
+       free(data);
+       return NULL;
+}
+
 void
 fd5_query_context_init(struct pipe_context *pctx)
 {
@@ -254,6 +455,8 @@ fd5_query_context_init(struct pipe_context *pctx)
        ctx->create_query = fd_acc_create_query;
        ctx->query_set_stage = fd_acc_query_set_stage;
 
+       pctx->create_batch_query = fd5_create_batch_query;
+
        fd_acc_query_register_provider(pctx, &occlusion_counter);
        fd_acc_query_register_provider(pctx, &occlusion_predicate);
        fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative);
index f441b0cc29f6a09e16feb37226fac70a13a65a45..37095be75361f3500080775ff85e5a5b2e2b7982 100644 (file)
@@ -116,6 +116,9 @@ fd5_screen_is_format_supported(struct pipe_screen *pscreen,
        return retval == usage;
 }
 
+extern const struct fd_perfcntr_group a5xx_perfcntr_groups[];
+extern const unsigned a5xx_num_perfcntr_groups;
+
 void
 fd5_screen_init(struct pipe_screen *pscreen)
 {
@@ -128,4 +131,9 @@ fd5_screen_init(struct pipe_screen *pscreen)
        screen->setup_slices = fd5_setup_slices;
        if (fd_mesa_debug & FD_DBG_TTILE)
                screen->tile_mode = fd5_tile_mode;
+
+       if (fd_mesa_debug & FD_DBG_PERFC) {
+               screen->perfcntr_groups = a5xx_perfcntr_groups;
+               screen->num_perfcntr_groups = a5xx_num_perfcntr_groups;
+       }
 }
index 531644c85e8423ded131c8b52ffd793403ea776f..65ad64c9efa13be441326699f9cbf3128577d9c0 100644 (file)
@@ -84,6 +84,7 @@ static const struct debug_named_value debug_options[] = {
                {"noblit",    FD_DBG_NOBLIT, "Disable blitter (fallback to generic blit path)"},
                {"hiprio",    FD_DBG_HIPRIO, "Force high-priority context"},
                {"ttile",     FD_DBG_TTILE,  "Enable texture tiling (a5xx)"},
+               {"perfcntrs", FD_DBG_PERFC,  "Expose performance counters"},
                DEBUG_NAMED_VALUE_END
 };
 
index d8dec0151d05c7a5dd7d0fbf847fc1a74485780c..10151aaa9e7335edb48d3231f635813b7ad6bbf7 100644 (file)
@@ -84,6 +84,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_NOBLIT  0x80000
 #define FD_DBG_HIPRIO 0x100000
 #define FD_DBG_TTILE  0x200000
+#define FD_DBG_PERFC  0x400000
 
 extern int fd_mesa_debug;
 extern bool fd_binning_enabled;
index 218756f542ac11a0794369fa45e613b85b9f9def..5cc19755ddccb165fdc550ff8ac8ecdc133d0f99 100644 (file)
@@ -169,6 +169,7 @@ files_libfreedreno = files(
   'a5xx/fd5_gmem.h',
   'a5xx/fd5_image.c',
   'a5xx/fd5_image.h',
+  'a5xx/fd5_perfcntr.c',
   'a5xx/fd5_program.c',
   'a5xx/fd5_program.h',
   'a5xx/fd5_query.c',