radeonsi/gfx10: implement most performance counters
author: Marek Olšák <marek.olsak@amd.com>
Fri, 22 May 2020 12:30:31 +0000 (08:30 -0400)
committer: Marek Olšák <marek.olsak@amd.com>
Tue, 26 May 2020 10:00:54 +0000 (06:00 -0400)
PAL has all of them.

GE perf counters don't work - no idea why.
I only tested the few that I like to use.

There is no documentation, though most of the enums had already been
in the headers.

Acked-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5184>

src/gallium/drivers/radeonsi/si_perfcounter.c

index a1a4eca1965c92fdcf2c5bfb486251a95c399b7c..d6b3fc85767679b1c456dc51c3a8ec3607253db5 100644 (file)
@@ -49,16 +49,40 @@ enum si_pc_reg_layout
 {
    /* All secondary selector dwords follow as one block after the primary
     * selector dwords for the counters that have secondary selectors.
+    *
+    * Example:
+    *    PERFCOUNTER0_SELECT
+    *    PERFCOUNTER1_SELECT
+    *    PERFCOUNTER0_SELECT1
+    *    PERFCOUNTER1_SELECT1
+    *    PERFCOUNTER2_SELECT
+    *    PERFCOUNTER3_SELECT
     */
    SI_PC_MULTI_BLOCK = 0,
 
-   /* Each secondary selector dword follows immediately afters the
+   /* Each secondary selector dword follows immediately after the
     * corresponding primary.
+    *
+    * Example:
+    *    PERFCOUNTER0_SELECT
+    *    PERFCOUNTER0_SELECT1
+    *    PERFCOUNTER1_SELECT
+    *    PERFCOUNTER1_SELECT1
+    *    PERFCOUNTER2_SELECT
+    *    PERFCOUNTER3_SELECT
     */
    SI_PC_MULTI_ALTERNATE = 1,
 
    /* All secondary selector dwords follow as one block after all primary
     * selector dwords.
+    *
+    * Example:
+    *    PERFCOUNTER0_SELECT
+    *    PERFCOUNTER1_SELECT
+    *    PERFCOUNTER2_SELECT
+    *    PERFCOUNTER3_SELECT
+    *    PERFCOUNTER0_SELECT1
+    *    PERFCOUNTER1_SELECT1
     */
    SI_PC_MULTI_TAIL = 2,
 
@@ -399,6 +423,168 @@ static struct si_pc_block_base cik_SRBM = {
    .layout = SI_PC_FAKE,
 };
 
+static struct si_pc_block_base gfx10_CHA = { /* perf counter block description for GFX10 "CHA" */
+   .name = "CHA",
+   .num_counters = 4,
+
+   .select0 = R_037780_CHA_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_035800_CHA_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 1, /* presumably: number of counters that also have SELECT1 - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* each SELECT1 dword directly follows its SELECT */
+};
+
+static struct si_pc_block_base gfx10_CHCG = { /* perf counter block description for GFX10 "CHCG" */
+   .name = "CHCG",
+   .num_counters = 4,
+
+   .select0 = R_036F18_CHCG_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_034F20_CHCG_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 1, /* presumably: number of counters that also have SELECT1 - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* each SELECT1 dword directly follows its SELECT */
+};
+
+static struct si_pc_block_base gfx10_CHC = { /* perf counter block description for GFX10 "CHC" */
+   .name = "CHC",
+   .num_counters = 4,
+
+   .select0 = R_036F00_CHC_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_034F00_CHC_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 1, /* presumably: number of counters that also have SELECT1 - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* each SELECT1 dword directly follows its SELECT */
+};
+
+static struct si_pc_block_base gfx10_GCR = { /* perf counter block description for GFX10 "GCR" */
+   .name = "GCR",
+   .num_counters = 2,
+
+   .select0 = R_037580_GCR_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_035480_GCR_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 1, /* presumably: number of counters that also have SELECT1 - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* each SELECT1 dword directly follows its SELECT */
+};
+
+static struct si_pc_block_base gfx10_GE = { /* perf counter block description for GFX10 "GE" */
+   .name = "GE", /* NOTE: the commit message reports GE counters as not working (cause unknown) */
+   .num_counters = 12,
+
+   .select0 = R_036200_GE_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_034200_GE_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 4, /* presumably: number of counters that also have SELECT1 - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* each SELECT1 dword directly follows its SELECT */
+};
+
+static struct si_pc_block_base gfx10_GL1A = { /* perf counter block description for GFX10 "GL1A" */
+   .name = "GL1A",
+   .num_counters = 4,
+   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED, /* NOTE(review): flag meanings defined elsewhere - confirm */
+
+   .select0 = R_037700_GL1A_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_035700_GL1A_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 1, /* presumably: number of counters that also have SELECT1 - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* each SELECT1 dword directly follows its SELECT */
+};
+
+static struct si_pc_block_base gfx10_GL1C = { /* perf counter block description for GFX10 "GL1C" */
+   .name = "GL1C",
+   .num_counters = 4,
+   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED, /* NOTE(review): flag meanings defined elsewhere - confirm */
+
+   .select0 = R_036E80_GL1C_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_034E80_GL1C_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 1, /* presumably: number of counters that also have SELECT1 - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* each SELECT1 dword directly follows its SELECT */
+};
+
+static struct si_pc_block_base gfx10_GL2A = { /* perf counter block description for GFX10 "GL2A" */
+   .name = "GL2A",
+   .num_counters = 4,
+
+   .select0 = R_036E40_GL2A_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_034E40_GL2A_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 2, /* presumably: number of counters that also have SELECT1 - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* each SELECT1 dword directly follows its SELECT */
+};
+
+static struct si_pc_block_base gfx10_GL2C = { /* perf counter block description for GFX10 "GL2C" */
+   .name = "GL2C",
+   .num_counters = 4,
+
+   .select0 = R_036E00_GL2C_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_034E00_GL2C_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 2, /* presumably: number of counters that also have SELECT1 - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* each SELECT1 dword directly follows its SELECT */
+};
+
+static unsigned gfx10_PA_PH_select[] = { /* PA_PH selector registers, listed explicitly: offsets are irregular */
+   R_037600_PA_PH_PERFCOUNTER0_SELECT,
+   R_037604_PA_PH_PERFCOUNTER0_SELECT1, /* counter 0: SELECT1 is adjacent to its SELECT */
+   R_037608_PA_PH_PERFCOUNTER1_SELECT,
+   R_037640_PA_PH_PERFCOUNTER1_SELECT1, /* counters 1-3: SELECT1 lives at 0x037640.., apart from SELECT */
+   R_03760C_PA_PH_PERFCOUNTER2_SELECT,
+   R_037644_PA_PH_PERFCOUNTER2_SELECT1,
+   R_037610_PA_PH_PERFCOUNTER3_SELECT,
+   R_037648_PA_PH_PERFCOUNTER3_SELECT1,
+   R_037614_PA_PH_PERFCOUNTER4_SELECT, /* counters 4-7 have no SELECT1 entry in this table */
+   R_037618_PA_PH_PERFCOUNTER5_SELECT,
+   R_03761C_PA_PH_PERFCOUNTER6_SELECT,
+   R_037620_PA_PH_PERFCOUNTER7_SELECT,
+};
+static struct si_pc_block_base gfx10_PA_PH = { /* perf counter block description for GFX10 "PA_PH" */
+   .name = "PA_PH",
+   .num_counters = 8,
+   .flags = SI_PC_BLOCK_SE, /* NOTE(review): flag meaning defined elsewhere - confirm */
+
+   .select = gfx10_PA_PH_select, /* explicit selector register table instead of .select0 */
+   .counter0_lo = R_035600_PA_PH_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 4, /* matches the four SELECT1 entries in gfx10_PA_PH_select */
+   .layout = SI_PC_MULTI_CUSTOM, /* presumably: selectors come from .select - TODO confirm */
+};
+
+static struct si_pc_block_base gfx10_PA_SU = { /* perf counter block description for GFX10 "PA_SU" */
+   .name = "PA_SU",
+   .num_counters = 4,
+   .flags = SI_PC_BLOCK_SE, /* NOTE(review): flag meaning defined elsewhere - confirm */
+
+   .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 4, /* presumably: number of counters that also have SELECT1 - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* each SELECT1 dword directly follows its SELECT */
+};
+
+static struct si_pc_block_base gfx10_RLC = { /* perf counter block description for GFX10 "RLC" */
+   .name = "RLC",
+   .num_counters = 2,
+
+   .select0 = R_037304_RLC_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_035200_RLC_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 0, /* presumably: no counter has a SELECT1 register - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* NOTE(review): likely irrelevant when num_multi is 0 - confirm */
+};
+
+static struct si_pc_block_base gfx10_RMI = { /* perf counter block description for GFX10 "RMI" */
+   .name = "RMI", /* si_init_perfcounters matches this name to set num_instances = max_se */
+   /* Actually 4, but the 2nd counter is missing the secondary selector while
+    * the 3rd counter has it, which complicates the register layout. */
+   .num_counters = 2,
+   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, /* NOTE(review): flag meanings defined elsewhere - confirm */
+
+   .select0 = R_037400_RMI_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_035300_RMI_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 1, /* only counter 0 of the two exposed counters has SELECT1 (see note above) */
+   .layout = SI_PC_MULTI_ALTERNATE, /* each SELECT1 dword directly follows its SELECT */
+};
+
+static struct si_pc_block_base gfx10_UTCL1 = { /* perf counter block description for GFX10 "UTCL1" */
+   .name = "UTCL1",
+   .num_counters = 2,
+   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED, /* NOTE(review): flag meanings defined elsewhere - confirm */
+
+   .select0 = R_03758C_UTCL1_PERFCOUNTER0_SELECT, /* selector register of counter 0 */
+   .counter0_lo = R_035470_UTCL1_PERFCOUNTER0_LO, /* LO result register of counter 0 */
+   .num_multi = 0, /* presumably: no counter has a SELECT1 register - TODO confirm */
+   .layout = SI_PC_MULTI_ALTERNATE, /* NOTE(review): likely irrelevant when num_multi is 0 - confirm */
+};
+
 /* Both the number of instances and selectors vary between chips of the same
  * class. We only differentiate by class here and simply expose the maximum
  * number over all chips in a class.
@@ -433,6 +619,38 @@ static struct si_pc_block_gfxdescr groups_gfx9[] = {
    {&cik_CPC, 35},
 };
 
+static struct si_pc_block_gfxdescr groups_gfx10[] = { /* block list used by si_init_perfcounters for GFX10 */
+   {&cik_CB, 461}, /* {block base, N}: N is presumably the selector count - TODO confirm */
+   {&gfx10_CHA, 45},
+   {&gfx10_CHCG, 35},
+   {&gfx10_CHC, 35},
+   {&cik_CPC, 47},
+   {&cik_CPF, 40},
+   {&cik_CPG, 82},
+   {&cik_DB, 370},
+   {&gfx10_GCR, 94},
+   {&cik_GDS, 123},
+   {&gfx10_GE, 315},
+   {&gfx10_GL1A, 36},
+   {&gfx10_GL1C, 64},
+   {&gfx10_GL2A, 91},
+   {&gfx10_GL2C, 235},
+   {&cik_GRBM, 47},
+   {&cik_GRBMSE, 19},
+   {&gfx10_PA_PH, 960},
+   {&cik_PA_SC, 552},
+   {&gfx10_PA_SU, 266},
+   {&gfx10_RLC, 7},
+   {&gfx10_RMI, 258},
+   {&cik_SPI, 329},
+   {&cik_SQ, 509},
+   {&cik_SX, 225},
+   {&cik_TA, 226},
+   {&cik_TCP, 77},
+   {&cik_TD, 61},
+   {&gfx10_UTCL1, 15},
+};
+
 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc,
                                           const struct si_pc_block *block)
 {
@@ -494,6 +712,11 @@ static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
       value |= S_030800_SE_BROADCAST_WRITES(1);
    }
 
+   if (sctx->chip_class >= GFX10) {
+      /* TODO: Expose counters from each shader array separately if needed. */
+      value |= S_030800_SA_BROADCAST_WRITES(1);
+   }
+
    if (instance >= 0) {
       value |= S_030800_INSTANCE_INDEX(instance);
    } else {
@@ -1214,19 +1437,15 @@ void si_init_perfcounters(struct si_screen *screen)
       blocks = groups_gfx9;
       num_blocks = ARRAY_SIZE(groups_gfx9);
       break;
+   case GFX10:
+      blocks = groups_gfx10;
+      num_blocks = ARRAY_SIZE(groups_gfx10);
+      break;
    case GFX6:
    default:
       return; /* not implemented */
    }
 
-   if (screen->info.max_sh_per_se != 1) {
-      /* This should not happen on non-GFX6 chips. */
-      fprintf(stderr,
-              "si_init_perfcounters: max_sh_per_se = %d not "
-              "supported (inaccurate performance counters)\n",
-              screen->info.max_sh_per_se);
-   }
-
    screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
    if (!pc)
       return;
@@ -1247,7 +1466,9 @@ void si_init_perfcounters(struct si_screen *screen)
       block->b = &blocks[i];
       block->num_instances = MAX2(1, block->b->instances);
 
-      if (!strcmp(block->b->b->name, "CB") || !strcmp(block->b->b->name, "DB"))
+      if (!strcmp(block->b->b->name, "CB") ||
+          !strcmp(block->b->b->name, "DB") ||
+          !strcmp(block->b->b->name, "RMI"))
          block->num_instances = screen->info.max_se;
       else if (!strcmp(block->b->b->name, "TCC"))
          block->num_instances = screen->info.num_tcc_blocks;