ac: import lp_create_builder() from gallivm
[mesa.git] / src / gallium / drivers / radeonsi / si_perfcounter.c
index 7ee1daee7bfff541b017afe55363c53ea25dac91..1cf004dff83b8827c8d52d2d41998585a072dfb6 100644 (file)
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * Authors:
- *  Nicolai Hähnle <nicolai.haehnle@amd.com>
- *
  */
 
 #include "radeon/r600_cs.h"
 #include "radeon/r600_query.h"
-#include "radeon/r600_pipe_common.h"
 #include "util/u_memory.h"
 
 #include "si_pipe.h"
@@ -56,6 +51,8 @@ enum si_pc_reg_layout {
 
        /* Registers are laid out in decreasing rather than increasing order. */
        SI_PC_REG_REVERSE = 4,
+
+       SI_PC_FAKE = 8,
 };
 
 struct si_pc_block_base {
@@ -79,6 +76,23 @@ struct si_pc_block {
        unsigned instances;
 };
 
+/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
+ * performance counter group IDs.
+ */
+static const char * const si_pc_shader_type_suffixes[] = {
+       "", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS"
+};
+
+static const unsigned si_pc_shader_type_bits[] = { /* entry i pairs with si_pc_shader_type_suffixes[i] */
+       0x7f, /* pairs with the empty suffix; NOTE(review): presumably every stage-enable bit - confirm */
+       S_036780_ES_EN(1),
+       S_036780_GS_EN(1),
+       S_036780_VS_EN(1),
+       S_036780_PS_EN(1),
+       S_036780_LS_EN(1),
+       S_036780_HS_EN(1),
+       S_036780_CS_EN(1),
+};
 
 static struct si_pc_block_base cik_CB = {
        .name = "CB",
@@ -189,6 +203,7 @@ static struct si_pc_block_base cik_PA_SC = {
        .layout = SI_PC_MULTI_ALTERNATE,
 };
 
+/* According to docs, PA_SU counters are only 48 bits wide. */
 static struct si_pc_block_base cik_PA_SU = {
        .name = "PA_SU",
        .num_counters = 4,
@@ -308,56 +323,104 @@ static struct si_pc_block_base cik_WD = {
        .counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
 };
 
+static struct si_pc_block_base cik_MC = {
+       .name = "MC",
+       .num_counters = 4,
+
+       .layout = SI_PC_FAKE, /* no registers: select emission is skipped and reads store an immediate 0 */
+};
+
+static struct si_pc_block_base cik_SRBM = {
+       .name = "SRBM",
+       .num_counters = 2,
+
+       .layout = SI_PC_FAKE, /* no registers: select emission is skipped and reads store an immediate 0 */
+};
+
 /* Both the number of instances and selectors varies between chips of the same
  * class. We only differentiate by class here and simply expose the maximum
  * number over all chips in a class.
+ *
+ * Unfortunately, GPUPerfStudio uses the order of performance counter groups
+ * blindly once it believes it has identified the hardware, so the order of
+ * blocks here matters.
  */
 static struct si_pc_block groups_CIK[] = {
        { &cik_CB, 226, 4 },
-       { &cik_CPC, 22 },
        { &cik_CPF, 17 },
-       { &cik_CPG, 46 },
        { &cik_DB, 257, 4 },
-       { &cik_GDS, 121 },
        { &cik_GRBM, 34 },
        { &cik_GRBMSE, 15 },
-       { &cik_IA, 22 },
-       { &cik_PA_SC, 395 },
        { &cik_PA_SU, 153 },
+       { &cik_PA_SC, 395 },
        { &cik_SPI, 186 },
        { &cik_SQ, 252 },
        { &cik_SX, 32 },
        { &cik_TA, 111, 11 },
        { &cik_TCA, 39, 2 },
        { &cik_TCC, 160, 16 },
-       { &cik_TCP, 154, 11 },
        { &cik_TD, 55, 11 },
+       { &cik_TCP, 154, 11 },
+       { &cik_GDS, 121 },
        { &cik_VGT, 140 },
+       { &cik_IA, 22 },
+       { &cik_MC, 22 },
+       { &cik_SRBM, 19 },
        { &cik_WD, 22 },
+       { &cik_CPG, 46 },
+       { &cik_CPC, 22 },
+
 };
 
 static struct si_pc_block groups_VI[] = {
-       { &cik_CB, 396, 4 },
-       { &cik_CPC, 24 },
+       { &cik_CB, 405, 4 },
        { &cik_CPF, 19 },
-       { &cik_CPG, 48 },
        { &cik_DB, 257, 4 },
-       { &cik_GDS, 121 },
        { &cik_GRBM, 34 },
        { &cik_GRBMSE, 15 },
-       { &cik_IA, 24 },
-       { &cik_PA_SC, 397 },
        { &cik_PA_SU, 153 },
+       { &cik_PA_SC, 397 },
        { &cik_SPI, 197 },
        { &cik_SQ, 273 },
        { &cik_SX, 34 },
        { &cik_TA, 119, 16 },
        { &cik_TCA, 35, 2 },
        { &cik_TCC, 192, 16 },
-       { &cik_TCP, 180, 16 },
        { &cik_TD, 55, 16 },
+       { &cik_TCP, 180, 16 },
+       { &cik_GDS, 121 },
        { &cik_VGT, 147 },
+       { &cik_IA, 24 },
+       { &cik_MC, 22 },
+       { &cik_SRBM, 27 },
        { &cik_WD, 37 },
+       { &cik_CPG, 48 },
+       { &cik_CPC, 24 },
+
+};
+
+static struct si_pc_block groups_gfx9[] = { /* same block order as groups_CIK/groups_VI, without the MC and SRBM entries */
+       { &cik_CB, 438, 4 },
+       { &cik_CPF, 32 },
+       { &cik_DB, 328, 4 },
+       { &cik_GRBM, 38 },
+       { &cik_GRBMSE, 16 },
+       { &cik_PA_SU, 292 },
+       { &cik_PA_SC, 491 },
+       { &cik_SPI, 196 },
+       { &cik_SQ, 374 },
+       { &cik_SX, 208 },
+       { &cik_TA, 119, 16 },
+       { &cik_TCA, 35, 2 },
+       { &cik_TCC, 256, 16 },
+       { &cik_TD, 57, 16 },
+       { &cik_TCP, 85, 16 },
+       { &cik_GDS, 121 },
+       { &cik_VGT, 148 },
+       { &cik_IA, 32 },
+       { &cik_WD, 58 },
+       { &cik_CPG, 59 },
+       { &cik_CPC, 35 },
 };
 
 static void si_pc_get_size(struct r600_perfcounter_block *group,
@@ -368,7 +431,9 @@ static void si_pc_get_size(struct r600_perfcounter_block *group,
        struct si_pc_block_base *regs = sigroup->b;
        unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
 
-       if (layout_multi == SI_PC_MULTI_BLOCK) {
+       if (regs->layout & SI_PC_FAKE) {
+               *num_select_dw = 0;
+       } else if (layout_multi == SI_PC_MULTI_BLOCK) {
                if (count < regs->num_multi)
                        *num_select_dw = 2 * (count + 2) + regs->num_prelude;
                else
@@ -431,6 +496,9 @@ static void si_pc_emit_select(struct r600_common_context *ctx,
 
        assert(count <= regs->num_counters);
 
+       if (regs->layout & SI_PC_FAKE)
+               return;
+
        if (layout_multi == SI_PC_MULTI_BLOCK) {
                assert(!(regs->layout & SI_PC_REG_REVERSE));
 
@@ -530,7 +598,7 @@ static void si_pc_emit_start(struct r600_common_context *ctx,
        radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                               S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-       radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_START) | EVENT_INDEX(0));
+       radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
        radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                               S_036020_PERFMON_STATE(V_036020_START_COUNTING));
 }
@@ -542,37 +610,15 @@ static void si_pc_emit_stop(struct r600_common_context *ctx,
 {
        struct radeon_winsys_cs *cs = ctx->gfx.cs;
 
-       if (ctx->screen->chip_class == CIK) {
-               /* Workaround for cache flush problems: send two EOP events. */
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
-               radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) |
-                               EVENT_INDEX(5));
-               radeon_emit(cs, va);
-               radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
-               radeon_emit(cs, 0); /* immediate data */
-               radeon_emit(cs, 0); /* unused */
-       }
-
-       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
-       radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) |
-                       EVENT_INDEX(5));
-       radeon_emit(cs, va);
-       radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
-       radeon_emit(cs, 0); /* immediate data */
-       radeon_emit(cs, 0); /* unused */
-
-       radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
-       radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
-       radeon_emit(cs, va);
-       radeon_emit(cs, va >> 32);
-       radeon_emit(cs, 0); /* reference value */
-       radeon_emit(cs, 0xffffffff); /* mask */
-       radeon_emit(cs, 4); /* poll interval */
+       si_gfx_write_event_eop(ctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+                                EOP_DATA_SEL_VALUE_32BIT,
+                                buffer, va, 0, SI_NOT_QUERY);
+       si_gfx_wait_fence(ctx, va, 0, 0xffffffff);
 
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-       radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
+       radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-       radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_STOP) | EVENT_INDEX(0));
+       radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
        radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                               S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
                               S_036020_PERFMON_SAMPLE_ENABLE(1));
@@ -590,29 +636,44 @@ static void si_pc_emit_read(struct r600_common_context *ctx,
        unsigned reg = regs->counter0_lo;
        unsigned reg_delta = 8;
 
-       if (regs->layout & SI_PC_REG_REVERSE)
-               reg_delta = -reg_delta;
-
-       for (idx = 0; idx < count; ++idx) {
-               if (regs->counters)
-                       reg = regs->counters[idx];
-
-               radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-               radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
-                               COPY_DATA_DST_SEL(COPY_DATA_MEM));
-               radeon_emit(cs, reg >> 2);
-               radeon_emit(cs, 0); /* unused */
-               radeon_emit(cs, va);
-               radeon_emit(cs, va >> 32);
-               va += 4;
-               reg += reg_delta;
+       if (!(regs->layout & SI_PC_FAKE)) {
+               if (regs->layout & SI_PC_REG_REVERSE)
+                       reg_delta = -reg_delta;
+
+               for (idx = 0; idx < count; ++idx) {
+                       if (regs->counters)
+                               reg = regs->counters[idx];
+
+                       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+                       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
+                                       COPY_DATA_DST_SEL(COPY_DATA_MEM) |
+                                       COPY_DATA_COUNT_SEL); /* 64 bits */
+                       radeon_emit(cs, reg >> 2);
+                       radeon_emit(cs, 0); /* unused */
+                       radeon_emit(cs, va);
+                       radeon_emit(cs, va >> 32);
+                       va += sizeof(uint64_t);
+                       reg += reg_delta;
+               }
+       } else {
+               for (idx = 0; idx < count; ++idx) {
+                       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+                       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
+                                       COPY_DATA_DST_SEL(COPY_DATA_MEM) |
+                                       COPY_DATA_COUNT_SEL);
+                       radeon_emit(cs, 0); /* immediate */
+                       radeon_emit(cs, 0);
+                       radeon_emit(cs, va);
+                       radeon_emit(cs, va >> 32);
+                       va += sizeof(uint64_t);
+               }
        }
 }
 
-static void si_pc_cleanup(struct r600_common_screen *rscreen)
+static void si_pc_cleanup(struct si_screen *sscreen)
 {
-       r600_perfcounters_do_destroy(rscreen->perfcounters);
-       rscreen->perfcounters = NULL;
+       si_perfcounters_do_destroy(sscreen->perfcounters);
+       sscreen->perfcounters = NULL;
 }
 
 void si_init_perfcounters(struct si_screen *screen)
@@ -622,7 +683,7 @@ void si_init_perfcounters(struct si_screen *screen)
        unsigned num_blocks;
        unsigned i;
 
-       switch (screen->b.chip_class) {
+       switch (screen->info.chip_class) {
        case CIK:
                blocks = groups_CIK;
                num_blocks = ARRAY_SIZE(groups_CIK);
@@ -631,16 +692,20 @@ void si_init_perfcounters(struct si_screen *screen)
                blocks = groups_VI;
                num_blocks = ARRAY_SIZE(groups_VI);
                break;
+       case GFX9:
+               blocks = groups_gfx9;
+               num_blocks = ARRAY_SIZE(groups_gfx9);
+               break;
        case SI:
        default:
                return; /* not implemented */
        }
 
-       if (screen->b.info.max_sh_per_se != 1) {
+       if (screen->info.max_sh_per_se != 1) {
                /* This should not happen on non-SI chips. */
                fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
                        "supported (inaccurate performance counters)\n",
-                       screen->b.info.max_sh_per_se);
+                       screen->info.max_sh_per_se);
        }
 
        pc = CALLOC_STRUCT(r600_perfcounters);
@@ -648,13 +713,13 @@ void si_init_perfcounters(struct si_screen *screen)
                return;
 
        pc->num_start_cs_dwords = 14;
-       pc->num_stop_cs_dwords = 20;
+       pc->num_stop_cs_dwords = 14 + si_gfx_write_fence_dwords(screen);
        pc->num_instance_cs_dwords = 3;
        pc->num_shaders_cs_dwords = 4;
 
-       if (screen->b.chip_class == CIK) {
-               pc->num_stop_cs_dwords += 6;
-       }
+       pc->num_shader_types = ARRAY_SIZE(si_pc_shader_type_bits);
+       pc->shader_type_suffixes = si_pc_shader_type_suffixes;
+       pc->shader_type_bits = si_pc_shader_type_bits;
 
        pc->get_size = si_pc_get_size;
        pc->emit_instance = si_pc_emit_instance;
@@ -665,7 +730,7 @@ void si_init_perfcounters(struct si_screen *screen)
        pc->emit_read = si_pc_emit_read;
        pc->cleanup = si_pc_cleanup;
 
-       if (!r600_perfcounters_init(pc, num_blocks))
+       if (!si_perfcounters_init(pc, num_blocks))
                goto error;
 
        for (i = 0; i < num_blocks; ++i) {
@@ -673,11 +738,11 @@ void si_init_perfcounters(struct si_screen *screen)
                unsigned instances = block->instances;
 
                if (!strcmp(block->b->name, "IA")) {
-                       if (screen->b.info.max_se > 2)
+                       if (screen->info.max_se > 2)
                                instances = 2;
                }
 
-               r600_perfcounters_add_block(&screen->b, pc,
+               si_perfcounters_add_block(screen, pc,
                                            block->b->name,
                                            block->b->flags,
                                            block->b->num_counters,
@@ -686,9 +751,9 @@ void si_init_perfcounters(struct si_screen *screen)
                                            block);
        }
 
-       screen->b.perfcounters = pc;
+       screen->perfcounters = pc;
        return;
 
 error:
-       r600_perfcounters_do_destroy(pc);
+       si_perfcounters_do_destroy(pc);
 }