From ee624ced364bfd2f896809874ef3a808a11c5ecf Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Wed, 27 Mar 2013 23:39:06 +0100 Subject: [PATCH] nvc0: implement MP performance counters There's more, but this only adds (most) of the counters that are handled directly by the shader processors. The other counter domains are not handled on the multiprocessor and there are no FIFO object methods for configuring them. Instead, they have to be programmed by the kernel via PCOUNTER, and the interface for this isn't in place yet. --- src/gallium/drivers/nvc0/nvc0_context.h | 3 +- src/gallium/drivers/nvc0/nvc0_program.c | 6 +- src/gallium/drivers/nvc0/nvc0_query.c | 393 +++++++++++++++++++- src/gallium/drivers/nvc0/nvc0_screen.c | 6 + src/gallium/drivers/nvc0/nvc0_screen.h | 71 ++++ src/gallium/drivers/nvc0/nvc0_winsys.h | 2 + src/gallium/drivers/nvc0/nve4_compute.c | 11 +- src/gallium/drivers/nvc0/nve4_compute.xml.h | 70 ++++ 8 files changed, 556 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/nvc0/nvc0_context.h b/src/gallium/drivers/nvc0/nvc0_context.h index d9aa3788cb0..799d9b9460a 100644 --- a/src/gallium/drivers/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nvc0/nvc0_context.h @@ -84,7 +84,8 @@ #define NVC0_BIND_CP_GLOBAL 49 #define NVC0_BIND_CP_DESC 50 #define NVC0_BIND_CP_SCREEN 51 -#define NVC0_BIND_CP_COUNT 52 +#define NVC0_BIND_CP_QUERY 52 +#define NVC0_BIND_CP_COUNT 53 /* bufctx for other operations */ #define NVC0_BIND_2D 0 diff --git a/src/gallium/drivers/nvc0/nvc0_program.c b/src/gallium/drivers/nvc0/nvc0_program.c index 22dfaf91aa2..15f683c5295 100644 --- a/src/gallium/drivers/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nvc0/nvc0_program.c @@ -777,8 +777,8 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog) if (prog->mem) nouveau_heap_free(&prog->mem); - - FREE(prog->code); + if (prog->code) + FREE(prog->code); /* may be 0 for hardcoded shaders */ FREE(prog->immd_data); FREE(prog->relocs); if (prog->type == 
PIPE_SHADER_COMPUTE && prog->cp.syms) @@ -807,5 +807,5 @@ nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label) for (i = 0; i < prog->cp.num_syms; ++i) if (syms[i].label == label) return prog->code_base + base + syms[i].offset; - return ~0; + return prog->code_base; /* no symbols or symbol not found */ } diff --git a/src/gallium/drivers/nvc0/nvc0_query.c b/src/gallium/drivers/nvc0/nvc0_query.c index d329148de62..5c4431e5af5 100644 --- a/src/gallium/drivers/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nvc0/nvc0_query.c @@ -26,6 +26,7 @@ #include "nvc0_context.h" #include "nouveau/nv_object.xml.h" +#include "nve4_compute.xml.h" #define NVC0_QUERY_STATE_READY 0 #define NVC0_QUERY_STATE_ACTIVE 1 @@ -36,6 +37,7 @@ struct nvc0_query { uint32_t *data; uint16_t type; uint16_t index; + int8_t ctr[4]; uint32_t sequence; struct nouveau_bo *bo; uint32_t base; @@ -49,6 +51,11 @@ struct nvc0_query { #define NVC0_QUERY_ALLOC_SPACE 256 +static void nve4_mp_pm_query_begin(struct nvc0_context *, struct nvc0_query *); +static void nve4_mp_pm_query_end(struct nvc0_context *, struct nvc0_query *); +static boolean nve4_mp_pm_query_result(struct nvc0_context *, + struct nvc0_query *, void *, boolean); + static INLINE struct nvc0_query * nvc0_query(struct pipe_query *pipe) { @@ -132,6 +139,16 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type) space = 16; break; default: + if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS && + nvc0->screen->base.device->drm_version >= 0x01000101) { + if (type >= NVE4_PM_QUERY(0) && + type <= NVE4_PM_QUERY_MAX) { + /* 8 counters per MP + clock */ + space = 12 * nvc0->screen->mp_count * sizeof(uint32_t); + break; + } + } + debug_printf("invalid query type: %u\n", type); FREE(q); return NULL; } @@ -244,6 +261,8 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq) nvc0_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */ break; default: + if (q->type >= NVE4_PM_QUERY(0) && q->type <= 
NVE4_PM_QUERY_MAX) + nve4_mp_pm_query_begin(nvc0, q); break; } q->state = NVC0_QUERY_STATE_ACTIVE; @@ -314,7 +333,8 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq) nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5)); break; default: - assert(0); + if (q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_MAX) + nve4_mp_pm_query_end(nvc0, q); break; } } @@ -343,6 +363,9 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, uint64_t *data64 = (uint64_t *)q->data; unsigned i; + if (q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_MAX) + return nve4_mp_pm_query_result(nvc0, q, result, wait); + if (q->state != NVC0_QUERY_STATE_READY) nvc0_query_update(nvc0->screen->base.client, q); @@ -399,6 +422,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, res32[0] = q->data[1]; break; default: + assert(0); /* can't happen, we don't create queries with invalid type */ return FALSE; } @@ -513,6 +537,373 @@ nvc0_so_target_save_offset(struct pipe_context *pipe, nvc0_query_end(pipe, targ->pq); } + +/* === PERFORMANCE MONITORING COUNTERS === */ + +/* Code to read out MP counters: They are accessible via mmio, too, but let's + * just avoid mapping registers in userspace. We'd have to know which MPs are + * enabled/present, too, and that information is not presently exposed. + * We could add a kernel interface for it, but reading the counters like this + * has the advantage of being async (if get_result isn't called immediately). 
+ */ +static const uint64_t nve4_read_mp_pm_counters_code[] = +{ + 0x2042004270420047ULL, /* sched */ + 0x2800400000001de4ULL, /* mov b32 $r0 c0[0] (04) */ + 0x2c0000000c009c04ULL, /* mov b32 $r2 $physid (20) */ + 0x2800400010005de4ULL, /* mov b32 $r1 c0[4] (04) */ + 0x2c0000008400dc04ULL, /* mov b32 $r3 $tidx (27) */ + 0x7000c01050209c03ULL, /* ext u32 $r2 $r2 0x0414 (04) */ + 0x2c00000010011c04ULL, /* mov b32 $r4 $pm0 (20) */ + 0x190e0000fc33dc03ULL, /* set $p1 eq u32 $r3 0 (04) */ + 0x2280428042804277ULL, /* sched */ + 0x2c00000014015c04ULL, /* mov b32 $r5 $pm1 (27) */ + 0x10000000c0209c02ULL, /* mul $r2 u32 $r2 u32 48 (04) */ + 0x2c00000018019c04ULL, /* mov b32 $r6 $pm2 (28) */ + 0x4801000008001c03ULL, /* add b32 ($r0 $c) $r0 $r2 (04) */ + 0x2c0000001c01dc04ULL, /* mov b32 $r7 $pm3 (28) */ + 0x0800000000105c42ULL, /* add b32 $r1 $r1 0 $c (04) */ + 0x2c00000140009c04ULL, /* mov b32 $r2 $clock (28) */ + 0x2042804200420047ULL, /* sched */ + 0x94000000000107c5ULL, /* $p1 st b128 wt g[$r0d] $r4q (04) */ + 0x2c00000020011c04ULL, /* mov b32 $r4 $pm4 (20) */ + 0x2c00000024015c04ULL, /* mov b32 $r5 $pm5 (04) */ + 0x2c00000028019c04ULL, /* mov b32 $r6 $pm6 (20) */ + 0x2c0000002c01dc04ULL, /* mov b32 $r7 $pm7 (04) */ + 0x2c0000014400dc04ULL, /* mov b32 $r3 $clockhi (28) */ + 0x94000000400107c5ULL, /* $p1 st b128 wt g[$r0d+16] $r4q (04) */ + 0x200002e042804207ULL, /* sched */ + 0x2800400020011de4ULL, /* mov b32 $r4 c0[8] (20) */ + 0x2c0000000c015c04ULL, /* mov b32 $r5 $physid (04) */ + 0x94000000800087a5ULL, /* $p1 st b64 wt g[$r0d+32] $r2d (28) */ + 0x94000000a00107a5ULL, /* $p1 st b64 wt g[$r0d+40] $r4d (04) */ + 0x8000000000001de7ULL /* exit (2e) */ +}; + +/* NOTE: intentionally using the same names as NV */ +static const char *nve4_pm_query_names[] = +{ + /* MP counters */ + "prof_trigger_00", + "prof_trigger_01", + "prof_trigger_02", + "prof_trigger_03", + "prof_trigger_04", + "prof_trigger_05", + "prof_trigger_06", + "prof_trigger_07", + "warps_launched", + 
"threads_launched", + "sm_cta_launched", + "inst_issued1", + "inst_issued2", + "inst_executed", + "local_load", + "local_store", + "shared_load", + "shared_store", + "l1_local_load_hit", + "l1_local_load_miss", + "l1_local_store_hit", + "l1_local_store_miss", + "gld_request", + "gst_request", + "l1_global_load_hit", + "l1_global_load_miss", + "uncached_global_load_transaction", + "global_store_transaction", + "branch", + "divergent_branch", + "active_warps", + "active_cycles" +}; + +/* For simplicity, we will allocate as many group slots as we allocate counter + * slots. This means that a single counter which wants to source from 2 groups + * will have to be declared as using 2 counter slots. This shouldn't really be + * a problem because such queries don't make much sense ... (unless someone is + * really creative). + */ +struct nve4_mp_counter_cfg +{ + uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */ + uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */ + uint32_t pad : 3; + uint32_t sig_dom : 1; /* if 0, MP_PM_A, if 1, MP_PM_B */ + uint32_t sig_sel : 8; /* signal group */ + uint32_t src_sel : 32; /* signal selection for up to 5 sources */ +}; + +struct nve4_mp_pm_query_cfg +{ + struct nve4_mp_counter_cfg ctr[4]; + uint8_t num_counters; + uint8_t op; /* PIPE_LOGICOP_CLEAR(for ADD),OR,AND */ +}; + +#define _Q1A(n, f, m, g, s) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, PIPE_LOGICOP_CLEAR } +#define _Q1B(n, f, m, g, s) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, PIPE_LOGICOP_CLEAR } + +static const struct nve4_mp_pm_query_cfg nve4_mp_pm_queries[] = +{ + _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000), + _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004), + _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008), + _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c), + _Q1A(PROF_TRIGGER_4, 
0x0001, B6, USER, 0x00000010), + _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014), + _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018), + _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c), + _Q1A(LAUNCHED_WARPS, 0x0001, B6, LAUNCH, 0x00000004), + _Q1A(LAUNCHED_THREADS, 0x003f, B6, LAUNCH, 0x398a4188), + _Q1B(LAUNCHED_CTA, 0x0001, B6, WARP, 0x0000001c), + _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004), + _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008), + _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398), + _Q1A(LD_SHARED, 0x0001, B6, LDST, 0x00000000), + _Q1A(ST_SHARED, 0x0001, B6, LDST, 0x00000004), + _Q1A(LD_LOCAL, 0x0001, B6, LDST, 0x00000008), + _Q1A(ST_LOCAL, 0x0001, B6, LDST, 0x0000000c), + _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010), + _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014), + _Q1B(L1_LOCAL_LOAD_HIT, 0x0001, B6, L1, 0x00000000), + _Q1B(L1_LOCAL_LOAD_MISS, 0x0001, B6, L1, 0x00000004), + _Q1B(L1_LOCAL_STORE_HIT, 0x0001, B6, L1, 0x00000008), + _Q1B(L1_LOCAL_STORE_MISS, 0x0001, B6, L1, 0x0000000c), + _Q1B(L1_GLOBAL_LOAD_HIT, 0x0001, B6, L1, 0x00000010), + _Q1B(L1_GLOBAL_LOAD_MISS, 0x0001, B6, L1, 0x00000014), + _Q1B(GLD_TRANSACTIONS_UNCACHED, 0x0001, B6, MEM, 0x00000000), + _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004), + _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c), + _Q1A(BRANCH_DIVERGENT, 0x0001, B6, BRANCH, 0x00000010), + _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x398a4188), + _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000004) +}; + +#undef _Q1A +#undef _Q1B + +void +nve4_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + const struct nve4_mp_pm_query_cfg *cfg; + unsigned i, c; + unsigned num_ab[2] = { 0, 0 }; + + cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; + + /* check if we have enough free counter slots */ + for (i = 0; i < cfg->num_counters; ++i) + num_ab[cfg->ctr[i].sig_dom]++; + + if 
(screen->pm.num_mp_pm_active[0] + num_ab[0] > 4 || + screen->pm.num_mp_pm_active[1] + num_ab[1] > 4) { + NOUVEAU_ERR("Not enough free MP counter slots !\n"); + return; + } + + assert(cfg->num_counters <= 4); + PUSH_SPACE(push, 4 * 8 + 6); + + if (!screen->pm.mp_counters_enabled) { + screen->pm.mp_counters_enabled = TRUE; + BEGIN_NVC0(push, SUBC_SW(0x06ac), 1); + PUSH_DATA (push, 0x1fcb); + } + + /* set sequence field to 0 (used to check if result is available) */ + for (i = 0; i < screen->mp_count; ++i) + q->data[i * 10 + 10] = 0; + + for (i = 0; i < cfg->num_counters; ++i) { + const unsigned d = cfg->ctr[i].sig_dom; + + if (!screen->pm.num_mp_pm_active[d]) { + uint32_t m = (1 << 22) | (1 << (7 + (8 * !d))); + if (screen->pm.num_mp_pm_active[!d]) + m |= 1 << (7 + (8 * d)); + BEGIN_NVC0(push, SUBC_SW(0x0600), 1); + PUSH_DATA (push, m); + } + screen->pm.num_mp_pm_active[d]++; + + for (c = d * 4; c < (d * 4 + 4); ++c) { + if (!screen->pm.mp_counter[c]) { + q->ctr[i] = c; + screen->pm.mp_counter[c] = (struct pipe_query *)q; + break; + } + } + assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */ + + /* configure and reset the counter(s) */ + if (d == 0) + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1); + else + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1); + PUSH_DATA (push, cfg->ctr[i].sig_sel); + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1); + PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3)); + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1); + PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1); + PUSH_DATA (push, 0); + } +} + +static void +nve4_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nvc0_screen *screen = nvc0->screen; + struct pipe_context *pipe = &nvc0->base.pipe; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + uint32_t mask; + uint32_t input[3]; + const uint block[3] = { 32, 1, 1 }; + const uint grid[3] 
= { screen->mp_count, 1, 1 }; + unsigned c; + const struct nve4_mp_pm_query_cfg *cfg; + + cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; + + if (unlikely(!screen->pm.prog)) { + struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); /* NOTE(review): result is dereferenced below without a NULL check */ + prog->type = PIPE_SHADER_COMPUTE; + prog->translated = TRUE; + prog->num_gprs = 8; + prog->code = (uint32_t *)nve4_read_mp_pm_counters_code; + prog->code_size = sizeof(nve4_read_mp_pm_counters_code); + prog->parm_size = 12; /* 3 x uint32_t, see input[] below */ + screen->pm.prog = prog; + } + + /* disable all counting */ + PUSH_SPACE(push, 8); + for (c = 0; c < 8; ++c) + if (screen->pm.mp_counter[c]) + IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0); + /* release counters for this query */ + for (c = 0; c < 8; ++c) { + if (nvc0_query(screen->pm.mp_counter[c]) == q) { + screen->pm.num_mp_pm_active[c / 4]--; + screen->pm.mp_counter[c] = NULL; + } + } + + BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR, + q->bo); + + pipe->bind_compute_state(pipe, screen->pm.prog); /* NOTE(review): the previously bound compute state is not restored in this function - confirm callers rebind */ + input[0] = (q->bo->offset + q->base); + input[1] = (q->bo->offset + q->base) >> 32; + input[2] = q->sequence; + pipe->launch_grid(pipe, block, grid, 0, input); + + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY); + + /* re-activate other counters */ + PUSH_SPACE(push, 16); + mask = 0; + for (c = 0; c < 8; ++c) { + unsigned i; + q = nvc0_query(screen->pm.mp_counter[c]); /* q now walks the remaining slot owners, no longer the ended query */ + if (!q) + continue; + cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; + for (i = 0; i < cfg->num_counters; ++i) { + if (mask & (1 << q->ctr[i])) /* this owner's counters were already re-armed via an earlier slot */ + break; + mask |= 1 << q->ctr[i]; + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(q->ctr[i])), 1); + PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); + } + } +} +/* Combine query q's counters over all MPs (ADD, OR or AND, per cfg->op) into *(uint64_t *)result. Returns FALSE if an MP record does not carry q->sequence yet and wait is FALSE, or if waiting on the buffer fails; TRUE otherwise. */ +static boolean +nve4_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, + void *result, boolean wait) +{ + uint32_t count[4]; + uint64_t value = 0; + unsigned p, c; + const struct nve4_mp_pm_query_cfg *cfg; + + cfg = &nve4_mp_pm_queries[q->type -
PIPE_QUERY_DRIVER_SPECIFIC]; + + for (p = 0; p < nvc0->screen->mp_count_compute; ++p) { + uint64_t clock; + const unsigned b = p * 12; /* 12 dwords per MP: counters at 0-7, clock at 8-9, sequence at 10 */ + + clock = *(uint64_t *)&q->data[b + 8]; + (void)clock; /* might be interesting one day */ + + if (q->data[b + 10] != q->sequence) { + /* WARNING: This will spin forever if you loop with wait == FALSE and + * the push buffer hasn't been flushed ! + */ + if (!wait) + return FALSE; + if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client)) + return FALSE; + } + + for (c = 0; c < cfg->num_counters; ++c) + count[c] = q->data[b + q->ctr[c]]; + for (; c < 4; ++c) + count[c] = 0; + + switch (cfg->op) { + case PIPE_LOGICOP_AND: + value &= count[0] & count[1] & count[2] & count[3]; /* NOTE(review): value starts at 0 and unused count[] slots are 0, so this always yields 0 */ + break; + case PIPE_LOGICOP_OR: + value |= count[0] | count[1] | count[2] | count[3]; + break; + case PIPE_LOGICOP_CLEAR: /* abused as ADD */ + default: + value += (uint64_t)count[0] + count[1] + count[2] + count[3]; /* widen first so the per-MP sum cannot wrap at 32 bits */ + break; + } + } + *(uint64_t *)result = value; + return TRUE; +} +/* With info == NULL, return the number of driver-specific queries (NVE4_PM_QUERY_COUNT on NVE4+ with drm_version >= 0x01000101, otherwise 0). With info set, fill it for query id and return 1, or fill in a placeholder entry and return 0 if id is out of range. */ +int +nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, + unsigned id, + struct pipe_driver_query_info *info) +{ + struct nvc0_screen *screen = nvc0_screen(pscreen); + + if (screen->base.class_3d >= NVE4_3D_CLASS) { + unsigned count = 0; + if (screen->base.device->drm_version >= 0x01000101) + count = NVE4_PM_QUERY_COUNT; + if (!info) + return count; + if (id < count) { + info->name = nve4_pm_query_names[id]; + info->query_type = NVE4_PM_QUERY(id); + info->max_value = ~0ULL; + info->uses_byte_units = FALSE; + return 1; + } + } else { + if (!info) + return 0; + } + /* user asked for info about non-existing query */ + info->name = "this_is_not_the_query_you_are_looking_for"; + info->query_type = 0xdeadd01d; + info->max_value = 0; + info->uses_byte_units = FALSE; + return 0; +} + void nvc0_init_query_functions(struct nvc0_context *nvc0) { diff --git a/src/gallium/drivers/nvc0/nvc0_screen.c b/src/gallium/drivers/nvc0/nvc0_screen.c index bf353c48251..5b9385ad724 100644 ---
a/src/gallium/drivers/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nvc0/nvc0_screen.c @@ -352,6 +352,10 @@ nvc0_screen_destroy(struct pipe_screen *pscreen) if (screen->blitter) nvc0_blitter_destroy(screen); + if (screen->pm.prog) { + screen->pm.prog->code = NULL; /* hardcoded, don't FREE */ + nvc0_program_destroy(NULL, screen->pm.prog); + } nouveau_bo_ref(NULL, &screen->text); nouveau_bo_ref(NULL, &screen->uniform_bo); @@ -581,6 +585,7 @@ nvc0_screen_create(struct nouveau_device *dev) pscreen->get_param = nvc0_screen_get_param; pscreen->get_shader_param = nvc0_screen_get_shader_param; pscreen->get_paramf = nvc0_screen_get_paramf; + pscreen->get_driver_query_info = nvc0_screen_get_driver_query_info; nvc0_screen_init_resource_functions(pscreen); @@ -785,6 +790,7 @@ nvc0_screen_create(struct nouveau_device *dev) value = (16 << 8) | 4; } screen->mp_count = value >> 8; + screen->mp_count_compute = screen->mp_count; nvc0_screen_resize_tls_area(screen, 128 * 16, 0, 0x200); diff --git a/src/gallium/drivers/nvc0/nvc0_screen.h b/src/gallium/drivers/nvc0/nvc0_screen.h index 13dc83e7e8a..b7cfd05a2c0 100644 --- a/src/gallium/drivers/nvc0/nvc0_screen.h +++ b/src/gallium/drivers/nvc0/nvc0_screen.h @@ -39,6 +39,7 @@ struct nvc0_screen { struct nouveau_bo *poly_cache; uint16_t mp_count; + uint16_t mp_count_compute; /* magic reg can make compute use fewer MPs */ struct nouveau_heap *text_heap; struct nouveau_heap *lib_code; /* allocated from text_heap */ @@ -62,6 +63,13 @@ struct nvc0_screen { uint32_t *map; } fence; + struct { + struct nvc0_program *prog; /* compute state object to read MP counters */ + struct pipe_query *mp_counter[8]; /* counter to query allocation */ + uint8_t num_mp_pm_active[2]; + boolean mp_counters_enabled; + } pm; + struct nouveau_mman *mm_VRAM_fe0; struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */ @@ -76,6 +84,69 @@ nvc0_screen(struct pipe_screen *screen) return (struct nvc0_screen *)screen; } + +/* Performance counter queries: + */ 
+#define NVE4_PM_QUERY_COUNT 32 +#define NVE4_PM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) +#define NVE4_PM_QUERY_MAX NVE4_PM_QUERY(NVE4_PM_QUERY_COUNT - 1) +/* MP (NOTE: these are also used to index a table, so put them first) */ +#define NVE4_PM_QUERY_PROF_TRIGGER_0 0 +#define NVE4_PM_QUERY_PROF_TRIGGER_1 1 +#define NVE4_PM_QUERY_PROF_TRIGGER_2 2 +#define NVE4_PM_QUERY_PROF_TRIGGER_3 3 +#define NVE4_PM_QUERY_PROF_TRIGGER_4 4 +#define NVE4_PM_QUERY_PROF_TRIGGER_5 5 +#define NVE4_PM_QUERY_PROF_TRIGGER_6 6 +#define NVE4_PM_QUERY_PROF_TRIGGER_7 7 +#define NVE4_PM_QUERY_LAUNCHED_WARPS 8 +#define NVE4_PM_QUERY_LAUNCHED_THREADS 9 +#define NVE4_PM_QUERY_LAUNCHED_CTA 10 +#define NVE4_PM_QUERY_INST_ISSUED1 11 +#define NVE4_PM_QUERY_INST_ISSUED2 12 +#define NVE4_PM_QUERY_INST_EXECUTED 13 +#define NVE4_PM_QUERY_LD_LOCAL 14 +#define NVE4_PM_QUERY_ST_LOCAL 15 +#define NVE4_PM_QUERY_LD_SHARED 16 +#define NVE4_PM_QUERY_ST_SHARED 17 +#define NVE4_PM_QUERY_L1_LOCAL_LOAD_HIT 18 +#define NVE4_PM_QUERY_L1_LOCAL_LOAD_MISS 19 +#define NVE4_PM_QUERY_L1_LOCAL_STORE_HIT 20 +#define NVE4_PM_QUERY_L1_LOCAL_STORE_MISS 21 +#define NVE4_PM_QUERY_GLD_REQUEST 22 +#define NVE4_PM_QUERY_GST_REQUEST 23 +#define NVE4_PM_QUERY_L1_GLOBAL_LOAD_HIT 24 +#define NVE4_PM_QUERY_L1_GLOBAL_LOAD_MISS 25 +#define NVE4_PM_QUERY_GLD_TRANSACTIONS_UNCACHED 26 +#define NVE4_PM_QUERY_GST_TRANSACTIONS 27 +#define NVE4_PM_QUERY_BRANCH 28 +#define NVE4_PM_QUERY_BRANCH_DIVERGENT 29 +#define NVE4_PM_QUERY_ACTIVE_WARPS 30 +#define NVE4_PM_QUERY_ACTIVE_CYCLES 31 +/* Engines (PCOUNTER) */ +/* +#define NVE4_PM_QUERY_GR_IDLE 50 +#define NVE4_PM_QUERY_BSP_IDLE 51 +#define NVE4_PM_QUERY_VP_IDLE 52 +#define NVE4_PM_QUERY_PPP_IDLE 53 +#define NVE4_PM_QUERY_CE0_IDLE 54 +#define NVE4_PM_QUERY_CE1_IDLE 55 +#define NVE4_PM_QUERY_CE2_IDLE 56 +*/ +/* L2 queries (PCOUNTER) */ +/* +#define NVE4_PM_QUERY_L2_SUBP_WRITE_L1_SECTOR_QUERIES 57 +... +*/ +/* TEX queries (PCOUNTER) */ +/* +#define NVE4_PM_QUERY_TEX0_CACHE_SECTOR_QUERIES 58 +... 
+*/ + +int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned, + struct pipe_driver_query_info *); + boolean nvc0_blitter_create(struct nvc0_screen *); void nvc0_blitter_destroy(struct nvc0_screen *); diff --git a/src/gallium/drivers/nvc0/nvc0_winsys.h b/src/gallium/drivers/nvc0/nvc0_winsys.h index c13ebd5fb58..25183a45f20 100644 --- a/src/gallium/drivers/nvc0/nvc0_winsys.h +++ b/src/gallium/drivers/nvc0/nvc0_winsys.h @@ -65,6 +65,8 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) #define SUBC_COPY(m) 4, (m) #define NVE4_COPY(m) SUBC_COPY(NVE4_COPY_##n) +#define SUBC_SW(m) 7, (m) + static INLINE uint32_t NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size) { diff --git a/src/gallium/drivers/nvc0/nve4_compute.c b/src/gallium/drivers/nvc0/nve4_compute.c index 943ae78b479..89da7d568cb 100644 --- a/src/gallium/drivers/nvc0/nve4_compute.c +++ b/src/gallium/drivers/nvc0/nve4_compute.c @@ -27,7 +27,9 @@ #include "nv50/codegen/nv50_ir_driver.h" +#ifdef DEBUG static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *); +#endif int @@ -477,7 +479,10 @@ nve4_launch_grid(struct pipe_context *pipe, goto out; nve4_compute_setup_launch_desc(nvc0, desc, label, block_layout, grid_layout); - nve4_compute_dump_launch_desc(desc); +#ifdef DEBUG + if (debug_get_num_option("NV50_PROG_DEBUG", 0)) + nve4_compute_dump_launch_desc(desc); +#endif nve4_compute_upload_input(nvc0, input, block_layout, grid_layout); @@ -589,6 +594,7 @@ static const char *nve4_cache_split_name(unsigned value) } } +#ifdef DEBUG static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc) { @@ -635,7 +641,9 @@ nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc) i, address, size, valid ? 
"" : " (invalid)"); } } +#endif +#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER static void nve4_compute_trap_info(struct nvc0_context *nvc0) { @@ -667,3 +675,4 @@ nve4_compute_trap_info(struct nvc0_context *nvc0) } info->lock = 0; } +#endif diff --git a/src/gallium/drivers/nvc0/nve4_compute.xml.h b/src/gallium/drivers/nvc0/nve4_compute.xml.h index 2f110f57657..9a774668239 100644 --- a/src/gallium/drivers/nvc0/nve4_compute.xml.h +++ b/src/gallium/drivers/nvc0/nve4_compute.xml.h @@ -199,6 +199,76 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NVE4_COMPUTE_UNK260c 0x0000260c +#define NVE4_COMPUTE_MP_PM_SET(i0) (0x0000335c + 0x4*(i0)) +#define NVE4_COMPUTE_MP_PM_SET__ESIZE 0x00000004 +#define NVE4_COMPUTE_MP_PM_SET__LEN 0x00000008 + +#define NVE4_COMPUTE_MP_PM_A_SIGSEL(i0) (0x0000337c + 0x4*(i0)) +#define NVE4_COMPUTE_MP_PM_A_SIGSEL__ESIZE 0x00000004 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL__LEN 0x00000004 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_NONE 0x00000000 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_USER 0x00000001 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_LAUNCH 0x00000003 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_EXEC 0x00000004 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_ISSUE 0x00000005 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_LDST 0x0000001b +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_BRANCH 0x0000001c + +#define NVE4_COMPUTE_MP_PM_B_SIGSEL(i0) (0x0000338c + 0x4*(i0)) +#define NVE4_COMPUTE_MP_PM_B_SIGSEL__ESIZE 0x00000004 +#define NVE4_COMPUTE_MP_PM_B_SIGSEL__LEN 0x00000004 +#define NVE4_COMPUTE_MP_PM_B_SIGSEL_NONE 0x00000000 +#define NVE4_COMPUTE_MP_PM_B_SIGSEL_WARP 0x00000002 +#define NVE4_COMPUTE_MP_PM_B_SIGSEL_L1 0x00000010 +#define NVE4_COMPUTE_MP_PM_B_SIGSEL_MEM 0x00000011 + +#define NVE4_COMPUTE_MP_PM_SRCSEL(i0) (0x0000339c + 0x4*(i0)) +#define NVE4_COMPUTE_MP_PM_SRCSEL__ESIZE 0x00000004 +#define NVE4_COMPUTE_MP_PM_SRCSEL__LEN 0x00000008 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP0__MASK 0x00000003 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP0__SHIFT 0 +#define 
NVE4_COMPUTE_MP_PM_SRCSEL_SIG0__MASK 0x0000001c +#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG0__SHIFT 2 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP1__MASK 0x00000060 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP1__SHIFT 5 +#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG1__MASK 0x00000380 +#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG1__SHIFT 7 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP2__MASK 0x00000c00 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP2__SHIFT 10 +#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG2__MASK 0x00007000 +#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG2__SHIFT 12 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP3__MASK 0x00018000 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP3__SHIFT 15 +#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG3__MASK 0x000e0000 +#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG3__SHIFT 17 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP4__MASK 0x00300000 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP4__SHIFT 20 +#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG4__MASK 0x01c00000 +#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG4__SHIFT 22 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP5__MASK 0x06000000 +#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP5__SHIFT 25 +#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG5__MASK 0x38000000 +#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG5__SHIFT 27 + +#define NVE4_COMPUTE_MP_PM_FUNC(i0) (0x000033bc + 0x4*(i0)) +#define NVE4_COMPUTE_MP_PM_FUNC__ESIZE 0x00000004 +#define NVE4_COMPUTE_MP_PM_FUNC__LEN 0x00000008 +#define NVE4_COMPUTE_MP_PM_FUNC_MODE__MASK 0x0000000f +#define NVE4_COMPUTE_MP_PM_FUNC_MODE__SHIFT 0 +#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP 0x00000000 +#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_PULSE 0x00000001 +#define NVE4_COMPUTE_MP_PM_FUNC_MODE_B6 0x00000002 +#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK3 0x00000003 +#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_B6 0x00000004 +#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_B6_PULSE 0x00000005 +#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK6 0x00000006 +#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK7 0x00000007 +#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK8 0x00000008 +#define NVE4_COMPUTE_MP_PM_FUNC_FUNC__MASK 0x000ffff0 +#define 
NVE4_COMPUTE_MP_PM_FUNC_FUNC__SHIFT 4 + +#define NVE4_COMPUTE_MP_PM_UNK33DC 0x000033dc + #define NVE4_COMPUTE_LAUNCH_DESC__SIZE 0x00000100 #define NVE4_COMPUTE_LAUNCH_DESC_6 0x00000018 #define NVE4_COMPUTE_LAUNCH_DESC_6_NOTIFY__MASK 0x00000c00 -- 2.30.2