nvc0: implement MP performance counters
author Christoph Bumiller <e0425955@student.tuwien.ac.at>
Wed, 27 Mar 2013 22:39:06 +0000 (23:39 +0100)
committer Christoph Bumiller <e0425955@student.tuwien.ac.at>
Thu, 28 Mar 2013 23:33:01 +0000 (00:33 +0100)
There are more counters, but this only adds most of those that are
handled directly by the shader processors.
The other counter domains are not handled on the multiprocessor and
there are no FIFO object methods for configuring them.
Instead, they have to be programmed by the kernel via PCOUNTER, and
the interface for this isn't in place yet.

src/gallium/drivers/nvc0/nvc0_context.h
src/gallium/drivers/nvc0/nvc0_program.c
src/gallium/drivers/nvc0/nvc0_query.c
src/gallium/drivers/nvc0/nvc0_screen.c
src/gallium/drivers/nvc0/nvc0_screen.h
src/gallium/drivers/nvc0/nvc0_winsys.h
src/gallium/drivers/nvc0/nve4_compute.c
src/gallium/drivers/nvc0/nve4_compute.xml.h

index d9aa3788cb04d6343e7435ec0d198faf5495f4c6..799d9b9460a1740b951015b927c7528d18e5c694 100644 (file)
@@ -84,7 +84,8 @@
 #define NVC0_BIND_CP_GLOBAL      49
 #define NVC0_BIND_CP_DESC        50
 #define NVC0_BIND_CP_SCREEN      51
-#define NVC0_BIND_CP_COUNT       52
+#define NVC0_BIND_CP_QUERY       52
+#define NVC0_BIND_CP_COUNT       53
 
 /* bufctx for other operations */
 #define NVC0_BIND_2D            0
index 22dfaf91aa2ad787893f782423288c8a49eca9d4..15f683c5295a81baca8aaf51ed34fb844f8d6a44 100644 (file)
@@ -777,8 +777,8 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
 
    if (prog->mem)
       nouveau_heap_free(&prog->mem);
-
-   FREE(prog->code);
+   if (prog->code)
+      FREE(prog->code); /* may be 0 for hardcoded shaders */
    FREE(prog->immd_data);
    FREE(prog->relocs);
    if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms)
@@ -807,5 +807,5 @@ nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
    for (i = 0; i < prog->cp.num_syms; ++i)
       if (syms[i].label == label)
          return prog->code_base + base + syms[i].offset;
-   return ~0;
+   return prog->code_base; /* no symbols or symbol not found */
 }
index d329148de6280cf2991aa9b1fe2262a7adb01308..5c4431e5af52ee1d4ed0b3397c43070c56b6a6fb 100644 (file)
@@ -26,6 +26,7 @@
 
 #include "nvc0_context.h"
 #include "nouveau/nv_object.xml.h"
+#include "nve4_compute.xml.h"
 
 #define NVC0_QUERY_STATE_READY   0
 #define NVC0_QUERY_STATE_ACTIVE  1
@@ -36,6 +37,7 @@ struct nvc0_query {
    uint32_t *data;
    uint16_t type;
    uint16_t index;
+   int8_t ctr[4];
    uint32_t sequence;
    struct nouveau_bo *bo;
    uint32_t base;
@@ -49,6 +51,11 @@ struct nvc0_query {
 
 #define NVC0_QUERY_ALLOC_SPACE 256
 
+static void nve4_mp_pm_query_begin(struct nvc0_context *, struct nvc0_query *);
+static void nve4_mp_pm_query_end(struct nvc0_context *, struct nvc0_query *);
+static boolean nve4_mp_pm_query_result(struct nvc0_context *,
+                                       struct nvc0_query *, void *, boolean);
+
 static INLINE struct nvc0_query *
 nvc0_query(struct pipe_query *pipe)
 {
@@ -132,6 +139,16 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type)
       space = 16;
       break;
    default:
+      if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS &&
+          nvc0->screen->base.device->drm_version >= 0x01000101) {
+         if (type >= NVE4_PM_QUERY(0) &&
+             type <= NVE4_PM_QUERY_MAX) {
+            /* 8 counters per MP + clock */
+            space = 12 * nvc0->screen->mp_count * sizeof(uint32_t);
+            break;
+         }
+      }
+      debug_printf("invalid query type: %u\n", type);
       FREE(q);
       return NULL;
    }
@@ -244,6 +261,8 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
       nvc0_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
       break;
    default:
+      if (q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_MAX)
+         nve4_mp_pm_query_begin(nvc0, q);
       break;
    }
    q->state = NVC0_QUERY_STATE_ACTIVE;
@@ -314,7 +333,8 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5));
       break;
    default:
-      assert(0);
+      if (q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_MAX)
+         nve4_mp_pm_query_end(nvc0, q);
       break;
    }
 }
@@ -343,6 +363,9 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    uint64_t *data64 = (uint64_t *)q->data;
    unsigned i;
 
+   if (q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_MAX)
+      return nve4_mp_pm_query_result(nvc0, q, result, wait);
+
    if (q->state != NVC0_QUERY_STATE_READY)
       nvc0_query_update(nvc0->screen->base.client, q);
 
@@ -399,6 +422,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       res32[0] = q->data[1];
       break;
    default:
+      assert(0); /* can't happen, we don't create queries with invalid type */
       return FALSE;
    }
 
@@ -513,6 +537,373 @@ nvc0_so_target_save_offset(struct pipe_context *pipe,
    nvc0_query_end(pipe, targ->pq);
 }
 
+
+/* === PERFORMANCE MONITORING COUNTERS === */
+
+/* Code to read out MP counters: They are accessible via mmio, too, but let's
+ * just avoid mapping registers in userspace. We'd have to know which MPs are
+ * enabled/present, too, and that information is not presently exposed.
+ * We could add a kernel interface for it, but reading the counters like this
+ * has the advantage of being async (if get_result isn't called immediately).
+ *
+ * Going by the disassembly in the comments below:
+ *  - c0[0] and c0[4] form the low/high halves of the output address and
+ *    c0[8] is copied to the output unchanged (presumably these are the three
+ *    input words nve4_mp_pm_query_end passes to launch_grid: buffer address
+ *    low/high and q->sequence).
+ *  - The record offset is 48 * (a bit field extracted from $physid), i.e.
+ *    one 48-byte (12-word) record per $physid-derived index:
+ *       +0:  $pm0 .. $pm3
+ *       +16: $pm4 .. $pm7
+ *       +32: $clock, $clockhi
+ *       +40: c0[8]
+ *       +44: $physid
+ *  - All stores are predicated on $p1, which is set when $tidx == 0.
+ */
+static const uint64_t nve4_read_mp_pm_counters_code[] =
+{
+   0x2042004270420047ULL, /* sched */
+   0x2800400000001de4ULL, /* mov b32 $r0 c0[0] (04) */
+   0x2c0000000c009c04ULL, /* mov b32 $r2 $physid (20) */
+   0x2800400010005de4ULL, /* mov b32 $r1 c0[4] (04) */
+   0x2c0000008400dc04ULL, /* mov b32 $r3 $tidx (27) */
+   0x7000c01050209c03ULL, /* ext u32 $r2 $r2 0x0414 (04) */
+   0x2c00000010011c04ULL, /* mov b32 $r4 $pm0 (20) */
+   0x190e0000fc33dc03ULL, /* set $p1 eq u32 $r3 0 (04) */
+   0x2280428042804277ULL, /* sched */
+   0x2c00000014015c04ULL, /* mov b32 $r5 $pm1 (27) */
+   0x10000000c0209c02ULL, /* mul $r2 u32 $r2 u32 48 (04) */
+   0x2c00000018019c04ULL, /* mov b32 $r6 $pm2 (28) */
+   0x4801000008001c03ULL, /* add b32 ($r0 $c) $r0 $r2 (04) */
+   0x2c0000001c01dc04ULL, /* mov b32 $r7 $pm3 (28) */
+   0x0800000000105c42ULL, /* add b32 $r1 $r1 0 $c (04) */
+   0x2c00000140009c04ULL, /* mov b32 $r2 $clock (28) */
+   0x2042804200420047ULL, /* sched */
+   0x94000000000107c5ULL, /* $p1 st b128 wt g[$r0d] $r4q (04) */
+   0x2c00000020011c04ULL, /* mov b32 $r4 $pm4 (20) */
+   0x2c00000024015c04ULL, /* mov b32 $r5 $pm5 (04) */
+   0x2c00000028019c04ULL, /* mov b32 $r6 $pm6 (20) */
+   0x2c0000002c01dc04ULL, /* mov b32 $r7 $pm7 (04) */
+   0x2c0000014400dc04ULL, /* mov b32 $r3 $clockhi (28) */
+   0x94000000400107c5ULL, /* $p1 st b128 wt g[$r0d+16] $r4q (04) */
+   0x200002e042804207ULL, /* sched */
+   0x2800400020011de4ULL, /* mov b32 $r4 c0[8] (20) */
+   0x2c0000000c015c04ULL, /* mov b32 $r5 $physid (04) */
+   0x94000000800087a5ULL, /* $p1 st b64 wt g[$r0d+32] $r2d (28) */
+   0x94000000a00107a5ULL, /* $p1 st b64 wt g[$r0d+40] $r4d (04) */
+   0x8000000000001de7ULL  /* exit (2e) */
+};
+
+/* NOTE: intentionally using the same names as NV
+ *
+ * nvc0_screen_get_driver_query_info indexes this table with the query id and
+ * reports NVE4_PM_QUERY(id) as the query type, so the entries must stay in
+ * the same order as the NVE4_PM_QUERY_* indices.
+ */
+static const char *nve4_pm_query_names[] =
+{
+   /* MP counters */
+   "prof_trigger_00",
+   "prof_trigger_01",
+   "prof_trigger_02",
+   "prof_trigger_03",
+   "prof_trigger_04",
+   "prof_trigger_05",
+   "prof_trigger_06",
+   "prof_trigger_07",
+   "warps_launched",
+   "threads_launched",
+   "sm_cta_launched",
+   "inst_issued1",
+   "inst_issued2",
+   "inst_executed",
+   "local_load",
+   "local_store",
+   "shared_load",
+   "shared_store",
+   "l1_local_load_hit",
+   "l1_local_load_miss",
+   "l1_local_store_hit",
+   "l1_local_store_miss",
+   "gld_request",
+   "gst_request",
+   "l1_global_load_hit",
+   "l1_global_load_miss",
+   "uncached_global_load_transaction",
+   "global_store_transaction",
+   "branch",
+   "divergent_branch",
+   "active_warps",
+   "active_cycles"
+};
+
+/* For simplicity, we will allocate as many group slots as we allocate counter
+ * slots. This means that a single counter which wants to source from 2 groups
+ * will have to be declared as using 2 counter slots. This shouldn't really be
+ * a problem because such queries don't make much sense ... (unless someone is
+ * really creative).
+ */
+/* Configuration of one hardware counter slot.
+ * The field order matters: the _Q1A/_Q1B table macros fill this struct with
+ * positional initializers.
+ * nve4_mp_pm_query_begin writes sig_sel to MP_PM_{A,B}_SIGSEL, src_sel (plus
+ * a per-slot offset) to MP_PM_SRCSEL and (func << 4) | mode to MP_PM_FUNC.
+ */
+struct nve4_mp_counter_cfg
+{
+   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
+   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
+   uint32_t pad     : 3;
+   uint32_t sig_dom : 1;  /* if 0, MP_PM_A, if 1, MP_PM_B */
+   uint32_t sig_sel : 8;  /* signal group */
+   uint32_t src_sel : 32; /* signal selection for up to 5 sources */
+};
+
+/* Configuration of one performance counter query: up to 4 hardware counters
+ * whose values nve4_mp_pm_query_result combines according to op.
+ */
+struct nve4_mp_pm_query_cfg
+{
+   struct nve4_mp_counter_cfg ctr[4];
+   uint8_t num_counters; /* number of valid entries in ctr[] */
+   uint8_t op; /* PIPE_LOGICOP_CLEAR(for ADD),OR,AND */
+};
+
+/* Table entry helpers for queries that use a single counter:
+ *   n: query name suffix, selects the table index NVE4_PM_QUERY_##n
+ *   f: func value
+ *   m: mode name suffix (NVE4_COMPUTE_MP_PM_FUNC_MODE_##m)
+ *   g: signal group name suffix (NVE4_COMPUTE_MP_PM_{A,B}_SIGSEL_##g)
+ *   s: src_sel value
+ * _Q1A selects signal domain 0 (MP_PM_A), _Q1B selects domain 1 (MP_PM_B).
+ * Both set num_counters to 1 and op to PIPE_LOGICOP_CLEAR (i.e. ADD).
+ */
+#define _Q1A(n, f, m, g, s) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, PIPE_LOGICOP_CLEAR }
+#define _Q1B(n, f, m, g, s) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, PIPE_LOGICOP_CLEAR }
+
+/* Indexed by (query type - PIPE_QUERY_DRIVER_SPECIFIC); designated
+ * initializers are used, so the textual order of the entries is irrelevant.
+ */
+static const struct nve4_mp_pm_query_cfg nve4_mp_pm_queries[] =
+{
+   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000),
+   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004),
+   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008),
+   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c),
+   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010),
+   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014),
+   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018),
+   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c),
+   _Q1A(LAUNCHED_WARPS,    0x0001, B6, LAUNCH, 0x00000004),
+   _Q1A(LAUNCHED_THREADS,  0x003f, B6, LAUNCH, 0x398a4188),
+   _Q1B(LAUNCHED_CTA,      0x0001, B6, WARP, 0x0000001c),
+   _Q1A(INST_ISSUED1,  0x0001, B6, ISSUE, 0x00000004),
+   _Q1A(INST_ISSUED2,  0x0001, B6, ISSUE, 0x00000008),
+   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC,  0x00000398),
+   _Q1A(LD_SHARED,   0x0001, B6, LDST, 0x00000000),
+   _Q1A(ST_SHARED,   0x0001, B6, LDST, 0x00000004),
+   _Q1A(LD_LOCAL,    0x0001, B6, LDST, 0x00000008),
+   _Q1A(ST_LOCAL,    0x0001, B6, LDST, 0x0000000c),
+   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010),
+   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014),
+   _Q1B(L1_LOCAL_LOAD_HIT,   0x0001, B6, L1, 0x00000000),
+   _Q1B(L1_LOCAL_LOAD_MISS,  0x0001, B6, L1, 0x00000004),
+   _Q1B(L1_LOCAL_STORE_HIT,  0x0001, B6, L1, 0x00000008),
+   _Q1B(L1_LOCAL_STORE_MISS, 0x0001, B6, L1, 0x0000000c),
+   _Q1B(L1_GLOBAL_LOAD_HIT,  0x0001, B6, L1, 0x00000010),
+   _Q1B(L1_GLOBAL_LOAD_MISS, 0x0001, B6, L1, 0x00000014),
+   _Q1B(GLD_TRANSACTIONS_UNCACHED, 0x0001, B6, MEM, 0x00000000),
+   _Q1B(GST_TRANSACTIONS,          0x0001, B6, MEM, 0x00000004),
+   _Q1A(BRANCH,           0x0001, B6, BRANCH, 0x0000000c),
+   _Q1A(BRANCH_DIVERGENT, 0x0001, B6, BRANCH, 0x00000010),
+   _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x398a4188),
+   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000004)
+};
+
+#undef _Q1A
+#undef _Q1B
+
+/* Begin an MP performance counter query: reserve a hardware counter slot for
+ * every counter in the query's configuration, then program and reset it.
+ *
+ * If the signal domains the query needs do not have enough free slots, an
+ * error is logged and nothing is reserved or programmed.
+ *
+ * Declared static to match the forward declaration at the top of the file.
+ */
+static void
+nve4_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const struct nve4_mp_pm_query_cfg *cfg;
+   unsigned i, c;
+   unsigned num_ab[2] = { 0, 0 };
+
+   cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
+   /* ctr[] has 4 entries; check before it is indexed below */
+   assert(cfg->num_counters <= 4);
+
+   /* check if we have enough free counter slots */
+   for (i = 0; i < cfg->num_counters; ++i)
+      num_ab[cfg->ctr[i].sig_dom]++;
+
+   if (screen->pm.num_mp_pm_active[0] + num_ab[0] > 4 ||
+       screen->pm.num_mp_pm_active[1] + num_ab[1] > 4) {
+      NOUVEAU_ERR("Not enough free MP counter slots !\n");
+      return;
+   }
+
+   /* worst case: 8 words per counter (4 counters), plus 2 words for each of
+    * the two per-domain enable methods and 2 for the global enable
+    */
+   PUSH_SPACE(push, 4 * 8 + 6);
+
+   /* one-time setup via a software method, done on first use only */
+   if (!screen->pm.mp_counters_enabled) {
+      screen->pm.mp_counters_enabled = TRUE;
+      BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
+      PUSH_DATA (push, 0x1fcb);
+   }
+
+   /* set sequence field to 0 (used to check if result is available);
+    * each MP owns a 12-word record and the sequence lives in word 10, which
+    * is the layout nve4_mp_pm_query_result reads back
+    */
+   for (i = 0; i < screen->mp_count; ++i)
+      q->data[i * 12 + 10] = 0;
+
+   for (i = 0; i < cfg->num_counters; ++i) {
+      const unsigned d = cfg->ctr[i].sig_dom;
+
+      /* first active counter of this domain: issue the software method that
+       * depends on which domains are in use
+       */
+      if (!screen->pm.num_mp_pm_active[d]) {
+         uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
+         if (screen->pm.num_mp_pm_active[!d])
+            m |= 1 << (7 + (8 * d));
+         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
+         PUSH_DATA (push, m);
+      }
+      screen->pm.num_mp_pm_active[d]++;
+
+      /* claim the first free slot of the domain (slots d*4 .. d*4+3) */
+      for (c = d * 4; c < (d * 4 + 4); ++c) {
+         if (!screen->pm.mp_counter[c]) {
+            q->ctr[i] = c;
+            screen->pm.mp_counter[c] = (struct pipe_query *)q;
+            break;
+         }
+      }
+      assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
+
+      /* configure and reset the counter(s) */
+      if (d == 0)
+         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
+      else
+         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
+      PUSH_DATA (push, cfg->ctr[i].sig_sel);
+      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
+      PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
+      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
+      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
+      PUSH_DATA (push, 0);
+   }
+}
+
+/* End an MP performance counter query: stop all active counters, release the
+ * slots owned by q, launch the hardcoded compute program that writes the
+ * counter values into q's buffer, then re-enable the counters that belong to
+ * other, still running queries.
+ */
+static void
+nve4_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+   struct nvc0_screen *screen = nvc0->screen;
+   struct pipe_context *pipe = &nvc0->base.pipe;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   uint32_t mask;
+   uint32_t input[3];
+   /* mp_count blocks of 32 threads each */
+   const uint block[3] = { 32, 1, 1 };
+   const uint grid[3] = { screen->mp_count, 1, 1 };
+   unsigned c;
+   const struct nve4_mp_pm_query_cfg *cfg;
+
+   cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
+
+   /* Lazily create the program object on first use. prog->code points at the
+    * static code array, not at heap memory.
+    * NOTE(review): the CALLOC_STRUCT result is not checked for NULL.
+    */
+   if (unlikely(!screen->pm.prog)) {
+      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
+      prog->type = PIPE_SHADER_COMPUTE;
+      prog->translated = TRUE;
+      prog->num_gprs = 8;
+      prog->code = (uint32_t *)nve4_read_mp_pm_counters_code;
+      prog->code_size = sizeof(nve4_read_mp_pm_counters_code);
+      prog->parm_size = 12;
+      screen->pm.prog = prog;
+   }
+
+   /* disable all counting */
+   PUSH_SPACE(push, 8);
+   for (c = 0; c < 8; ++c)
+      if (screen->pm.mp_counter[c])
+         IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
+   /* release counters for this query */
+   for (c = 0; c < 8; ++c) {
+      if (nvc0_query(screen->pm.mp_counter[c]) == q) {
+         screen->pm.num_mp_pm_active[c / 4]--;
+         screen->pm.mp_counter[c] = NULL;
+      }
+   }
+
+   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
+                q->bo);
+
+   /* program inputs: 64-bit buffer address (low, high) and the sequence */
+   pipe->bind_compute_state(pipe, screen->pm.prog);
+   input[0] = (q->bo->offset + q->base);
+   input[1] = (q->bo->offset + q->base) >> 32;
+   input[2] = q->sequence;
+   pipe->launch_grid(pipe, block, grid, 0, input);
+
+   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
+
+   /* re-activate other counters */
+   PUSH_SPACE(push, 16);
+   mask = 0;
+   for (c = 0; c < 8; ++c) {
+      unsigned i;
+      /* from here on q and cfg refer to the query owning slot c, not to the
+       * query that was passed in
+       */
+      q = nvc0_query(screen->pm.mp_counter[c]);
+      if (!q)
+         continue;
+      cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
+      for (i = 0; i < cfg->num_counters; ++i) {
+         /* mask records slots already reprogrammed; a query that owns
+          * several slots is visited once per slot
+          */
+         if (mask & (1 << q->ctr[i]))
+            break;
+         mask |= 1 << q->ctr[i];
+         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(q->ctr[i])), 1);
+         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+      }
+   }
+}
+
+/* Read back the result of an MP performance counter query.
+ *
+ * The counter values of all MPs are combined according to the query's op
+ * (ADD, OR or AND) and stored as a uint64_t in *result.
+ *
+ * Returns FALSE if a result is not available yet and wait is FALSE, or if
+ * waiting on the buffer fails; TRUE otherwise.
+ */
+static boolean
+nve4_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
+                        void *result, boolean wait)
+{
+   uint32_t count[4];
+   uint32_t identity;
+   uint64_t value;
+   unsigned p, c;
+   const struct nve4_mp_pm_query_cfg *cfg;
+
+   cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
+
+   /* Neutral element of the combining operation: AND needs all-ones, both
+    * as the initial accumulator and as padding for unused counter slots,
+    * otherwise the result is always 0. ADD and OR use 0.
+    */
+   identity = (cfg->op == PIPE_LOGICOP_AND) ? ~0u : 0;
+   value = identity;
+
+   for (p = 0; p < nvc0->screen->mp_count_compute; ++p) {
+      uint64_t clock;
+      const unsigned b = p * 12; /* 12 words per MP record */
+
+      clock = *(uint64_t *)&q->data[b + 8];
+      (void)clock; /* might be interesting one day */
+
+      /* word 10 holds the sequence the readback program wrote */
+      if (q->data[b + 10] != q->sequence) {
+         /* WARNING: This will spin forever if you loop with wait == FALSE and
+          * the push buffer hasn't been flushed !
+          */
+         if (!wait)
+            return FALSE;
+         if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
+            return FALSE;
+      }
+
+      /* q->ctr[] holds the hardware slot (0..7) of each counter, which is
+       * also its word index inside the record
+       */
+      for (c = 0; c < cfg->num_counters; ++c)
+         count[c] = q->data[b + q->ctr[c]];
+      for (; c < 4; ++c)
+         count[c] = identity;
+
+      switch (cfg->op) {
+      case PIPE_LOGICOP_AND:
+         value &= count[0] & count[1] & count[2] & count[3];
+         break;
+      case PIPE_LOGICOP_OR:
+         value |= count[0] | count[1] | count[2] | count[3];
+         break;
+      case PIPE_LOGICOP_CLEAR: /* abused as ADD */
+      default:
+         value += count[0] + count[1] + count[2] + count[3];
+         break;
+      }
+   }
+   *(uint64_t *)result = value;
+   return TRUE;
+}
+
+/* Enumerate the driver-specific queries.
+ *
+ * With info == NULL, returns the number of available queries. Otherwise
+ * fills *info for query number id and returns 1, or fills in a placeholder
+ * entry and returns 0 if there is no such query.
+ */
+int
+nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
+                                  unsigned id,
+                                  struct pipe_driver_query_info *info)
+{
+   struct nvc0_screen *screen = nvc0_screen(pscreen);
+   unsigned num_queries = 0;
+
+   /* MP counters need both an NVE4+ 3D class and a recent enough DRM */
+   if (screen->base.class_3d >= NVE4_3D_CLASS &&
+       screen->base.device->drm_version >= 0x01000101)
+      num_queries = NVE4_PM_QUERY_COUNT;
+
+   if (!info)
+      return num_queries;
+
+   if (id >= num_queries) {
+      /* user asked for info about non-existing query */
+      info->name = "this_is_not_the_query_you_are_looking_for";
+      info->query_type = 0xdeadd01d;
+      info->max_value = 0;
+      info->uses_byte_units = FALSE;
+      return 0;
+   }
+
+   info->name = nve4_pm_query_names[id];
+   info->query_type = NVE4_PM_QUERY(id);
+   info->max_value = ~0ULL;
+   info->uses_byte_units = FALSE;
+   return 1;
+}
+
 void
 nvc0_init_query_functions(struct nvc0_context *nvc0)
 {
index bf353c482514c8881149772c5c83f87a3ea94ac5..5b9385ad7243f298383b6612ff571d594ead4e9e 100644 (file)
@@ -352,6 +352,10 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
 
    if (screen->blitter)
       nvc0_blitter_destroy(screen);
+   if (screen->pm.prog) {
+      screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
+      nvc0_program_destroy(NULL, screen->pm.prog);
+   }
 
    nouveau_bo_ref(NULL, &screen->text);
    nouveau_bo_ref(NULL, &screen->uniform_bo);
@@ -581,6 +585,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    pscreen->get_param = nvc0_screen_get_param;
    pscreen->get_shader_param = nvc0_screen_get_shader_param;
    pscreen->get_paramf = nvc0_screen_get_paramf;
+   pscreen->get_driver_query_info = nvc0_screen_get_driver_query_info;
 
    nvc0_screen_init_resource_functions(pscreen);
 
@@ -785,6 +790,7 @@ nvc0_screen_create(struct nouveau_device *dev)
          value = (16 << 8) | 4;
    }
    screen->mp_count = value >> 8;
+   screen->mp_count_compute = screen->mp_count;
 
    nvc0_screen_resize_tls_area(screen, 128 * 16, 0, 0x200);
 
index 13dc83e7e8a6248761fc0bfffcea39001c8554a6..b7cfd05a2c090fac696199cf618b14cbf2dd23b9 100644 (file)
@@ -39,6 +39,7 @@ struct nvc0_screen {
    struct nouveau_bo *poly_cache;
 
    uint16_t mp_count;
+   uint16_t mp_count_compute; /* magic reg can make compute use fewer MPs */
 
    struct nouveau_heap *text_heap;
    struct nouveau_heap *lib_code; /* allocated from text_heap */
@@ -62,6 +63,13 @@ struct nvc0_screen {
       uint32_t *map;
    } fence;
 
+   struct {
+      struct nvc0_program *prog; /* compute state object to read MP counters */
+      struct pipe_query *mp_counter[8]; /* counter to query allocation */
+      uint8_t num_mp_pm_active[2];
+      boolean mp_counters_enabled;
+   } pm;
+
    struct nouveau_mman *mm_VRAM_fe0;
 
    struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */
@@ -76,6 +84,69 @@ nvc0_screen(struct pipe_screen *screen)
    return (struct nvc0_screen *)screen;
 }
 
+
+/* Performance counter queries:
+ * Query i has the type NVE4_PM_QUERY(i), i.e. the types start at
+ * PIPE_QUERY_DRIVER_SPECIFIC; NVE4_PM_QUERY_MAX is the last valid type.
+ */
+#define NVE4_PM_QUERY_COUNT  32
+#define NVE4_PM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + (i))
+#define NVE4_PM_QUERY_MAX    NVE4_PM_QUERY(NVE4_PM_QUERY_COUNT - 1)
+/* MP (NOTE: these are also used to index a table, so put them first) */
+#define NVE4_PM_QUERY_PROF_TRIGGER_0            0
+#define NVE4_PM_QUERY_PROF_TRIGGER_1            1
+#define NVE4_PM_QUERY_PROF_TRIGGER_2            2
+#define NVE4_PM_QUERY_PROF_TRIGGER_3            3
+#define NVE4_PM_QUERY_PROF_TRIGGER_4            4
+#define NVE4_PM_QUERY_PROF_TRIGGER_5            5
+#define NVE4_PM_QUERY_PROF_TRIGGER_6            6
+#define NVE4_PM_QUERY_PROF_TRIGGER_7            7
+#define NVE4_PM_QUERY_LAUNCHED_WARPS            8
+#define NVE4_PM_QUERY_LAUNCHED_THREADS          9
+#define NVE4_PM_QUERY_LAUNCHED_CTA              10
+#define NVE4_PM_QUERY_INST_ISSUED1              11
+#define NVE4_PM_QUERY_INST_ISSUED2              12
+#define NVE4_PM_QUERY_INST_EXECUTED             13
+#define NVE4_PM_QUERY_LD_LOCAL                  14
+#define NVE4_PM_QUERY_ST_LOCAL                  15
+#define NVE4_PM_QUERY_LD_SHARED                 16
+#define NVE4_PM_QUERY_ST_SHARED                 17
+#define NVE4_PM_QUERY_L1_LOCAL_LOAD_HIT         18
+#define NVE4_PM_QUERY_L1_LOCAL_LOAD_MISS        19
+#define NVE4_PM_QUERY_L1_LOCAL_STORE_HIT        20
+#define NVE4_PM_QUERY_L1_LOCAL_STORE_MISS       21
+#define NVE4_PM_QUERY_GLD_REQUEST               22
+#define NVE4_PM_QUERY_GST_REQUEST               23
+#define NVE4_PM_QUERY_L1_GLOBAL_LOAD_HIT        24
+#define NVE4_PM_QUERY_L1_GLOBAL_LOAD_MISS       25
+#define NVE4_PM_QUERY_GLD_TRANSACTIONS_UNCACHED 26
+#define NVE4_PM_QUERY_GST_TRANSACTIONS          27
+#define NVE4_PM_QUERY_BRANCH                    28
+#define NVE4_PM_QUERY_BRANCH_DIVERGENT          29
+#define NVE4_PM_QUERY_ACTIVE_WARPS              30
+#define NVE4_PM_QUERY_ACTIVE_CYCLES             31
+/* Engines (PCOUNTER) */
+/*
+#define NVE4_PM_QUERY_GR_IDLE                   50
+#define NVE4_PM_QUERY_BSP_IDLE                  51
+#define NVE4_PM_QUERY_VP_IDLE                   52
+#define NVE4_PM_QUERY_PPP_IDLE                  53
+#define NVE4_PM_QUERY_CE0_IDLE                  54
+#define NVE4_PM_QUERY_CE1_IDLE                  55
+#define NVE4_PM_QUERY_CE2_IDLE                  56
+*/
+/* L2 queries (PCOUNTER) */
+/*
+#define NVE4_PM_QUERY_L2_SUBP_WRITE_L1_SECTOR_QUERIES 57
+...
+*/
+/* TEX queries (PCOUNTER) */
+/*
+#define NVE4_PM_QUERY_TEX0_CACHE_SECTOR_QUERIES 58
+...
+*/
+
+int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned,
+                                      struct pipe_driver_query_info *);
+
 boolean nvc0_blitter_create(struct nvc0_screen *);
 void nvc0_blitter_destroy(struct nvc0_screen *);
 
index c13ebd5fb582d74cd04552ac083bbec091b1671c..25183a45f20940fcd80c1c6ab503faf1a1703f23 100644 (file)
@@ -65,6 +65,8 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 #define SUBC_COPY(m) 4, (m)
 #define NVE4_COPY(m) SUBC_COPY(NVE4_COPY_##n)
 
+#define SUBC_SW(m) 7, (m)
+
 static INLINE uint32_t
 NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size)
 {
index 943ae78b479c556e59c6723e95fb9b1c8132a047..89da7d568cbfbe18a7181b990ec762b42953d9f7 100644 (file)
@@ -27,7 +27,9 @@
 
 #include "nv50/codegen/nv50_ir_driver.h"
 
+#ifdef DEBUG
 static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *);
+#endif
 
 
 int
@@ -477,7 +479,10 @@ nve4_launch_grid(struct pipe_context *pipe,
       goto out;
 
    nve4_compute_setup_launch_desc(nvc0, desc, label, block_layout, grid_layout);
-   nve4_compute_dump_launch_desc(desc);
+#ifdef DEBUG
+   if (debug_get_num_option("NV50_PROG_DEBUG", 0))
+      nve4_compute_dump_launch_desc(desc);
+#endif
 
    nve4_compute_upload_input(nvc0, input, block_layout, grid_layout);
 
@@ -589,6 +594,7 @@ static const char *nve4_cache_split_name(unsigned value)
    }
 }
 
+#ifdef DEBUG
 static void
 nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
 {
@@ -635,7 +641,9 @@ nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
                    i, address, size, valid ? "" : "  (invalid)");
    }
 }
+#endif
 
+#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
 static void
 nve4_compute_trap_info(struct nvc0_context *nvc0)
 {
@@ -667,3 +675,4 @@ nve4_compute_trap_info(struct nvc0_context *nvc0)
    }
    info->lock = 0;
 }
+#endif
index 2f110f57657e1c8ad7016e2dbf435193174b1965..9a77466823963f9209440367115a8ddf93ac2c13 100644 (file)
@@ -199,6 +199,76 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define NVE4_COMPUTE_UNK260c                                   0x0000260c
 
+#define NVE4_COMPUTE_MP_PM_SET(i0)                            (0x0000335c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_SET__ESIZE                          0x00000004
+#define NVE4_COMPUTE_MP_PM_SET__LEN                            0x00000008
+
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL(i0)                               (0x0000337c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL__ESIZE                     0x00000004
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL__LEN                       0x00000004
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_NONE                       0x00000000
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_USER                       0x00000001
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_LAUNCH                     0x00000003
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_EXEC                       0x00000004
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_ISSUE                      0x00000005
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_LDST                       0x0000001b
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_BRANCH                     0x0000001c
+
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL(i0)                               (0x0000338c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL__ESIZE                     0x00000004
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL__LEN                       0x00000004
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_NONE                       0x00000000
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_WARP                       0x00000002
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_L1                         0x00000010
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_MEM                                0x00000011
+
+#define NVE4_COMPUTE_MP_PM_SRCSEL(i0)                         (0x0000339c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_SRCSEL__ESIZE                       0x00000004
+#define NVE4_COMPUTE_MP_PM_SRCSEL__LEN                         0x00000008
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP0__MASK                   0x00000003
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP0__SHIFT                  0
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG0__MASK                   0x0000001c
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG0__SHIFT                  2
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP1__MASK                   0x00000060
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP1__SHIFT                  5
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG1__MASK                   0x00000380
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG1__SHIFT                  7
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP2__MASK                   0x00000c00
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP2__SHIFT                  10
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG2__MASK                   0x00007000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG2__SHIFT                  12
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP3__MASK                   0x00018000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP3__SHIFT                  15
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG3__MASK                   0x000e0000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG3__SHIFT                  17
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP4__MASK                   0x00300000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP4__SHIFT                  20
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG4__MASK                   0x01c00000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG4__SHIFT                  22
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP5__MASK                   0x06000000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP5__SHIFT                  25
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG5__MASK                   0x38000000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG5__SHIFT                  27
+
+#define NVE4_COMPUTE_MP_PM_FUNC(i0)                           (0x000033bc + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_FUNC__ESIZE                         0x00000004
+#define NVE4_COMPUTE_MP_PM_FUNC__LEN                           0x00000008
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE__MASK                     0x0000000f
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE__SHIFT                    0
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP                     0x00000000
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_PULSE               0x00000001
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_B6                                0x00000002
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK3                      0x00000003
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_B6                  0x00000004
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_B6_PULSE            0x00000005
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK6                      0x00000006
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK7                      0x00000007
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK8                      0x00000008
+#define NVE4_COMPUTE_MP_PM_FUNC_FUNC__MASK                     0x000ffff0
+#define NVE4_COMPUTE_MP_PM_FUNC_FUNC__SHIFT                    4
+
+#define NVE4_COMPUTE_MP_PM_UNK33DC                             0x000033dc
+
 #define NVE4_COMPUTE_LAUNCH_DESC__SIZE                         0x00000100
 #define NVE4_COMPUTE_LAUNCH_DESC_6                             0x00000018
 #define NVE4_COMPUTE_LAUNCH_DESC_6_NOTIFY__MASK                        0x00000c00