freedreno/computerator: add performance counter support
author Rob Clark <robdclark@chromium.org>
Sun, 8 Mar 2020 23:42:23 +0000 (16:42 -0700)
committer Marge Bot <eric+marge@anholt.net>
Tue, 10 Mar 2020 16:52:02 +0000 (16:52 +0000)
Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4119>

src/freedreno/computerator/a6xx.c
src/freedreno/computerator/main.c
src/freedreno/computerator/main.h
src/freedreno/computerator/meson.build

index c9960d66a8dde7f2882bbd2fe628fa2a5c274676..df09116657c2061cf8a6ea2040fbc894094e8ab2 100644 (file)
@@ -40,9 +40,17 @@ struct a6xx_backend {
 
        unsigned seqno;
        struct fd_bo *control_mem;
+
+       struct fd_bo *query_mem;
+       const struct perfcntr *perfcntrs;
+       unsigned num_perfcntrs;
 };
 define_cast(backend, a6xx_backend);
 
+/*
+ * Data structures shared with GPU:
+ */
+
 /* This struct defines the layout of the fd6_context::control buffer: */
 struct fd6_control {
        uint32_t seqno;          /* seqno for async CP_EVENT_WRITE, etc */
@@ -65,6 +73,26 @@ struct fd6_control {
 #define control_ptr(a6xx_backend, member)  \
        (a6xx_backend)->control_mem, offsetof(struct fd6_control, member), 0, 0
 
+
+/* One entry per requested performance counter in the query_mem buffer.
+ * The emitted command stream snapshots the counter register into 'start'
+ * before the grid is dispatched and into 'stop' after it, and then
+ * updates 'result' as result += stop - start.
+ */
+struct PACKED fd6_query_sample {
+       uint64_t start;
+       uint64_t result;
+       uint64_t stop;
+};
+
+
+/* Expands to a comma separated argument list addressing a single field of
+ * entry 'idx' in the backend's array of fd6_query_sample: the query_mem
+ * bo, the byte offset of the field, and two trailing zeros (presumably the
+ * or/shift arguments of OUT_RELOC()/OUT_RELOCW() -- confirm against those
+ * macros).  'idx' is parenthesised so that expression arguments such as
+ * "i + 1" scale correctly.
+ */
+#define query_sample_idx(a6xx_backend, idx, field)    \
+       (a6xx_backend)->query_mem,                        \
+       ((idx) * sizeof(struct fd6_query_sample)) +       \
+       offsetof(struct fd6_query_sample, field),         \
+       0, 0
+
+
+/*
+ * Backend implementation:
+ */
+
 static struct kernel *
 a6xx_assemble(struct backend *b, FILE *in)
 {
@@ -307,6 +335,8 @@ cache_flush(struct fd_ringbuffer *ring, struct kernel *kernel)
 static void
 a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3], struct fd_submit *submit)
 {
+       struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
+       struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
        struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(submit, 0,
                        FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
 
@@ -344,6 +374,34 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3], struct fd_submit *submit
        OUT_RING(ring, 1);            /* HLSQ_CS_KERNEL_GROUP_Y */
        OUT_RING(ring, 1);            /* HLSQ_CS_KERNEL_GROUP_Z */
 
+       if (a6xx_backend->num_perfcntrs > 0) {
+               a6xx_backend->query_mem = fd_bo_new(a6xx_backend->dev,
+                       a6xx_backend->num_perfcntrs * sizeof(struct fd6_query_sample),
+                       DRM_FREEDRENO_GEM_TYPE_KMEM, "query");
+
+               /* configure the performance counters to count the requested
+                * countables:
+                */
+               for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
+                       const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
+
+                       OUT_PKT4(ring, counter->select_reg, 1);
+                       OUT_RING(ring, counter->selector);
+               }
+
+               OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
+
+               /* and snapshot the start values: */
+               for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
+                       const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
+
+                       OUT_PKT7(ring, CP_REG_TO_MEM, 3);
+                       OUT_RING(ring, CP_REG_TO_MEM_0_64B |
+                               CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
+                       OUT_RELOCW(ring, query_sample_idx(a6xx_backend, i, start));
+               }
+       }
+
        OUT_PKT7(ring, CP_EXEC_CS, 4);
        OUT_RING(ring, 0x00000000);
        OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(grid[0]));
@@ -352,9 +410,56 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3], struct fd_submit *submit
 
        OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
 
+       if (a6xx_backend->num_perfcntrs > 0) {
+               /* snapshot the end values: */
+               for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
+                       const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
+
+                       OUT_PKT7(ring, CP_REG_TO_MEM, 3);
+                       OUT_RING(ring, CP_REG_TO_MEM_0_64B |
+                               CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
+                       OUT_RELOCW(ring, query_sample_idx(a6xx_backend, i, stop));
+               }
+
+               /* and compute the result: */
+               for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
+                       /* result += stop - start: */
+                       OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
+                       OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
+                                       CP_MEM_TO_MEM_0_NEG_C);
+                       OUT_RELOCW(ring, query_sample_idx(a6xx_backend, i, result));     /* dst */
+                       OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, result));      /* srcA */
+                       OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, stop));        /* srcB */
+                       OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, start));       /* srcC */
+               }
+       }
+
        cache_flush(ring, kernel);
 }
 
+/* Backend hook: remember which performance counters should be sampled.
+ * Only the pointer is stored, the array itself is not copied, so it has
+ * to stay valid for as long as the backend reads it.
+ */
+static void
+a6xx_set_perfcntrs(struct backend *b, const struct perfcntr *perfcntrs,
+               unsigned num_perfcntrs)
+{
+       struct a6xx_backend *priv = to_a6xx_backend(b);
+
+       priv->num_perfcntrs = num_perfcntrs;
+       priv->perfcntrs = perfcntrs;
+}
+
+/* Backend hook: prepare the query buffer for CPU reads, map it, and copy
+ * the 'result' field of every sample into results[].  The caller provides
+ * room for one value per configured counter.
+ */
+static void
+a6xx_read_perfcntrs(struct backend *b, uint64_t *results)
+{
+       struct a6xx_backend *priv = to_a6xx_backend(b);
+       struct fd_bo *bo = priv->query_mem;
+
+       fd_bo_cpu_prep(bo, NULL, DRM_FREEDRENO_PREP_READ);
+       struct fd6_query_sample *sample = fd_bo_map(bo);
+
+       /* walk the sample array and the output array in lock-step: */
+       for (unsigned n = 0; n < priv->num_perfcntrs; n++, sample++) {
+               results[n] = sample->result;
+       }
+}
+
 struct backend *
 a6xx_init(struct fd_device *dev, uint32_t gpu_id)
 {
@@ -364,6 +469,8 @@ a6xx_init(struct fd_device *dev, uint32_t gpu_id)
                .assemble = a6xx_assemble,
                .disassemble = a6xx_disassemble,
                .emit_grid = a6xx_emit_grid,
+               .set_perfcntrs = a6xx_set_perfcntrs,
+               .read_perfcntrs = a6xx_read_perfcntrs,
        };
 
        a6xx_backend->compiler = ir3_compiler_create(dev, gpu_id);
index 691fd5a8a4ac9e613908ad8547e529359ba80d0c..618812841e4c4d016cdf300f48ef7f48e175c893 100644 (file)
  */
 
 #include <getopt.h>
+#include <inttypes.h>
+#include <locale.h>
 #include <xf86drm.h>
 
 #include "util/u_math.h"
 
+#include "perfcntrs/freedreno_perfcntr.h"
+
 #include "main.h"
 
 
@@ -91,13 +95,14 @@ dump_hex(void *buf, int sz)
        }
 }
 
-static const char *shortopts = "df:g:h";
+static const char *shortopts = "df:g:hp:";
 
 static const struct option longopts[] = {
        {"disasm",   no_argument,       0, 'd'},
        {"file",     required_argument, 0, 'f'},
        {"groups",   required_argument, 0, 'g'},
        {"help",     no_argument,       0, 'h'},
+       {"perfcntr", required_argument, 0, 'p'},
        {0, 0, 0, 0}
 };
 
@@ -111,18 +116,101 @@ usage(const char *name)
                "    -f, --file=FILE          read shader from file (instead of stdin)\n"
                "    -g, --groups=X,Y,Z       use specified group size\n"
                "    -h, --help               show this message\n"
+               "    -p, --perfcntr=LIST      sample specified performance counters (comma\n"
+               "                             separated list)\n"
                ,
                name);
 }
 
+/* performance counter description, as returned by fd_perfcntrs() for the
+ * GPU in use (filled in by parse_perfcntrs()):
+ */
+static unsigned num_groups;
+static const struct fd_perfcntr_group *groups;
+
+/* Track enabled counters per group: array of num_groups entries, indexed
+ * like groups[], holding how many counters of that group have already been
+ * handed out by setup_counter():
+ */
+static unsigned *enabled_counters;
+
+/* Find the countable called 'name' in the perfcntr groups and fill in *c
+ * with everything the backend needs to sample it.
+ *
+ * The first group containing a countable with a matching name is used, and
+ * the next unused counter of that group is allocated for it.  Does not
+ * return on failure: exits via errx() if that group has no free counter
+ * left or if no group knows the countable.
+ *
+ * c->name keeps pointing at 'name', so the string must outlive *c.
+ */
+static void
+setup_counter(const char *name, struct perfcntr *c)
+{
+       /* unsigned indices: num_groups is unsigned, avoid mixed-sign compares */
+       for (unsigned i = 0; i < num_groups; i++) {
+               const struct fd_perfcntr_group *group = &groups[i];
+
+               for (unsigned j = 0; j < group->num_countables; j++) {
+                       const struct fd_perfcntr_countable *countable = &group->countables[j];
+
+                       if (strcmp(name, countable->name) != 0)
+                               continue;
+
+                       /*
+                        * Allocate a counter to use to monitor the requested countable:
+                        */
+                       if (enabled_counters[i] >= group->num_counters) {
+                               errx(-1, "Too many counters selected in group: %s", group->name);
+                       }
+
+                       const struct fd_perfcntr_counter *counter =
+                               &group->counters[enabled_counters[i]++];
+
+                       /*
+                        * And initialize the perfcntr struct, pulling together the info
+                        * about selected counter and countable, to simplify life for the
+                        * backend:
+                        */
+                       c->name           = name;
+                       c->select_reg     = counter->select_reg;
+                       c->counter_reg_lo = counter->counter_reg_lo;
+                       c->counter_reg_hi = counter->counter_reg_hi;
+                       c->selector       = countable->selector;
+
+                       return;
+               }
+       }
+
+       errx(-1, "could not find countable: %s", name);
+}
+
+/* Parse the comma separated list of countable names in 'perfcntrstr' and
+ * allocate a counter for each of them via setup_counter().
+ *
+ * Returns a heap allocated array with one entry per name, in the order
+ * given, and stores the number of entries in *num_perfcntrs.  Every
+ * segment of the list is looked up, including empty ones (e.g. after a
+ * trailing comma), so a malformed list ends the program in
+ * setup_counter().  Allocation failures also end the program via errx().
+ *
+ * The private copy of the string is intentionally never freed: the 'name'
+ * of each returned perfcntr points into it.
+ */
+static struct perfcntr *
+parse_perfcntrs(uint32_t gpu_id, const char *perfcntrstr, unsigned *num_perfcntrs)
+{
+       struct perfcntr *counters = NULL;
+       unsigned cnt = 0;
+
+       groups = fd_perfcntrs(gpu_id, &num_groups);
+       enabled_counters = calloc(num_groups, sizeof(enabled_counters[0]));
+       /* calloc(0, ..) may legitimately return NULL, so only treat NULL as
+        * a failure when there is something to allocate:
+        */
+       if (!enabled_counters && num_groups > 0) {
+               errx(-1, "out of memory");
+       }
+
+       char *cnames = strdup(perfcntrstr);
+       if (!cnames) {
+               errx(-1, "out of memory");
+       }
+
+       for (char *name = cnames; name; ) {
+               /* terminate this name and remember where the next one starts
+                * (NULL after the last name, which ends the loop):
+                */
+               char *next = strchr(name, ',');
+               if (next) {
+                       *next++ = '\0';
+               }
+
+               /* grow through a temporary so a failed realloc is detected
+                * instead of dereferenced:
+                */
+               struct perfcntr *grown = realloc(counters, (cnt + 1) * sizeof(counters[0]));
+               if (!grown) {
+                       errx(-1, "out of memory");
+               }
+               counters = grown;
+
+               setup_counter(name, &counters[cnt++]);
+
+               name = next;
+       }
+
+       *num_perfcntrs = cnt;
+
+       return counters;
+}
+
 int
 main(int argc, char **argv)
 {
        FILE *in = stdin;
+       const char *perfcntrstr = NULL;
+       struct perfcntr *perfcntrs = NULL;
+       unsigned num_perfcntrs = 0;
        bool disasm = false;
        uint32_t grid[3] = {0};
        int opt, ret;
 
+       setlocale(LC_NUMERIC, "en_US.UTF-8");
+
        while ((opt = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
                switch (opt) {
                case 'd':
@@ -140,6 +228,9 @@ main(int argc, char **argv)
                        break;
                case 'h':
                        goto usage;
+               case 'p':
+                       perfcntrstr = optarg;
+                       break;
                default:
                        printf("unrecognized arg: %c\n", opt);
                        goto usage;
@@ -185,6 +276,14 @@ main(int argc, char **argv)
 
        struct fd_submit *submit = fd_submit_new(pipe);
 
+       if (perfcntrstr) {
+               if (!backend->set_perfcntrs) {
+                       err(1, "performance counters not supported");
+               }
+               perfcntrs = parse_perfcntrs(gpu_id, perfcntrstr, &num_perfcntrs);
+               backend->set_perfcntrs(backend, perfcntrs, num_perfcntrs);
+       }
+
        backend->emit_grid(kernel, grid, submit);
 
        fd_submit_flush(submit, -1, NULL, NULL);
@@ -198,6 +297,15 @@ main(int argc, char **argv)
                dump_float(map, kernel->buf_sizes[i] * 4);
        }
 
+       if (perfcntrstr) {
+               uint64_t results[num_perfcntrs];
+               backend->read_perfcntrs(backend, results);
+
+               for (unsigned i = 0; i < num_perfcntrs; i++) {
+                       printf("%s:\t%'"PRIu64"\n", perfcntrs[i].name, results[i]);
+               }
+       }
+
        return 0;
 
 usage:
index 9e9325aa7f694eea78962bc86301d83cfefc6f5a..57b1ac07cb6f250642f1b5a4c8f1aa42b9b80231 100644 (file)
@@ -46,12 +46,31 @@ struct kernel {
        struct fd_bo *bufs[MAX_BUFS];
 };
 
+/* One requested performance counter: the countable to measure paired
+ * with the counter that was allocated to measure it.  Filled in by
+ * setup_counter() in main.c and consumed by the backend.
+ */
+struct perfcntr {
+       /* countable name, printed alongside the result: */
+       const char *name;
+
+       /* for backend to configure/read the counter, describes
+        * the selected counter:
+        */
+       unsigned select_reg;      /* register the selector is written to */
+       unsigned counter_reg_lo;  /* register the counter value is read from */
+       unsigned counter_reg_hi;  /* copied from the counter description */
+       /* and selected countable (value written to select_reg):
+        */
+       unsigned selector;
+};
+
 /* per-generation entry-points: */
 struct backend {
        struct kernel *(*assemble)(struct backend *b, FILE *in);
        void (*disassemble)(struct kernel *kernel, FILE *out);
        void (*emit_grid)(struct kernel *kernel, uint32_t grid[3],
                        struct fd_submit *submit);
+
+       /* performance-counter API.  set_perfcntrs may be left NULL by a
+        * backend without counter support (main() checks it before use);
+        * read_perfcntrs fills results[] with one value per counter that
+        * was passed to set_perfcntrs:
+        */
+       void (*set_perfcntrs)(struct backend *b, const struct perfcntr *perfcntrs,
+                       unsigned num_perfcntrs);
+       void (*read_perfcntrs)(struct backend *b, uint64_t *results);
 };
 
 #define define_cast(_from, _to)        \
index 68a58efeaa674f67ea09a249a78b6a5b39707938..80e6f66a772afdda01b92e3eae89977971806ef4 100644 (file)
@@ -56,6 +56,7 @@ computerator = executable(
   link_with : [
     libfreedreno_drm,
     libfreedreno_ir3,
+    libfreedreno_perfcntrs,
   ],
   dependencies : [
     dep_libdrm,