nvc0: add compute invocation counter
author Rhys Perry <pendingchaos02@gmail.com>
Tue, 26 Jun 2018 23:04:41 +0000 (00:04 +0100)
committer Ilia Mirkin <imirkin@alum.mit.edu>
Thu, 7 Feb 2019 00:35:57 +0000 (19:35 -0500)
The strategy is to keep a CPU-side counter of the direct invocations,
and a GPU-side counter of the indirect invocations, and then add them
together for queries.

The specific technique is a macro which multiplies a list of integers
together and accumulates the product into SCRATCH registers held inside
of the context. Another macro will read those values out and add them to
the passed-in cpu-side counter to be stored in a query buffer the same
way that all the other statistics are stored.

Original implementation by Rhys Perry, redone by Ilia Mirkin to use the
SCRATCH temporaries.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
src/gallium/drivers/nouveau/nvc0/nvc0_context.h
src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
src/gallium/drivers/nouveau/nvc0/nve4_compute.c

index 38c2e8684318de54a81959ed8f2dd76eabf674af..d6af8221b655f5171ea18acb7d7f2f4de9844c1e 100644 (file)
@@ -580,3 +580,93 @@ crs_loop:
    /* Enable */
    exit maddr 0x1452 /* CONSERVATIVE_RASTER */
    send 0x1
+
+/* NVC0_3D_MACRO_COMPUTE_COUNTER
+ *
+ * This macro takes 6 values, num_groups_* and group_size_*, and adds their
+ * product to the current value
+ *
+ * It's used for keeping track of the number of executed indirect
+ * compute invocations for statistics.
+ *
+ * SCRATCH[4] = current counter [low]
+ * SCRATCH[5] = current counter [high]
+ *
+ * arg     = number of parameters to multiply together, ideally 6
+ * parm[0] = num_groups_x
+ * parm[1] = num_groups_y
+ * parm[2] = num_groups_z
+ * parm[3] = group_size_x
+ * parm[4] = group_size_y
+ * parm[5] = group_size_z
+ */
+.section #mme9097_compute_counter
+   mov $r7 $r1 /* loop counter; $r1 (presumably the macro arg - confirm) is reused below */
+   mov $r1 1 /* low result */
+   mov $r2 0 /* high result */
+iic_loop_start:
+   parm $r3 /* val, next integer to multiply in */
+   /* multiplication start - look at low bit, add if set, shift right/left */
+   mov $r4 0 /* low temp */
+   mov $r5 0 /* high temp */
+iic_mul_start: /* temp = result * val */
+   braz annul $r3 #iic_mul_done /* no bits left in val - multiplication finished */
+iic_mul_body:
+   mov $r6 (extrinsrt 0x0 $r3 0 1 0) /* val & 1 - check low bit */
+   braz $r6 #iic_mul_cont /* bit not set */
+   /* NOTE(review): this shift has to run whether or not the branch above is
+    * taken, otherwise val would never reach zero - confirm that the
+    * instruction following a non-annul branch always executes */
+   mov $r3 (extrinsrt 0x0 $r3 1 31 0) /* val >>= 1 - shift right */

+   mov $r4 (add $r4 $r1) /* temp += result */
+   mov $r5 (adc $r5 $r2) /* high half; presumably adds the carry of the low add - confirm */
+iic_mul_cont:
+   mov $r1 (add $r1 $r1) /* shift left, part 1 (result *= 2) */
+   bra #iic_mul_start
+   mov $r2 (adc $r2 $r2) /* shift left, part 2 */
+iic_mul_done:
+   /* decrease loop counter, keep going if necessary */
+   mov $r7 (add $r7 -1)
+   /* result = temp ( = result * val ) */
+   mov $r1 $r4
+   branz $r7 #iic_loop_start
+   mov $r2 $r5 /* high half of result = temp; placed after the branch like the shifts above */

+   /* increment current value by newly-calculated invocation count */
+   read $r3 0xd04 /* SCRATCH[4] */
+   read $r4 0xd05 /* SCRATCH[5] */
+   maddr 0x1d04 /* SCRATCH[4] */
+   exit send (add $r3 $r1) /* new counter low */
+   send (adc $r4 $r2) /* new counter high; presumably lands in SCRATCH[5] - confirm */
+
+/* NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY
+ *
+ * This macro writes out the indirect counter plus a direct value to
+ * the given address using QUERY_GET (64-bit value).
+ *
+ * arg     = direct counter low
+ * parm[0] = direct counter high
+ * parm[1] = query address high
+ * parm[2] = query address low
+ */
+.section #mme9097_compute_counter_to_query
+   parm $r2 /* counter high */
+   read $r3 0xd04 /* SCRATCH[4] */
+   read $r4 0xd05 /* SCRATCH[5] */
+   mov $r1 (add $r1 $r3) /* sum low; $r1 presumably arrives holding the direct counter low (arg) - confirm */
+   mov $r2 (adc $r2 $r4) /* sum high */

+   parm $r3 maddr 0x16c0 /* QUERY_ADDRESS_HIGH */
+   parm $r4 send $r3
+   send $r4 /* r3 = addr high, r4 = addr low */
+   send $r1 /* sum low */
+   mov $r5 0x1000
+   send (extrinsrt 0x0 $r5 0x0 0x10 0x10) /* GET_SHORT */
+
+   /* add 4 to the address */
+   mov $r1 0x4 /* sum low has already been sent, so $r1 is free to hold the constant */
+   mov $r4 (add $r4 $r1) /* addr low */
+   mov $r3 (adc $r3 0x0) /* addr high */
+   maddr 0x16c0 /* QUERY_ADDRESS_HIGH */
+   send $r3 /* addr high */
+   send $r4 /* addr low */
+   exit send $r2 /* sum high */
+   send (extrinsrt 0x0 $r5 0x0 0x10 0x10) /* GET_SHORT */
index 49c08911142e94a84174eefe4d290ece73569803..f068367c84e528ec121b71ca3f11d9bc5a5f7a4d 100644 (file)
@@ -394,3 +394,57 @@ uint32_t mme9097_conservative_raster_state[] = {
        0x051480a1,
        0x00004041,
 };
+
+uint32_t mme9097_compute_counter[] = {
+       0x00000f11,
+       0x00004111,
+       0x00000211,
+/* 0x0003: iic_loop_start */
+       0x00000301,
+       0x00000411,
+       0x00000511,
+/* 0x0006: iic_mul_start */
+       0x00025827,
+/* 0x0007: iic_mul_body */
+       0x0040c612,
+       0x00013007,
+       0x07c2c312,
+       0x00006410,
+       0x0002ad10,
+/* 0x000c: iic_mul_cont */
+       0x00004910,
+       0xfffe4007,
+       0x00029210,
+/* 0x000f: iic_mul_done */
+       0xffffff11,
+       0x00002111,
+       0xfffcb817,
+       0x00002a11,
+       0x03410315,
+       0x03414415,
+       0x07410021,
+       0x000058c0,
+       0x0002a040,
+};
+
+/* Encoded words for the #mme9097_compute_counter_to_query section of
+ * com9097.mme: 19 words, matching that section's 19 instructions.
+ * NOTE(review): presumably assembler output - regenerate from the .mme
+ * source rather than editing by hand; confirm against the build rules. */
+uint32_t mme9097_compute_counter_to_query[] = {
+       0x00000201,
+       0x03410315,
+       0x03414415,
+       0x0000c910,
+       0x00031210,
+       0x05b00351,
+       0x00001c31,
+       0x00002041,
+       0x00000841,
+       0x04000511,
+       0x84014042,
+       0x00010111,
+       0x00006410,
+       0x00021b10,
+       0x05b00021,
+       0x00001841,
+       0x00002041,
+       0x000010c1,
+       0x84014042,
+};
index 28e1636732646813c93816f2a2f85683d67ac437..3ab2f5e3d7f147f42e64a1298848ca944ccad090 100644 (file)
@@ -500,4 +500,36 @@ nvc0_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF);
    nvc0->dirty_cp |= NVC0_NEW_CP_SURFACES;
    nvc0->images_dirty[5] |= nvc0->images_valid[5];
+
+   nvc0_update_compute_invocations_counter(nvc0, info);
+}
+
+/* Queue a COMPUTE_COUNTER macro call for an indirect dispatch.
+ *
+ * The macro receives the three block dimensions from 'info' inline, followed
+ * by 3 * 4 bytes taken from the indirect buffer at 'offset' (presumably the
+ * three grid dimensions - TODO confirm the indirect buffer layout).
+ *
+ * The macro's header comment lists num_groups_* before group_size_*, the
+ * reverse of the order used here; the macro only multiplies its parameters
+ * together, so the order does not affect the result.
+ */
+static void
+nvc0_compute_update_indirect_invocations(struct nvc0_context *nvc0,
+                                         const struct pipe_grid_info *info) {
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nv04_resource *res = nv04_resource(info->indirect);
+   uint32_t offset = res->offset + info->indirect_offset;
+
+   nouveau_pushbuf_space(push, 16, 0, 8);
+   PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+   /* 7 = macro argument + 3 inline block dims + 3 dwords from the buffer */
+   BEGIN_1IC0(push, NVC0_3D(MACRO_COMPUTE_COUNTER), 7);
+   PUSH_DATA(push, 6); /* macro arg: number of values to multiply */
+   PUSH_DATA(push, info->block[0]);
+   PUSH_DATA(push, info->block[1]);
+   PUSH_DATA(push, info->block[2]);
+   /* remaining three parameters come straight from the indirect buffer */
+   nouveau_pushbuf_data(push, res->bo, offset,
+                        NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
+}
+
+/* Account for the compute invocations launched by the dispatch in 'info'.
+ *
+ * Indirect dispatches are accumulated by the GPU-side counter macro; direct
+ * dispatches are added to the CPU-side nvc0->compute_invocations counter.
+ * The two are summed when a query is written.
+ */
+void
+nvc0_update_compute_invocations_counter(struct nvc0_context *nvc0,
+                                        const struct pipe_grid_info *info) {
+   if (unlikely(info->indirect)) {
+      nvc0_compute_update_indirect_invocations(nvc0, info);
+   } else {
+      /* Widen before multiplying: the dimensions are narrower than the
+       * 64-bit counter (32-bit in pipe_grid_info), so a product computed in
+       * their own type would wrap before it is stored.
+       */
+      uint64_t invocations = (uint64_t)info->block[0] * info->block[1] *
+                             info->block[2];
+      invocations *= (uint64_t)info->grid[0] * info->grid[1] * info->grid[2];
+      nvc0->compute_invocations += invocations;
+   }
 }
index c13510626760f7262d7522aadfd9032212359472..4cfd207d4c0473b13a52c8522f890723909d9ab0 100644 (file)
@@ -282,6 +282,8 @@ struct nvc0_context {
    uint16_t images_valid[6];
 
    struct util_dynarray global_residents;
+
+   uint64_t compute_invocations;
 };
 
 static inline struct nvc0_context *
@@ -442,5 +444,7 @@ void nve4_launch_grid(struct pipe_context *, const struct pipe_grid_info *);
 /* nvc0_compute.c */
 void nvc0_launch_grid(struct pipe_context *, const struct pipe_grid_info *);
 void nvc0_compute_validate_globals(struct nvc0_context *);
+void nvc0_update_compute_invocations_counter(struct nvc0_context *nvc0,
+                                             const struct pipe_grid_info *info);
 
 #endif
index 7aa06337950ab73e653c9e842381dca13f7c57c3..f4842fd6d68b3743d1422e3b02d2373108a9dc3a 100644 (file)
@@ -39,4 +39,8 @@
 
 #define NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE                        0x00003868
 
+#define NVC0_3D_MACRO_COMPUTE_COUNTER                          0x00003870
+
+#define NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY                 0x00003878
+
 #endif /* __NVC0_MACROS_H__ */
index f6d5d0f560206c68009bf681bea02b5756eda946..b6a214ccd490486776e68cbb4ed7728b8e236ca9 100644 (file)
@@ -122,6 +122,22 @@ nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q)
    FREE(hq);
 }
 
+/* Queue a COMPUTE_COUNTER_TO_QUERY macro call that stores the compute
+ * invocation count at byte 'offset' of this query's area in hq->bo.
+ *
+ * Values are pushed in the order documented for the macro: CPU-side counter
+ * low (the macro argument), counter high, destination address high,
+ * destination address low.
+ */
+static void
+nvc0_hw_query_write_compute_invocations(struct nvc0_context *nvc0,
+                                        struct nvc0_hw_query *hq,
+                                        uint32_t offset)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+   nouveau_pushbuf_space(push, 16, 0, 8);
+   PUSH_REFN(push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+   /* 4 = macro argument + 3 parameters */
+   BEGIN_1IC0(push, NVC0_3D(MACRO_COMPUTE_COUNTER_TO_QUERY), 4);
+   PUSH_DATA (push, nvc0->compute_invocations);
+   PUSH_DATAh(push, nvc0->compute_invocations);
+   PUSH_DATAh(push, hq->bo->offset + hq->offset + offset);
+   PUSH_DATA (push, hq->bo->offset + hq->offset + offset);
+}
+
 static boolean
 nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
 {
@@ -198,7 +214,7 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
       nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
       nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
       nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
-      ((uint64_t *)hq->data)[(12 + 10) * 2] = 0;
+      nvc0_hw_query_write_compute_invocations(nvc0, hq, 0xc0 + 0xa0);
       break;
    default:
       break;
@@ -271,7 +287,7 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
       nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
       nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
       nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
-      ((uint64_t *)hq->data)[10 * 2] = 0;
+      nvc0_hw_query_write_compute_invocations(nvc0, hq, 0xa0);
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
       /* This query is not issued on GPU because disjoint is forced to false */
@@ -354,9 +370,8 @@ nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
       res64[0] = data64[1] - data64[3];
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
-      for (i = 0; i < 10; ++i)
+      for (i = 0; i < 11; ++i)
          res64[i] = data64[i * 2] - data64[24 + i * 2];
-      result->pipeline_statistics.cs_invocations = 0;
       break;
    case NVC0_HW_QUERY_TFB_BUFFER_OFFSET:
       res32[0] = hq->data[1];
index 216fba49d9e659c5c34425bbddb374840707d8be..6a79fd9a9032c552a72b5871dda47e66ff352218 100644 (file)
@@ -1308,6 +1308,8 @@ nvc0_screen_create(struct nouveau_device *dev)
    MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count);
    MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mme9097_query_buffer_write);
    MK_MACRO(NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE, mme9097_conservative_raster_state);
+   MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER, mme9097_compute_counter);
+   MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY, mme9097_compute_counter_to_query);
    MK_MACRO(NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT, mme90c0_launch_grid_indirect);
 
    BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1);
index fcd7d9537f98af204d15c2c258860ba83c1ff65d..c5e4dec20bd63910f37d1f4f4f89879778c74880 100644 (file)
@@ -779,6 +779,8 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
    PUSH_DATA (push, 0);
 
+   nvc0_update_compute_invocations_counter(nvc0, info);
+
 out:
    if (ret)
       NOUVEAU_ERR("Failed to launch grid !\n");