From 5b6f522fc29f4c8bbadd0466b6f61c1876c95807 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 27 Jun 2018 00:04:41 +0100 Subject: [PATCH] nvc0: add compute invocation counter The strategy is to keep a CPU-side counter of the direct invocations, and a GPU-side counter of the indirect invocations, and then add them together for queries. The specific technique is a macro which multiplies a list of integers together and accumulates the product into SCRATCH registers held inside of the context. Another macro will read those values out and add them to the passed-in cpu-side counter to be stored in a query buffer the same way that all the other statistics are stored. Original implementation by Rhys Perry, redone by Ilia Mirkin to use the SCRATCH temporaries. Signed-off-by: Ilia Mirkin --- .../drivers/nouveau/nvc0/mme/com9097.mme | 90 +++++++++++++++++++ .../drivers/nouveau/nvc0/mme/com9097.mme.h | 54 +++++++++++ .../drivers/nouveau/nvc0/nvc0_compute.c | 32 +++++++ .../drivers/nouveau/nvc0/nvc0_context.h | 4 + .../drivers/nouveau/nvc0/nvc0_macros.h | 4 + .../drivers/nouveau/nvc0/nvc0_query_hw.c | 23 ++++- .../drivers/nouveau/nvc0/nvc0_screen.c | 2 + .../drivers/nouveau/nvc0/nve4_compute.c | 2 + 8 files changed, 207 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme index 38c2e868431..d6af8221b65 100644 --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme @@ -580,3 +580,93 @@ crs_loop: /* Enable */ exit maddr 0x1452 /* CONSERVATIVE_RASTER */ send 0x1 + +/* NVC0_3D_MACRO_COMPUTE_COUNTER + * + * This macro takes 6 values, num_groups_* and group_size_*, and adds their + * product to the current value + * + * It's used for keeping track of the number of executed indirect + * compute invocations for statistics. 
+ * + * SCRATCH[4] = current counter [low] + * SCRATCH[5] = current counter [high] + * + * arg = number of parameters to multiply together, ideally 6 + * parm[0] = group_size_x + * parm[1] = group_size_y + * parm[2] = group_size_z + * parm[3] = num_groups_x + * parm[4] = num_groups_y + * parm[5] = num_groups_z + */ +.section #mme9097_compute_counter + mov $r7 $r1 + mov $r1 1 /* low result */ + mov $r2 0 /* high result */ +iic_loop_start: + parm $r3 /* val, next integer to multiply in */ + /* multiplication start - look at low bit, add if set, shift right/left */ + mov $r4 0 /* low temp */ + mov $r5 0 /* high temp */ +iic_mul_start: /* temp = result * val */ + braz annul $r3 #iic_mul_done +iic_mul_body: + mov $r6 (extrinsrt 0x0 $r3 0 1 0) /* val & 1 - check low bit */ + braz $r6 #iic_mul_cont /* bit not set */ + mov $r3 (extrinsrt 0x0 $r3 1 31 0) /* val >>= 1 - shift right */ + + mov $r4 (add $r4 $r1) /* temp += result */ + mov $r5 (adc $r5 $r2) +iic_mul_cont: + mov $r1 (add $r1 $r1) /* shift left, part 1 (result *= 2) */ + bra #iic_mul_start + mov $r2 (adc $r2 $r2) /* shift left, part 2 */ +iic_mul_done: + /* decrease loop counter, keep going if necessary */ + mov $r7 (add $r7 -1) + /* result = temp ( = result * val ) */ + mov $r1 $r4 + branz $r7 #iic_loop_start + mov $r2 $r5 + + /* increment current value by newly-calculated invocation count */ + read $r3 0xd04 /* SCRATCH[4] */ + read $r4 0xd05 /* SCRATCH[5] */ + maddr 0x1d04 /* SCRATCH[4] */ + exit send (add $r3 $r1) + send (adc $r4 $r2) + +/* NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY + * + * This macro writes out the indirect counter plus a direct value to + * the given address using QUERY_GET (64-bit value). 
+ * + * arg = direct counter low + * parm[0] = direct counter high + * parm[1] = query address high + * parm[2] = query address low + */ +.section #mme9097_compute_counter_to_query + parm $r2 /* counter high */ + read $r3 0xd04 /* SCRATCH[4] */ + read $r4 0xd05 /* SCRATCH[5] */ + mov $r1 (add $r1 $r3) + mov $r2 (adc $r2 $r4) + + parm $r3 maddr 0x16c0 /* QUERY_ADDRESS_HIGH */ + parm $r4 send $r3 + send $r4 /* r3 = addr high, r4 = addr low */ + send $r1 /* sum low */ + mov $r5 0x1000 + send (extrinsrt 0x0 $r5 0x0 0x10 0x10) /* GET_SHORT */ + + /* add 4 to the address */ + mov $r1 0x4 + mov $r4 (add $r4 $r1) /* addr low */ + mov $r3 (adc $r3 0x0) /* addr high */ + maddr 0x16c0 /* QUERY_ADDRESS_HIGH */ + send $r3 /* addr high */ + send $r4 /* addr low */ + exit send $r2 /* sum high */ + send (extrinsrt 0x0 $r5 0x0 0x10 0x10) /* GET_SHORT */ diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h index 49c08911142..f068367c84e 100644 --- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h +++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h @@ -394,3 +394,57 @@ uint32_t mme9097_conservative_raster_state[] = { 0x051480a1, 0x00004041, }; + +uint32_t mme9097_compute_counter[] = { +/* 0x0003: iic_loop_start */ + 0x00000f11, +/* 0x0006: iic_mul_start */ +/* 0x0007: iic_mul_body */ + 0x00004111, + 0x00000211, +/* 0x000c: iic_mul_cont */ +/* 0x000f: iic_mul_done */ + 0x00000301, + 0x00000411, + 0x00000511, + 0x00025827, + 0x0040c612, + 0x00013007, + 0x07c2c312, + 0x00006410, + 0x0002ad10, + 0x00004910, + 0xfffe4007, + 0x00029210, + 0xffffff11, + 0x00002111, + 0xfffcb817, + 0x00002a11, + 0x03410315, + 0x03414415, + 0x07410021, + 0x000058c0, + 0x0002a040, +}; + +uint32_t mme9097_compute_counter_to_query[] = { + 0x00000201, + 0x03410315, + 0x03414415, + 0x0000c910, + 0x00031210, + 0x05b00351, + 0x00001c31, + 0x00002041, + 0x00000841, + 0x04000511, + 0x84014042, + 0x00010111, + 0x00006410, + 0x00021b10, + 
0x05b00021, + 0x00001841, + 0x00002041, + 0x000010c1, + 0x84014042, +}; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index 28e16367326..3ab2f5e3d7f 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -500,4 +500,36 @@ nvc0_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF); nvc0->dirty_cp |= NVC0_NEW_CP_SURFACES; nvc0->images_dirty[5] |= nvc0->images_valid[5]; + + nvc0_update_compute_invocations_counter(nvc0, info); +} + +static void +nvc0_compute_update_indirect_invocations(struct nvc0_context *nvc0, + const struct pipe_grid_info *info) { + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nv04_resource *res = nv04_resource(info->indirect); + uint32_t offset = res->offset + info->indirect_offset; + + nouveau_pushbuf_space(push, 16, 0, 8); + PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain); + BEGIN_1IC0(push, NVC0_3D(MACRO_COMPUTE_COUNTER), 7); + PUSH_DATA(push, 6); + PUSH_DATA(push, info->block[0]); + PUSH_DATA(push, info->block[1]); + PUSH_DATA(push, info->block[2]); + nouveau_pushbuf_data(push, res->bo, offset, + NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4); +} + +void +nvc0_update_compute_invocations_counter(struct nvc0_context *nvc0, + const struct pipe_grid_info *info) { + if (unlikely(info->indirect)) { + nvc0_compute_update_indirect_invocations(nvc0, info); + } else { + /* Multiply in 64 bits: the grid dimension product can exceed 2^32. */ + nvc0->compute_invocations += (uint64_t)info->grid[0] * info->grid[1] * + info->grid[2] * info->block[0] * info->block[1] * info->block[2]; + } } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index c1351062676..4cfd207d4c0 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -282,6 +282,8 @@ struct
nvc0_context { uint16_t images_valid[6]; struct util_dynarray global_residents; + + uint64_t compute_invocations; }; static inline struct nvc0_context * @@ -442,5 +444,7 @@ void nve4_launch_grid(struct pipe_context *, const struct pipe_grid_info *); /* nvc0_compute.c */ void nvc0_launch_grid(struct pipe_context *, const struct pipe_grid_info *); void nvc0_compute_validate_globals(struct nvc0_context *); +void nvc0_update_compute_invocations_counter(struct nvc0_context *nvc0, + const struct pipe_grid_info *info); #endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h index 7aa06337950..f4842fd6d68 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h @@ -39,4 +39,8 @@ #define NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE 0x00003868 +#define NVC0_3D_MACRO_COMPUTE_COUNTER 0x00003870 + +#define NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY 0x00003878 + #endif /* __NVC0_MACROS_H__ */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c index f6d5d0f5602..b6a214ccd49 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -122,6 +122,22 @@ nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q) FREE(hq); } +static void +nvc0_hw_query_write_compute_invocations(struct nvc0_context *nvc0, + struct nvc0_hw_query *hq, + uint32_t offset) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + + nouveau_pushbuf_space(push, 16, 0, 8); + PUSH_REFN(push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR); + BEGIN_1IC0(push, NVC0_3D(MACRO_COMPUTE_COUNTER_TO_QUERY), 4); + PUSH_DATA (push, nvc0->compute_invocations); + PUSH_DATAh(push, nvc0->compute_invocations); + PUSH_DATAh(push, hq->bo->offset + hq->offset + offset); + PUSH_DATA (push, hq->bo->offset + hq->offset + offset); +} + static boolean nvc0_hw_begin_query(struct nvc0_context *nvc0, struct 
nvc0_query *q) { @@ -198,7 +214,7 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */ nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */ nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */ - ((uint64_t *)hq->data)[(12 + 10) * 2] = 0; + nvc0_hw_query_write_compute_invocations(nvc0, hq, 0xc0 + 0xa0); break; default: break; @@ -271,7 +287,7 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q) nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */ nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */ nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */ - ((uint64_t *)hq->data)[10 * 2] = 0; + nvc0_hw_query_write_compute_invocations(nvc0, hq, 0xa0); break; case PIPE_QUERY_TIMESTAMP_DISJOINT: /* This query is not issued on GPU because disjoint is forced to false */ @@ -354,9 +370,8 @@ nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, res64[0] = data64[1] - data64[3]; break; case PIPE_QUERY_PIPELINE_STATISTICS: - for (i = 0; i < 10; ++i) + for (i = 0; i < 11; ++i) res64[i] = data64[i * 2] - data64[24 + i * 2]; - result->pipeline_statistics.cs_invocations = 0; break; case NVC0_HW_QUERY_TFB_BUFFER_OFFSET: res32[0] = hq->data[1]; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 216fba49d9e..6a79fd9a903 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -1308,6 +1308,8 @@ nvc0_screen_create(struct nouveau_device *dev) MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count); MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mme9097_query_buffer_write); MK_MACRO(NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE, mme9097_conservative_raster_state); + MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER, mme9097_compute_counter); + 
MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY, mme9097_compute_counter_to_query); MK_MACRO(NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT, mme90c0_launch_grid_indirect); BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1); diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c index fcd7d9537f9..c5e4dec20bd 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c @@ -779,6 +779,8 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1); PUSH_DATA (push, 0); + nvc0_update_compute_invocations_counter(nvc0, info); + out: if (ret) NOUVEAU_ERR("Failed to launch grid !\n"); -- 2.30.2