From 7329f8dd10673a474554fd881eb09d6908496e0f Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Sat, 2 Nov 2013 19:10:19 -0700
Subject: [PATCH] i965: Enumerate Observability Architecture counters on Gen5+.

In addition to listing the counter names, we include several "remap"
tables.

Confusingly, counters are documented with names like "A23", are written
to some buffer offset other than 23, and exposed by core Mesa under a
counter ID that is different still.

The first is inevitable; MI_REPORT_PERF_COUNT writes certain counters to
fixed locations in the buffer. The latter could be avoided, but core
Mesa uses the "Counters" array index as the ID for a counter. We could
do remapping there, but it would just complicate the core Mesa code.

Signed-off-by: Kenneth Graunke
Reviewed-by: Eric Anholt
---
 src/mesa/drivers/dri/i965/brw_context.h       |   9 +
 .../dri/i965/brw_performance_monitor.c        | 297 ++++++++++++++++++
 2 files changed, 306 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index a2720d22c43..43d0bbf7a5e 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1402,6 +1402,15 @@ struct brw_context
    struct {
       /** A map from pipeline statistics counter IDs to MMIO addresses. */
       const int *statistics_registers;
+
+      /**
+       * Mapping from a uint32_t offset within an OA snapshot to the ID of
+       * the counter which MI_REPORT_PERF_COUNT stores there.
+       */
+      const int *oa_snapshot_layout;
+
+      /** Number of 32-bit entries in a hardware counter snapshot. */
+      int entries_per_oa_snapshot;
    } perfmon;
 
    int num_atoms;
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
index 725e8bb4e1d..850dba78abc 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
@@ -25,6 +25,21 @@
  * \file brw_performance_monitor.c
  *
  * Implementation of the GL_AMD_performance_monitor extension.
+ *
+ * On Gen5+ hardware, we have two sources of performance counter data:
+ * the Observability Architecture counters (MI_REPORT_PERF_COUNT), and
+ * the Pipeline Statistics Registers. We expose both sets of raw data,
+ * as well as some useful processed values.
+ *
+ * The Observability Architecture (OA) counters for Gen6+ are documented
+ * in a separate document from the rest of the PRMs. It is available at:
+ * https://01.org/linuxgraphics/documentation/driver-documentation-prms
+ * => 2013 Intel Core Processor Family => Observability Performance Counters
+ * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell.)
+ *
+ * On Ironlake, the OA counters were called "CHAPS" counters. Sadly, no public
+ * documentation exists; our implementation is based on the source code for the
+ * intel_perf_counters utility (which is available as part of intel-gpu-tools).
  */
 
 #include
@@ -100,14 +115,88 @@ brw_perf_monitor(struct gl_perf_monitor_object *m)
 
 /** Performance Monitor Group IDs */
 enum brw_counter_groups {
+   OA_COUNTERS, /* Observability Architecture (MI_REPORT_PERF_COUNT) Counters */
    PIPELINE_STATS_COUNTERS, /* Pipeline Statistics Register Counters */
 };
 
 /**
  * Ironlake:
  * @{
+ *
+ * The list of CHAPS counters unfortunately does not appear in any public
+ * documentation, but is available by reading the source code for the
+ * intel_perf_counters utility (shipped as part of intel-gpu-tools).
  */
+const static struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = {
+   COUNTER("cycles the CS unit is starved"),
+   COUNTER("cycles the CS unit is stalled"),
+   COUNTER("cycles the VF unit is starved"),
+   COUNTER("cycles the VF unit is stalled"),
+   COUNTER("cycles the VS unit is starved"),
+   COUNTER("cycles the VS unit is stalled"),
+   COUNTER("cycles the GS unit is starved"),
+   COUNTER("cycles the GS unit is stalled"),
+   COUNTER("cycles the CL unit is starved"),
+   COUNTER("cycles the CL unit is stalled"),
+   COUNTER("cycles the SF unit is starved"),
+   COUNTER("cycles the SF unit is stalled"),
+   COUNTER("cycles the WZ unit is starved"),
+   COUNTER("cycles the WZ unit is stalled"),
+   COUNTER("Z buffer read/write"),
+   COUNTER("cycles each EU was active"),
+   COUNTER("cycles each EU was suspended"),
+   COUNTER("cycles threads loaded all EUs"),
+   COUNTER("cycles filtering active"),
+   COUNTER("cycles PS threads executed"),
+   COUNTER("subspans written to RC"),
+   COUNTER("bytes read for texture reads"),
+   COUNTER("texels returned from sampler"),
+   COUNTER("polygons not culled"),
+   COUNTER("clocks MASF has valid message"),
+   COUNTER("64b writes/reads from RC"),
+   COUNTER("reads on dataport"),
+   COUNTER("clocks MASF has valid msg not consumed by sampler"),
+   COUNTER("cycles any EU is stalled for math"),
+};
+
+const static int gen5_oa_snapshot_layout[] =
+{
+   -1, /* Report ID */
+   -1, /* TIMESTAMP (64-bit) */
+   -1, /* ...second half... */
+    0, /* cycles the CS unit is starved */
+    1, /* cycles the CS unit is stalled */
+    2, /* cycles the VF unit is starved */
+    3, /* cycles the VF unit is stalled */
+    4, /* cycles the VS unit is starved */
+    5, /* cycles the VS unit is stalled */
+    6, /* cycles the GS unit is starved */
+    7, /* cycles the GS unit is stalled */
+    8, /* cycles the CL unit is starved */
+    9, /* cycles the CL unit is stalled */
+   10, /* cycles the SF unit is starved */
+   11, /* cycles the SF unit is stalled */
+   12, /* cycles the WZ unit is starved */
+   13, /* cycles the WZ unit is stalled */
+   14, /* Z buffer read/write */
+   15, /* cycles each EU was active */
+   16, /* cycles each EU was suspended */
+   17, /* cycles threads loaded all EUs */
+   18, /* cycles filtering active */
+   19, /* cycles PS threads executed */
+   20, /* subspans written to RC */
+   21, /* bytes read for texture reads */
+   22, /* texels returned from sampler */
+   23, /* polygons not culled */
+   24, /* clocks MASF has valid message */
+   25, /* 64b writes/reads from RC */
+   26, /* reads on dataport */
+   27, /* clocks MASF has valid msg not consumed by sampler */
+   28, /* cycles any EU is stalled for math */
+};
+
 const static struct gl_perf_monitor_group gen5_groups[] = {
+   [OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters),
    /* Our pipeline statistics counter handling requires hardware contexts. */
 };
 /** @} */
@@ -115,7 +204,93 @@ const static struct gl_perf_monitor_group gen5_groups[] = {
 /**
  * Sandybridge:
  * @{
+ *
+ * A few of the counters here (A17-A20) are not included in the latest
+ * documentation, but are described in the Ironlake PRM (which strangely
+ * documents Sandybridge's performance counter system, not Ironlake's).
+ * It's unclear whether they work or not; empirically, they appear to.
+ */
+
+/**
+ * Aggregating counters A0-A28:
+ */
+const static struct gl_perf_monitor_counter gen6_raw_oa_counters[] = {
+   /* A0: 0 */ COUNTER("Aggregated Core Array Active"),
+   /* A1: 1 */ COUNTER("Aggregated Core Array Stalled"),
+   /* A2: 2 */ COUNTER("Vertex Shader Active Time"),
+   /* A3: Not actually hooked up on Sandybridge. */
+   /* A4: 3 */ COUNTER("Vertex Shader Stall Time - Core Stall"),
+   /* A5: 4 */ COUNTER("# VS threads loaded"),
+   /* A6: 5 */ COUNTER("Vertex Shader Ready but not running Time"),
+   /* A7: 6 */ COUNTER("Geometry Shader Active Time"),
+   /* A8: Not actually hooked up on Sandybridge. */
+   /* A9: 7 */ COUNTER("Geometry Shader Stall Time - Core Stall"),
+   /* A10: 8 */ COUNTER("# GS threads loaded"),
+   /* A11: 9 */ COUNTER("Geometry Shader Ready but not running Time"),
+   /* A12: 10 */ COUNTER("Pixel Shader Active Time"),
+   /* A13: Not actually hooked up on Sandybridge. */
+   /* A14: 11 */ COUNTER("Pixel Shader Stall Time - Core Stall"),
+   /* A15: 12 */ COUNTER("# PS threads loaded"),
+   /* A16: 13 */ COUNTER("Pixel Shader Ready but not running Time"),
+   /* A17: 14 */ COUNTER("Early Z Test Pixels Passing"),
+   /* A18: 15 */ COUNTER("Early Z Test Pixels Failing"),
+   /* A19: 16 */ COUNTER("Early Stencil Test Pixels Passing"),
+   /* A20: 17 */ COUNTER("Early Stencil Test Pixels Failing"),
+   /* A21: 18 */ COUNTER("Pixel Kill Count"),
+   /* A22: 19 */ COUNTER("Alpha Test Pixels Failed"),
+   /* A23: 20 */ COUNTER("Post PS Stencil Pixels Failed"),
+   /* A24: 21 */ COUNTER("Post PS Z buffer Pixels Failed"),
+   /* A25: 22 */ COUNTER("Pixels/samples Written in the frame buffer"),
+   /* A26: 23 */ COUNTER("GPU Busy"),
+   /* A27: 24 */ COUNTER("CL active and not stalled"),
+   /* A28: 25 */ COUNTER("SF active and stalled"),
+};
+
+/**
+ * Sandybridge: Counter Select = 001
+ * A0   A1   A2   A3   A4   TIMESTAMP RPT_ID
+ * A5   A6   A7   A8   A9   A10  A11  A12
+ * A13  A14  A15  A16  A17  A18  A19  A20
+ * A21  A22  A23  A24  A25  A26  A27  A28
+ *
+ * (Yes, this is a strange order.) We also have to remap for missing counters.
  */
+const static int gen6_oa_snapshot_layout[] =
+{
+   -1, /* Report ID */
+   -1, /* TIMESTAMP (64-bit) */
+   -1, /* ...second half... */
+    3, /* A4: Vertex Shader Stall Time - Core Stall */
+   -1, /* A3: (not available) */
+    2, /* A2: Vertex Shader Active Time */
+    1, /* A1: Aggregated Core Array Stalled */
+    0, /* A0: Aggregated Core Array Active */
+   10, /* A12: Pixel Shader Active Time */
+    9, /* A11: Geometry Shader ready but not running Time */
+    8, /* A10: # GS threads loaded */
+    7, /* A9: Geometry Shader Stall Time - Core Stall */
+   -1, /* A8: (not available) */
+    6, /* A7: Geometry Shader Active Time */
+    5, /* A6: Vertex Shader ready but not running Time */
+    4, /* A5: # VS Threads Loaded */
+   17, /* A20: Early Stencil Test Pixels Failing */
+   16, /* A19: Early Stencil Test Pixels Passing */
+   15, /* A18: Early Z Test Pixels Failing */
+   14, /* A17: Early Z Test Pixels Passing */
+   13, /* A16: Pixel Shader ready but not running Time */
+   12, /* A15: # PS threads loaded */
+   11, /* A14: Pixel Shader Stall Time - Core Stall */
+   -1, /* A13: (not available) */
+   25, /* A28: SF active and stalled */
+   24, /* A27: CL active and not stalled */
+   23, /* A26: GPU Busy */
+   22, /* A25: Pixels/samples Written in the frame buffer */
+   21, /* A24: Post PS Z buffer Pixels Failed */
+   20, /* A23: Post PS Stencil Pixels Failed */
+   19, /* A22: Alpha Test Pixels Failed */
+   18, /* A21: Pixel Kill Count */
+};
+
 const static struct gl_perf_monitor_counter gen6_statistics_counters[] = {
    COUNTER64("IA_VERTICES_COUNT"),
    COUNTER64("IA_PRIMITIVES_COUNT"),
@@ -146,6 +321,7 @@ const static int gen6_statistics_register_addresses[] = {
 };
 
 const static struct gl_perf_monitor_group gen6_groups[] = {
+   GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters),
    GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters),
 };
 /** @} */
@@ -154,6 +330,120 @@ const static struct gl_perf_monitor_group gen6_groups[] = {
  * Ivybridge/Baytrail/Haswell:
  * @{
  */
+const static struct gl_perf_monitor_counter gen7_raw_oa_counters[] = {
+   COUNTER("Aggregated Core Array Active"),
+   COUNTER("Aggregated Core Array Stalled"),
+   COUNTER("Vertex Shader Active Time"),
+   COUNTER("Vertex Shader Stall Time - Core Stall"),
+   COUNTER("# VS threads loaded"),
+   COUNTER("Hull Shader Active Time"),
+   COUNTER("Hull Shader Stall Time - Core Stall"),
+   COUNTER("# HS threads loaded"),
+   COUNTER("Domain Shader Active Time"),
+   COUNTER("Domain Shader Stall Time - Core Stall"),
+   COUNTER("# DS threads loaded"),
+   COUNTER("Compute Shader Active Time"),
+   COUNTER("Compute Shader Stall Time - Core Stall"),
+   COUNTER("# CS threads loaded"),
+   COUNTER("Geometry Shader Active Time"),
+   COUNTER("Geometry Shader Stall Time - Core Stall"),
+   COUNTER("# GS threads loaded"),
+   COUNTER("Pixel Shader Active Time"),
+   COUNTER("Pixel Shader Stall Time - Core Stall"),
+   COUNTER("# PS threads loaded"),
+   COUNTER("HiZ Fast Z Test Pixels Passing"),
+   COUNTER("HiZ Fast Z Test Pixels Failing"),
+   COUNTER("Slow Z Test Pixels Passing"),
+   COUNTER("Slow Z Test Pixels Failing"),
+   COUNTER("Pixel Kill Count"),
+   COUNTER("Alpha Test Pixels Failed"),
+   COUNTER("Post PS Stencil Pixels Failed"),
+   COUNTER("Post PS Z buffer Pixels Failed"),
+   COUNTER("3D/GPGPU Render Target Writes"),
+   COUNTER("Render Engine Busy"),
+   COUNTER("VS bottleneck"),
+   COUNTER("GS bottleneck"),
+};
+
+/**
+ * Ivybridge/Baytrail/Haswell: Counter Select = 101
+ * A4   A3   A2   A1   A0   TIMESTAMP ReportID
+ * A12  A11  A10  A9   A8   A7   A6   A5
+ * A20  A19  A18  A17  A16  A15  A14  A13
+ * A28  A27  A26  A25  A24  A23  A22  A21
+ * A36  A35  A34  A33  A32  A31  A30  A29
+ * A44  A43  A42  A41  A40  A39  A38  A37
+ * B7   B6   B5   B4   B3   B2   B1   B0
+ * Rsv  Rsv  Rsv  Rsv  Rsv  Rsv  Rsv  Rsv
+ */
+const static int gen7_oa_snapshot_layout[] =
+{
+   -1, /* Report ID */
+   -1, /* TIMESTAMP (64-bit) */
+   -1, /* ...second half... */
+    0, /* A0: Aggregated Core Array Active */
+    1, /* A1: Aggregated Core Array Stalled */
+    2, /* A2: Vertex Shader Active Time */
+   -1, /* A3: Reserved */
+    3, /* A4: Vertex Shader Stall Time - Core Stall */
+    4, /* A5: # VS threads loaded */
+   -1, /* A6: Reserved */
+    5, /* A7: Hull Shader Active Time */
+   -1, /* A8: Reserved */
+    6, /* A9: Hull Shader Stall Time - Core Stall */
+    7, /* A10: # HS threads loaded */
+   -1, /* A11: Reserved */
+    8, /* A12: Domain Shader Active Time */
+   -1, /* A13: Reserved */
+    9, /* A14: Domain Shader Stall Time - Core Stall */
+   10, /* A15: # DS threads loaded */
+   -1, /* A16: Reserved */
+   11, /* A17: Compute Shader Active Time */
+   -1, /* A18: Reserved */
+   12, /* A19: Compute Shader Stall Time - Core Stall */
+   13, /* A20: # CS threads loaded */
+   -1, /* A21: Reserved */
+   14, /* A22: Geometry Shader Active Time */
+   -1, /* A23: Reserved */
+   15, /* A24: Geometry Shader Stall Time - Core Stall */
+   16, /* A25: # GS threads loaded */
+   -1, /* A26: Reserved */
+   17, /* A27: Pixel Shader Active Time */
+   -1, /* A28: Reserved */
+   18, /* A29: Pixel Shader Stall Time - Core Stall */
+   19, /* A30: # PS threads loaded */
+   -1, /* A31: Reserved */
+   20, /* A32: HiZ Fast Z Test Pixels Passing */
+   21, /* A33: HiZ Fast Z Test Pixels Failing */
+   22, /* A34: Slow Z Test Pixels Passing */
+   23, /* A35: Slow Z Test Pixels Failing */
+   24, /* A36: Pixel Kill Count */
+   25, /* A37: Alpha Test Pixels Failed */
+   26, /* A38: Post PS Stencil Pixels Failed */
+   27, /* A39: Post PS Z buffer Pixels Failed */
+   28, /* A40: 3D/GPGPU Render Target Writes */
+   29, /* A41: Render Engine Busy */
+   30, /* A42: VS bottleneck */
+   31, /* A43: GS bottleneck */
+   -1, /* A44: Reserved */
+   -1, /* B0 */
+   -1, /* B1 */
+   -1, /* B2 */
+   -1, /* B3 */
+   -1, /* B4 */
+   -1, /* B5 */
+   -1, /* B6 */
+   -1, /* B7 */
+   -1, /* Reserved */
+   -1, /* Reserved */
+   -1, /* Reserved */
+   -1, /* Reserved */
+   -1, /* Reserved */
+   -1, /* Reserved */
+   -1, /* Reserved */
+   -1, /* Reserved */
+};
+
 const static struct gl_perf_monitor_counter gen7_statistics_counters[] = {
    COUNTER64("IA_VERTICES_COUNT"),
    COUNTER64("IA_PRIMITIVES_COUNT"),
@@ -200,6 +490,7 @@ const static int gen7_statistics_register_addresses[] = {
 };
 
 const static struct gl_perf_monitor_group gen7_groups[] = {
+   GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters),
    GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters),
 };
 /** @} */
@@ -481,13 +772,19 @@ brw_init_performance_monitors(struct brw_context *brw)
    if (brw->gen == 5) {
       ctx->PerfMonitor.Groups = gen5_groups;
       ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups);
+      brw->perfmon.oa_snapshot_layout = gen5_oa_snapshot_layout;
+      brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen5_oa_snapshot_layout);
    } else if (brw->gen == 6) {
       ctx->PerfMonitor.Groups = gen6_groups;
       ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen6_groups);
+      brw->perfmon.oa_snapshot_layout = gen6_oa_snapshot_layout;
+      brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen6_oa_snapshot_layout);
       brw->perfmon.statistics_registers = gen6_statistics_register_addresses;
    } else if (brw->gen == 7) {
       ctx->PerfMonitor.Groups = gen7_groups;
       ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen7_groups);
+      brw->perfmon.oa_snapshot_layout = gen7_oa_snapshot_layout;
+      brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout);
       brw->perfmon.statistics_registers = gen7_statistics_register_addresses;
    }
 }
-- 
2.30.2