X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_performance_monitor.c;h=f8e50e10fa3a611c4bc6d9944870044513a18e18;hb=0e657b7b55bc7c83c8eb5258cd9522b0e5e581b7;hp=34f2aa3e65fdd3e16501aff8f6501d3a2fce4fc0;hpb=569adb40d7f3fa5084c20dd4b774fb1482d00c9c;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c index 34f2aa3e65f..f8e50e10fa3 100644 --- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c +++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c @@ -25,17 +25,32 @@ * \file brw_performance_monitor.c * * Implementation of the GL_AMD_performance_monitor extension. + * + * On Gen5+ hardware, we have two sources of performance counter data: + * the Observability Architecture counters (MI_REPORT_PERF_COUNT), and + * the Pipeline Statistics Registers. We expose both sets of raw data, + * as well as some useful processed values. + * + * The Observability Architecture (OA) counters for Gen6+ are documented + * in a separate document from the rest of the PRMs. It is available at: + * https://01.org/linuxgraphics/documentation/driver-documentation-prms + * => 2013 Intel Core Processor Family => Observability Performance Counters + * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell.) + * + * On Ironlake, the OA counters were called "CHAPS" counters. Sadly, no public + * documentation exists; our implementation is based on the source code for the + * intel_perf_counters utility (which is available as part of intel-gpu-tools). */ #include -#include "main/bitset.h" +#include "util/bitset.h" #include "main/hash.h" #include "main/macros.h" #include "main/mtypes.h" #include "main/performance_monitor.h" -#include "glsl/ralloc.h" +#include "util/ralloc.h" #include "brw_context.h" #include "brw_defines.h" @@ -50,6 +65,37 @@ struct brw_perf_monitor_object { /** The base class. */ struct gl_perf_monitor_object base; + + /** + * BO containing OA counter snapshots at monitor Begin/End time. + */ + drm_intel_bo *oa_bo; + + /** Indexes into bookend_bo (snapshot numbers) for various segments. */ + int oa_head_end; + int oa_middle_start; + int oa_tail_start; + + /** + * Storage for OA results accumulated so far. + * + * An array indexed by the counter ID in the OA_COUNTERS group. + * + * When we run out of space in bookend_bo, we compute the results so far + * and add them to the value stored here. Then, we can discard bookend_bo. + */ + uint32_t *oa_results; + + /** + * BO containing starting and ending snapshots for any active pipeline + * statistics counters. + */ + drm_intel_bo *pipeline_stats_bo; + + /** + * Storage for final pipeline statistics counter results. + */ + uint64_t *pipeline_stats_results; }; /** Downcasting convenience macro. */ @@ -59,6 +105,11 @@ brw_perf_monitor(struct gl_perf_monitor_object *m) return (struct brw_perf_monitor_object *) m; } +#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048 + +/* A random value used to ensure we're getting valid snapshots. 
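+ * MI_REPORT_PERF_COUNT writes this ID as the first DWord of every snapshot,
+ * and add_deltas() asserts that it reads back intact before using the data.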
*/ +#define REPORT_ID 0xd2e9c607 + /******************************************************************************/ #define COUNTER(name) \ @@ -85,11 +136,90 @@ brw_perf_monitor(struct gl_perf_monitor_object *m) .NumCounters = ARRAY_SIZE(counter_list), \ } +/** Performance Monitor Group IDs */ +enum brw_counter_groups { + OA_COUNTERS, /* Observability Architecture (MI_REPORT_PERF_COUNT) Counters */ + PIPELINE_STATS_COUNTERS, /* Pipeline Statistics Register Counters */ +}; + /** * Ironlake: * @{ + * + * The list of CHAPS counters unfortunately does not appear in any public + * documentation, but is available by reading the source code for the + * intel_perf_counters utility (shipped as part of intel-gpu-tools). */ -const static struct gl_perf_monitor_group gen5_groups[] = { +static const struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = { + COUNTER("cycles the CS unit is starved"), + COUNTER("cycles the CS unit is stalled"), + COUNTER("cycles the VF unit is starved"), + COUNTER("cycles the VF unit is stalled"), + COUNTER("cycles the VS unit is starved"), + COUNTER("cycles the VS unit is stalled"), + COUNTER("cycles the GS unit is starved"), + COUNTER("cycles the GS unit is stalled"), + COUNTER("cycles the CL unit is starved"), + COUNTER("cycles the CL unit is stalled"), + COUNTER("cycles the SF unit is starved"), + COUNTER("cycles the SF unit is stalled"), + COUNTER("cycles the WZ unit is starved"), + COUNTER("cycles the WZ unit is stalled"), + COUNTER("Z buffer read/write"), + COUNTER("cycles each EU was active"), + COUNTER("cycles each EU was suspended"), + COUNTER("cycles threads loaded all EUs"), + COUNTER("cycles filtering active"), + COUNTER("cycles PS threads executed"), + COUNTER("subspans written to RC"), + COUNTER("bytes read for texture reads"), + COUNTER("texels returned from sampler"), + COUNTER("polygons not culled"), + COUNTER("clocks MASF has valid message"), + COUNTER("64b writes/reads from RC"), + COUNTER("reads on dataport"), + COUNTER("clocks MASF has valid msg not consumed by sampler"), + COUNTER("cycles any EU is stalled for math"), +}; + +static const int gen5_oa_snapshot_layout[] = +{ + -1, /* Report ID */ + -1, /* TIMESTAMP (64-bit) */ + -1, /* ...second half... 
*/ + 0, /* cycles the CS unit is starved */ + 1, /* cycles the CS unit is stalled */ + 2, /* cycles the VF unit is starved */ + 3, /* cycles the VF unit is stalled */ + 4, /* cycles the VS unit is starved */ + 5, /* cycles the VS unit is stalled */ + 6, /* cycles the GS unit is starved */ + 7, /* cycles the GS unit is stalled */ + 8, /* cycles the CL unit is starved */ + 9, /* cycles the CL unit is stalled */ + 10, /* cycles the SF unit is starved */ + 11, /* cycles the SF unit is stalled */ + 12, /* cycles the WZ unit is starved */ + 13, /* cycles the WZ unit is stalled */ + 14, /* Z buffer read/write */ + 15, /* cycles each EU was active */ + 16, /* cycles each EU was suspended */ + 17, /* cycles threads loaded all EUs */ + 18, /* cycles filtering active */ + 19, /* cycles PS threads executed */ + 20, /* subspans written to RC */ + 21, /* bytes read for texture reads */ + 22, /* texels returned from sampler */ + 23, /* polygons not culled */ + 24, /* clocks MASF has valid message */ + 25, /* 64b writes/reads from RC */ + 26, /* reads on dataport */ + 27, /* clocks MASF has valid msg not consumed by sampler */ + 28, /* cycles any EU is stalled for math */ +}; + +static const struct gl_perf_monitor_group gen5_groups[] = { + [OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters), /* Our pipeline statistics counter handling requires hardware contexts. */ }; /** @} */ @@ -97,8 +227,125 @@ const static struct gl_perf_monitor_group gen5_groups[] = { /** * Sandybridge: * @{ + * + * A few of the counters here (A17-A20) are not included in the latest + * documentation, but are described in the Ironlake PRM (which strangely + * documents Sandybridge's performance counter system, not Ironlake's). + * It's unclear whether they work or not; empirically, they appear to. */ -const static struct gl_perf_monitor_group gen6_groups[] = { + +/** + * Aggregating counters A0-A28: + */ +static const struct gl_perf_monitor_counter gen6_raw_oa_counters[] = { + /* A0: 0 */ COUNTER("Aggregated Core Array Active"), + /* A1: 1 */ COUNTER("Aggregated Core Array Stalled"), + /* A2: 2 */ COUNTER("Vertex Shader Active Time"), + /* A3: Not actually hooked up on Sandybridge. */ + /* A4: 3 */ COUNTER("Vertex Shader Stall Time - Core Stall"), + /* A5: 4 */ COUNTER("# VS threads loaded"), + /* A6: 5 */ COUNTER("Vertex Shader Ready but not running Time"), + /* A7: 6 */ COUNTER("Geometry Shader Active Time"), + /* A8: Not actually hooked up on Sandybridge. */ + /* A9: 7 */ COUNTER("Geometry Shader Stall Time - Core Stall"), + /* A10: 8 */ COUNTER("# GS threads loaded"), + /* A11: 9 */ COUNTER("Geometry Shader Ready but not running Time"), + /* A12: 10 */ COUNTER("Pixel Shader Active Time"), + /* A13: Not actually hooked up on Sandybridge. 
*/ + /* A14: 11 */ COUNTER("Pixel Shader Stall Time - Core Stall"), + /* A15: 12 */ COUNTER("# PS threads loaded"), + /* A16: 13 */ COUNTER("Pixel Shader Ready but not running Time"), + /* A17: 14 */ COUNTER("Early Z Test Pixels Passing"), + /* A18: 15 */ COUNTER("Early Z Test Pixels Failing"), + /* A19: 16 */ COUNTER("Early Stencil Test Pixels Passing"), + /* A20: 17 */ COUNTER("Early Stencil Test Pixels Failing"), + /* A21: 18 */ COUNTER("Pixel Kill Count"), + /* A22: 19 */ COUNTER("Alpha Test Pixels Failed"), + /* A23: 20 */ COUNTER("Post PS Stencil Pixels Failed"), + /* A24: 21 */ COUNTER("Post PS Z buffer Pixels Failed"), + /* A25: 22 */ COUNTER("Pixels/samples Written in the frame buffer"), + /* A26: 23 */ COUNTER("GPU Busy"), + /* A27: 24 */ COUNTER("CL active and not stalled"), + /* A28: 25 */ COUNTER("SF active and stalled"), +}; + +/** + * Sandybridge: Counter Select = 001 + * A0 A1 A2 A3 A4 TIMESTAMP RPT_ID + * A5 A6 A7 A8 A9 A10 A11 A12 + * A13 A14 A15 A16 A17 A18 A19 A20 + * A21 A22 A23 A24 A25 A26 A27 A28 + * + * (Yes, this is a strange order.) We also have to remap for missing counters. + */ +static const int gen6_oa_snapshot_layout[] = +{ + -1, /* Report ID */ + -1, /* TIMESTAMP (64-bit) */ + -1, /* ...second half... */ + 3, /* A4: Vertex Shader Stall Time - Core Stall */ + -1, /* A3: (not available) */ + 2, /* A2: Vertex Shader Active Time */ + 1, /* A1: Aggregated Core Array Stalled */ + 0, /* A0: Aggregated Core Array Active */ + 10, /* A12: Pixel Shader Active Time */ + 9, /* A11: Geometry Shader ready but not running Time */ + 8, /* A10: # GS threads loaded */ + 7, /* A9: Geometry Shader Stall Time - Core Stall */ + -1, /* A8: (not available) */ + 6, /* A7: Geometry Shader Active Time */ + 5, /* A6: Vertex Shader ready but not running Time */ + 4, /* A5: # VS Threads Loaded */ + 17, /* A20: Early Stencil Test Pixels Failing */ + 16, /* A19: Early Stencil Test Pixels Passing */ + 15, /* A18: Early Z Test Pixels Failing */ + 14, /* A17: Early Z Test Pixels Passing */ + 13, /* A16: Pixel Shader ready but not running Time */ + 12, /* A15: # PS threads loaded */ + 11, /* A14: Pixel Shader Stall Time - Core Stall */ + -1, /* A13: (not available) */ + 25, /* A28: SF active and stalled */ + 24, /* A27: CL active and not stalled */ + 23, /* A26: GPU Busy */ + 22, /* A25: Pixels/samples Written in the frame buffer */ + 21, /* A24: Post PS Z buffer Pixels Failed */ + 20, /* A23: Post PS Stencil Pixels Failed */ + 19, /* A22: Alpha Test Pixels Failed */ + 18, /* A21: Pixel Kill Count */ +}; + +static const struct gl_perf_monitor_counter gen6_statistics_counters[] = { + COUNTER64("IA_VERTICES_COUNT"), + COUNTER64("IA_PRIMITIVES_COUNT"), + COUNTER64("VS_INVOCATION_COUNT"), + COUNTER64("GS_INVOCATION_COUNT"), + COUNTER64("GS_PRIMITIVES_COUNT"), + COUNTER64("CL_INVOCATION_COUNT"), + COUNTER64("CL_PRIMITIVES_COUNT"), + COUNTER64("PS_INVOCATION_COUNT"), + COUNTER64("PS_DEPTH_COUNT"), + COUNTER64("SO_NUM_PRIMS_WRITTEN"), + COUNTER64("SO_PRIM_STORAGE_NEEDED"), +}; + +/** MMIO register addresses for each pipeline statistics counter. 
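+ *
+ * Indexed identically to gen6_statistics_counters;
+ * snapshot_statistics_registers() copies each selected register into
+ * pipeline_stats_bo using brw_store_register_mem64().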
*/ +static const int gen6_statistics_register_addresses[] = { + IA_VERTICES_COUNT, + IA_PRIMITIVES_COUNT, + VS_INVOCATION_COUNT, + GS_INVOCATION_COUNT, + GS_PRIMITIVES_COUNT, + CL_INVOCATION_COUNT, + CL_PRIMITIVES_COUNT, + PS_INVOCATION_COUNT, + PS_DEPTH_COUNT, + GEN6_SO_NUM_PRIMS_WRITTEN, + GEN6_SO_PRIM_STORAGE_NEEDED, +}; + +static const struct gl_perf_monitor_group gen6_groups[] = { + GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters), + GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters), }; /** @} */ @@ -106,7 +353,168 @@ const static struct gl_perf_monitor_group gen6_groups[] = { * Ivybridge/Baytrail/Haswell: * @{ */ -const static struct gl_perf_monitor_group gen7_groups[] = { +static const struct gl_perf_monitor_counter gen7_raw_oa_counters[] = { + COUNTER("Aggregated Core Array Active"), + COUNTER("Aggregated Core Array Stalled"), + COUNTER("Vertex Shader Active Time"), + COUNTER("Vertex Shader Stall Time - Core Stall"), + COUNTER("# VS threads loaded"), + COUNTER("Hull Shader Active Time"), + COUNTER("Hull Shader Stall Time - Core Stall"), + COUNTER("# HS threads loaded"), + COUNTER("Domain Shader Active Time"), + COUNTER("Domain Shader Stall Time - Core Stall"), + COUNTER("# DS threads loaded"), + COUNTER("Compute Shader Active Time"), + COUNTER("Compute Shader Stall Time - Core Stall"), + COUNTER("# CS threads loaded"), + COUNTER("Geometry Shader Active Time"), + COUNTER("Geometry Shader Stall Time - Core Stall"), + COUNTER("# GS threads loaded"), + COUNTER("Pixel Shader Active Time"), + COUNTER("Pixel Shader Stall Time - Core Stall"), + COUNTER("# PS threads loaded"), + COUNTER("HiZ Fast Z Test Pixels Passing"), + COUNTER("HiZ Fast Z Test Pixels Failing"), + COUNTER("Slow Z Test Pixels Passing"), + COUNTER("Slow Z Test Pixels Failing"), + COUNTER("Pixel Kill Count"), + COUNTER("Alpha Test Pixels Failed"), + COUNTER("Post PS Stencil Pixels Failed"), + COUNTER("Post PS Z buffer Pixels Failed"), + COUNTER("3D/GPGPU Render Target Writes"), + COUNTER("Render Engine Busy"), + COUNTER("VS bottleneck"), + COUNTER("GS bottleneck"), +}; + +/** + * Ivybridge/Baytrail/Haswell: Counter Select = 101 + * A4 A3 A2 A1 A0 TIMESTAMP ReportID + * A12 A11 A10 A9 A8 A7 A6 A5 + * A20 A19 A18 A17 A16 A15 A14 A13 + * A28 A27 A26 A25 A24 A23 A22 A21 + * A36 A35 A34 A33 A32 A31 A30 A29 + * A44 A43 A42 A41 A40 A39 A38 A37 + * B7 B6 B5 B4 B3 B2 B1 B0 + * Rsv Rsv Rsv Rsv Rsv Rsv Rsv Rsv + */ +static const int gen7_oa_snapshot_layout[] = +{ + -1, /* Report ID */ + -1, /* TIMESTAMP (64-bit) */ + -1, /* ...second half... 
*/ + 0, /* A0: Aggregated Core Array Active */ + 1, /* A1: Aggregated Core Array Stalled */ + 2, /* A2: Vertex Shader Active Time */ + -1, /* A3: Reserved */ + 3, /* A4: Vertex Shader Stall Time - Core Stall */ + 4, /* A5: # VS threads loaded */ + -1, /* A6: Reserved */ + 5, /* A7: Hull Shader Active Time */ + -1, /* A8: Reserved */ + 6, /* A9: Hull Shader Stall Time - Core Stall */ + 7, /* A10: # HS threads loaded */ + -1, /* A11: Reserved */ + 8, /* A12: Domain Shader Active Time */ + -1, /* A13: Reserved */ + 9, /* A14: Domain Shader Stall Time - Core Stall */ + 10, /* A15: # DS threads loaded */ + -1, /* A16: Reserved */ + 11, /* A17: Compute Shader Active Time */ + -1, /* A18: Reserved */ + 12, /* A19: Compute Shader Stall Time - Core Stall */ + 13, /* A20: # CS threads loaded */ + -1, /* A21: Reserved */ + 14, /* A22: Geometry Shader Active Time */ + -1, /* A23: Reserved */ + 15, /* A24: Geometry Shader Stall Time - Core Stall */ + 16, /* A25: # GS threads loaded */ + -1, /* A26: Reserved */ + 17, /* A27: Pixel Shader Active Time */ + -1, /* A28: Reserved */ + 18, /* A29: Pixel Shader Stall Time - Core Stall */ + 19, /* A30: # PS threads loaded */ + -1, /* A31: Reserved */ + 20, /* A32: HiZ Fast Z Test Pixels Passing */ + 21, /* A33: HiZ Fast Z Test Pixels Failing */ + 22, /* A34: Slow Z Test Pixels Passing */ + 23, /* A35: Slow Z Test Pixels Failing */ + 24, /* A36: Pixel Kill Count */ + 25, /* A37: Alpha Test Pixels Failed */ + 26, /* A38: Post PS Stencil Pixels Failed */ + 27, /* A39: Post PS Z buffer Pixels Failed */ + 28, /* A40: 3D/GPGPU Render Target Writes */ + 29, /* A41: Render Engine Busy */ + 30, /* A42: VS bottleneck */ + 31, /* A43: GS bottleneck */ + -1, /* A44: Reserved */ + -1, /* B0 */ + -1, /* B1 */ + -1, /* B2 */ + -1, /* B3 */ + -1, /* B4 */ + -1, /* B5 */ + -1, /* B6 */ + -1, /* B7 */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ + -1, /* Reserved */ +}; + +static const struct gl_perf_monitor_counter gen7_statistics_counters[] = { + COUNTER64("IA_VERTICES_COUNT"), + COUNTER64("IA_PRIMITIVES_COUNT"), + COUNTER64("VS_INVOCATION_COUNT"), + COUNTER64("HS_INVOCATION_COUNT"), + COUNTER64("DS_INVOCATION_COUNT"), + COUNTER64("GS_INVOCATION_COUNT"), + COUNTER64("GS_PRIMITIVES_COUNT"), + COUNTER64("CL_INVOCATION_COUNT"), + COUNTER64("CL_PRIMITIVES_COUNT"), + COUNTER64("PS_INVOCATION_COUNT"), + COUNTER64("PS_DEPTH_COUNT"), + COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 0)"), + COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 1)"), + COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 2)"), + COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 3)"), + COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 0)"), + COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 1)"), + COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 2)"), + COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 3)"), +}; + +/** MMIO register addresses for each pipeline statistics counter. 
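+ *
+ * Indexed identically to gen7_statistics_counters, including one entry per
+ * stream for the SO_NUM_PRIMS_WRITTEN and SO_PRIM_STORAGE_NEEDED registers.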
*/ +static const int gen7_statistics_register_addresses[] = { + IA_VERTICES_COUNT, + IA_PRIMITIVES_COUNT, + VS_INVOCATION_COUNT, + HS_INVOCATION_COUNT, + DS_INVOCATION_COUNT, + GS_INVOCATION_COUNT, + GS_PRIMITIVES_COUNT, + CL_INVOCATION_COUNT, + CL_PRIMITIVES_COUNT, + PS_INVOCATION_COUNT, + PS_DEPTH_COUNT, + GEN7_SO_NUM_PRIMS_WRITTEN(0), + GEN7_SO_NUM_PRIMS_WRITTEN(1), + GEN7_SO_NUM_PRIMS_WRITTEN(2), + GEN7_SO_NUM_PRIMS_WRITTEN(3), + GEN7_SO_PRIM_STORAGE_NEEDED(0), + GEN7_SO_PRIM_STORAGE_NEEDED(1), + GEN7_SO_PRIM_STORAGE_NEEDED(2), + GEN7_SO_PRIM_STORAGE_NEEDED(3), +}; + +static const struct gl_perf_monitor_group gen7_groups[] = { + GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters), + GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters), }; /** @} */ @@ -117,26 +525,532 @@ static GLboolean brw_is_perf_monitor_result_available(struct gl_context *, struc static void dump_perf_monitor_callback(GLuint name, void *monitor_void, void *brw_void) { + struct brw_context *brw = brw_void; struct gl_context *ctx = brw_void; struct gl_perf_monitor_object *m = monitor_void; + struct brw_perf_monitor_object *monitor = monitor_void; - DBG("%4d %-7s %-6s %-11s\n", + const char *resolved = ""; + for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { + if (brw->perfmon.unresolved[i] == monitor) { + resolved = "Unresolved"; + break; + } + } + + DBG("%4d %-7s %-6s %-10s %-11s <%3d, %3d, %3d> %-6s %-9s\n", name, m->Active ? "Active" : "", m->Ended ? "Ended" : "", - brw_is_perf_monitor_result_available(ctx, m) ? "Available" : ""); + resolved, + brw_is_perf_monitor_result_available(ctx, m) ? "Available" : "", + monitor->oa_head_end, + monitor->oa_middle_start, + monitor->oa_tail_start, + monitor->oa_bo ? "OA BO" : "", + monitor->pipeline_stats_bo ? "Stats BO" : ""); } void brw_dump_perf_monitors(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; - DBG("Monitors:\n"); + DBG("Monitors: (OA users = %d)\n", brw->perfmon.oa_users); _mesa_HashWalk(ctx->PerfMonitor.Monitors, dump_perf_monitor_callback, brw); } /******************************************************************************/ +static bool +monitor_needs_statistics_registers(struct brw_context *brw, + struct gl_perf_monitor_object *m) +{ + return brw->gen >= 6 && m->ActiveGroups[PIPELINE_STATS_COUNTERS]; +} + +/** + * Take a snapshot of any monitored pipeline statistics counters. + */ +static void +snapshot_statistics_registers(struct brw_context *brw, + struct brw_perf_monitor_object *monitor, + uint32_t offset) +{ + struct gl_context *ctx = &brw->ctx; + const int group = PIPELINE_STATS_COUNTERS; + const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters; + + brw_emit_mi_flush(brw); + + for (int i = 0; i < num_counters; i++) { + if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) { + assert(ctx->PerfMonitor.Groups[group].Counters[i].Type == + GL_UNSIGNED_INT64_AMD); + + brw_store_register_mem64(brw, monitor->pipeline_stats_bo, + brw->perfmon.statistics_registers[i], + offset + i * sizeof(uint64_t)); + } + } +} + +/** + * Gather results from pipeline_stats_bo, storing the final values. + * + * This allows us to free pipeline_stats_bo (which is 4K) in favor of a much + * smaller array of final results. 
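+ *
+ * The starting snapshots live at offset 0 and the ending snapshots at
+ * SECOND_SNAPSHOT_OFFSET_IN_BYTES; each final value is simply end - start.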
+ */ +static void +gather_statistics_results(struct brw_context *brw, + struct brw_perf_monitor_object *monitor) +{ + struct gl_context *ctx = &brw->ctx; + const int num_counters = + ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters; + + monitor->pipeline_stats_results = calloc(num_counters, sizeof(uint64_t)); + if (monitor->pipeline_stats_results == NULL) { + _mesa_error_no_memory(__func__); + return; + } + + drm_intel_bo_map(monitor->pipeline_stats_bo, false); + uint64_t *start = monitor->pipeline_stats_bo->virtual; + uint64_t *end = start + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint64_t)); + + for (int i = 0; i < num_counters; i++) { + monitor->pipeline_stats_results[i] = end[i] - start[i]; + } + drm_intel_bo_unmap(monitor->pipeline_stats_bo); + drm_intel_bo_unreference(monitor->pipeline_stats_bo); + monitor->pipeline_stats_bo = NULL; +} + +/******************************************************************************/ + +static bool +monitor_needs_oa(struct brw_context *brw, + struct gl_perf_monitor_object *m) +{ + return m->ActiveGroups[OA_COUNTERS]; +} + +/** + * Enable the Observability Architecture counters by whacking OACONTROL. + */ +static void +start_oa_counters(struct brw_context *brw) +{ + unsigned counter_format; + + /* Pick the counter format which gives us all the counters. */ + switch (brw->gen) { + case 5: + return; /* Ironlake counters are always running. */ + case 6: + counter_format = 0b001; + break; + case 7: + counter_format = 0b101; + break; + default: + unreachable("Tried to enable OA counters on an unsupported generation."); + } + + BEGIN_BATCH(3); + OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); + OUT_BATCH(OACONTROL); + OUT_BATCH(counter_format << OACONTROL_COUNTER_SELECT_SHIFT | + OACONTROL_ENABLE_COUNTERS); + ADVANCE_BATCH(); +} + +/** + * Disable OA counters. + */ +static void +stop_oa_counters(struct brw_context *brw) +{ + /* Ironlake counters never stop. */ + if (brw->gen == 5) + return; + + BEGIN_BATCH(3); + OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); + OUT_BATCH(OACONTROL); + OUT_BATCH(0); + ADVANCE_BATCH(); +} + +/** + * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot, + * including the required PIPE_CONTROL flushes. + * + * Sandybridge is the worst case scenario: brw_emit_mi_flush expands to four + * PIPE_CONTROLs which are 5 DWords each. We have to flush before and after + * MI_REPORT_PERF_COUNT, so multiply by two. Finally, add the 3 DWords for + * MI_REPORT_PERF_COUNT itself. + */ +#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (4 * 5) + 3) + +/** + * Emit an MI_REPORT_PERF_COUNT command packet. + * + * This writes the current OA counter values to buffer. + */ +static void +emit_mi_report_perf_count(struct brw_context *brw, + drm_intel_bo *bo, + uint32_t offset_in_bytes, + uint32_t report_id) +{ + assert(offset_in_bytes % 64 == 0); + + /* Make sure the commands to take a snapshot fits in a single batch. */ + intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4, + RENDER_RING); + int batch_used = USED_BATCH(brw->batch); + + /* Reports apparently don't always get written unless we flush first. */ + brw_emit_mi_flush(brw); + + if (brw->gen == 5) { + /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all + * the counters. The report ID is ignored in the second set. 
+ */ + BEGIN_BATCH(6); + OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0); + OUT_RELOC(bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + offset_in_bytes); + OUT_BATCH(report_id); + + OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1); + OUT_RELOC(bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + offset_in_bytes + 64); + OUT_BATCH(report_id); + ADVANCE_BATCH(); + } else if (brw->gen == 6) { + BEGIN_BATCH(3); + OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT); + OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + offset_in_bytes | MI_COUNTER_ADDRESS_GTT); + OUT_BATCH(report_id); + ADVANCE_BATCH(); + } else if (brw->gen == 7) { + BEGIN_BATCH(3); + OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT); + OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + offset_in_bytes); + OUT_BATCH(report_id); + ADVANCE_BATCH(); + } else { + unreachable("Unsupported generation for performance counters."); + } + + /* Reports apparently don't always get written unless we flush after. */ + brw_emit_mi_flush(brw); + + (void) batch_used; + assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4); +} + +/** + * Add a monitor to the global list of "unresolved monitors." + * + * Monitors are "unresolved" if they refer to OA counter snapshots in + * bookend_bo. Results (even partial ones) must be gathered for all + * unresolved monitors before it's safe to discard bookend_bo. + */ +static void +add_to_unresolved_monitor_list(struct brw_context *brw, + struct brw_perf_monitor_object *monitor) +{ + if (brw->perfmon.unresolved_elements >= + brw->perfmon.unresolved_array_size) { + brw->perfmon.unresolved_array_size *= 2; + brw->perfmon.unresolved = reralloc(brw, brw->perfmon.unresolved, + struct brw_perf_monitor_object *, + brw->perfmon.unresolved_array_size); + } + + brw->perfmon.unresolved[brw->perfmon.unresolved_elements++] = monitor; +} + +/** + * If possible, throw away the contents of bookend BO. + * + * When all monitoring stops, and no monitors need data from bookend_bo to + * compute results, we can discard it and start writing snapshots at the + * beginning again. This helps reduce the amount of buffer wraparound. + */ +static void +clean_bookend_bo(struct brw_context *brw) +{ + if (brw->perfmon.unresolved_elements == 0) { + DBG("***Resetting bookend snapshots to 0\n"); + brw->perfmon.bookend_snapshots = 0; + } +} + +/** + * Remove a monitor from the global list of "unresolved monitors." + * + * This can happen when: + * - We finish computing a completed monitor's results. + * - We discard unwanted monitor results. + * - A monitor's results can be computed without relying on bookend_bo. + */ +static void +drop_from_unresolved_monitor_list(struct brw_context *brw, + struct brw_perf_monitor_object *monitor) +{ + for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { + if (brw->perfmon.unresolved[i] == monitor) { + int last_elt = --brw->perfmon.unresolved_elements; + + if (i == last_elt) { + brw->perfmon.unresolved[i] = NULL; + } else { + brw->perfmon.unresolved[i] = brw->perfmon.unresolved[last_elt]; + } + + clean_bookend_bo(brw); + return; + } + } +} + +/** + * Given pointers to starting and ending OA snapshots, add the deltas for each + * counter to the results. + */ +static void +add_deltas(struct brw_context *brw, + struct brw_perf_monitor_object *monitor, + uint32_t *start, uint32_t *end) +{ + /* Look for expected report ID values to ensure data is present. 
*/ + assert(start[0] == REPORT_ID); + assert(end[0] == REPORT_ID); + + /* Subtract each counter's ending and starting values, then add the + * difference to the counter's value so far. + */ + for (int i = 3; i < brw->perfmon.entries_per_oa_snapshot; i++) { + /* When debugging, it's useful to note when the ending value is less than + * the starting value; aggregating counters should always increase in + * value (or remain unchanged). This happens periodically due to + * wraparound, but can also indicate serious problems. + */ +#ifdef DEBUG + if (end[i] < start[i]) { + int counter = brw->perfmon.oa_snapshot_layout[i]; + if (counter >= 0) { + DBG("WARNING: \"%s\" ending value was less than the starting " + "value: %u < %u (end - start = %u)\n", + brw->ctx.PerfMonitor.Groups[0].Counters[counter].Name, + end[i], start[i], end[i] - start[i]); + } + } +#endif + monitor->oa_results[i] += end[i] - start[i]; + } +} + +/** + * Gather OA counter results (partial or full) from a series of snapshots. + * + * Monitoring can start or stop at any time, likely at some point mid-batch. + * We write snapshots for both events, storing them in monitor->oa_bo. + * + * Ideally, we would simply subtract those two snapshots to obtain the final + * counter results. Unfortunately, our hardware doesn't preserve their values + * across context switches or GPU sleep states. In order to support multiple + * concurrent OA clients, as well as reliable data across power management, + * we have to take snapshots at the start and end of batches as well. + * + * This results in a three-part sequence of (start, end) intervals: + * - The "head" is from the BeginPerfMonitor snapshot to the end of the first + * batchbuffer. + * - The "middle" is a series of (batch start, batch end) snapshots which + * bookend any batchbuffers between the ones which start/end monitoring. + * - The "tail" is from the start of the last batch where monitoring was + * active to the EndPerfMonitor snapshot. + * + * Due to wrapping in the bookend BO, we may have to accumulate partial results. + * If so, we handle the "head" and any "middle" results so far. When monitoring + * eventually ends, we handle additional "middle" batches and the "tail." + */ +static void +gather_oa_results(struct brw_context *brw, + struct brw_perf_monitor_object *monitor, + uint32_t *bookend_buffer) +{ + struct gl_perf_monitor_object *m = &monitor->base; + assert(monitor->oa_bo != NULL); + + drm_intel_bo_map(monitor->oa_bo, false); + uint32_t *monitor_buffer = monitor->oa_bo->virtual; + + /* If monitoring was entirely contained within a single batch, then the + * bookend BO is irrelevant. Just subtract monitor->bo's two snapshots. + */ + if (monitor->oa_middle_start == -1) { + add_deltas(brw, monitor, + monitor_buffer, + monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / + sizeof(uint32_t))); + drm_intel_bo_unmap(monitor->oa_bo); + return; + } + + const ptrdiff_t snapshot_size = brw->perfmon.entries_per_oa_snapshot; + + /* First, add the contributions from the "head" interval: + * (snapshot taken at BeginPerfMonitor time, + * snapshot taken at the end of the first batch after monitoring began) + */ + if (monitor->oa_head_end != -1) { + assert(monitor->oa_head_end < brw->perfmon.bookend_snapshots); + add_deltas(brw, monitor, + monitor_buffer, + bookend_buffer + snapshot_size * monitor->oa_head_end); + + /* Make sure we don't count the "head" again in the future. */ + monitor->oa_head_end = -1; + } + + /* Next, count the contributions from the "middle" batches. 
These are + * (batch begin, batch end) deltas while monitoring was active. + */ + int last_snapshot; + if (m->Ended) + last_snapshot = monitor->oa_tail_start; + else + last_snapshot = brw->perfmon.bookend_snapshots; + + for (int s = monitor->oa_middle_start; s < last_snapshot; s += 2) { + add_deltas(brw, monitor, + bookend_buffer + snapshot_size * s, + bookend_buffer + snapshot_size * (s + 1)); + } + + /* Finally, if the monitor has ended, we need to count the contributions of + * the "tail" interval: + * (start of the batch where monitoring ended, EndPerfMonitor snapshot) + */ + if (m->Ended) { + assert(monitor->oa_tail_start != -1); + add_deltas(brw, monitor, + bookend_buffer + snapshot_size * monitor->oa_tail_start, + monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / + sizeof(uint32_t))); + } + + drm_intel_bo_unmap(monitor->oa_bo); + + /* If the monitor has ended, then we've gathered all the results, and + * can free the monitor's OA BO. + */ + if (m->Ended) { + drm_intel_bo_unreference(monitor->oa_bo); + monitor->oa_bo = NULL; + + /* The monitor's OA result is now resolved. */ + DBG("Marking %d resolved - results gathered\n", m->Name); + drop_from_unresolved_monitor_list(brw, monitor); + } +} + +/** + * Handle running out of space in the bookend BO. + * + * When we run out of space in the bookend BO, we need to gather up partial + * results for every unresolved monitor. This allows us to free the snapshot + * data in bookend_bo, freeing up the space for reuse. We call this "wrapping." + * + * This will completely compute the result for any unresolved monitors that + * have ended. + */ +static void +wrap_bookend_bo(struct brw_context *brw) +{ + DBG("****Wrap bookend BO****\n"); + /* Note that wrapping will only occur at the start of a batch, since that's + * where we reserve space. So the current batch won't reference bookend_bo + * or any monitor BOs. This means we don't need to worry about + * synchronization. + * + * Also, EndPerfMonitor guarantees that only monitors which span multiple + * batches exist in the unresolved monitor list. + */ + assert(brw->perfmon.oa_users > 0); + + drm_intel_bo_map(brw->perfmon.bookend_bo, false); + uint32_t *bookend_buffer = brw->perfmon.bookend_bo->virtual; + for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { + struct brw_perf_monitor_object *monitor = brw->perfmon.unresolved[i]; + struct gl_perf_monitor_object *m = &monitor->base; + + gather_oa_results(brw, monitor, bookend_buffer); + + if (m->Ended) { + /* gather_oa_results() dropped the monitor from the unresolved list, + * throwing our indices off by one. + */ + --i; + } else { + /* When we create the new bookend_bo, snapshot #0 will be the + * beginning of another "middle" BO. + */ + monitor->oa_middle_start = 0; + assert(monitor->oa_head_end == -1); + assert(monitor->oa_tail_start == -1); + } + } + drm_intel_bo_unmap(brw->perfmon.bookend_bo); + + brw->perfmon.bookend_snapshots = 0; +} + +/* This is fairly arbitrary; the trade off is memory usage vs. extra overhead + * from wrapping. On Gen7, 32768 should be enough for 128 snapshots before + * wrapping (since each is 256 bytes). + */ +#define BOOKEND_BO_SIZE_BYTES 32768 + +/** + * Check whether bookend_bo has space for a given number of snapshots. + */ +static bool +has_space_for_bookend_snapshots(struct brw_context *brw, int snapshots) +{ + int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t); + + /* There are brw->perfmon.bookend_snapshots - 1 existing snapshots. 
*/ + int total_snapshots = (brw->perfmon.bookend_snapshots - 1) + snapshots; + + return total_snapshots * snapshot_bytes < BOOKEND_BO_SIZE_BYTES; +} + +/** + * Write an OA counter snapshot to bookend_bo. + */ +static void +emit_bookend_snapshot(struct brw_context *brw) +{ + int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t); + int offset_in_bytes = brw->perfmon.bookend_snapshots * snapshot_bytes; + + emit_mi_report_perf_count(brw, brw->perfmon.bookend_bo, offset_in_bytes, + REPORT_ID); + ++brw->perfmon.bookend_snapshots; +} + +/******************************************************************************/ + /** * Initialize a monitor to sane starting state; throw away old buffers. */ @@ -144,6 +1058,30 @@ static void reinitialize_perf_monitor(struct brw_context *brw, struct brw_perf_monitor_object *monitor) { + if (monitor->oa_bo) { + drm_intel_bo_unreference(monitor->oa_bo); + monitor->oa_bo = NULL; + } + + /* Since the results are now invalid, we don't need to hold on to any + * snapshots in bookend_bo. The monitor is effectively "resolved." + */ + drop_from_unresolved_monitor_list(brw, monitor); + + monitor->oa_head_end = -1; + monitor->oa_middle_start = -1; + monitor->oa_tail_start = -1; + + free(monitor->oa_results); + monitor->oa_results = NULL; + + if (monitor->pipeline_stats_bo) { + drm_intel_bo_unreference(monitor->pipeline_stats_bo); + monitor->pipeline_stats_bo = NULL; + } + + free(monitor->pipeline_stats_results); + monitor->pipeline_stats_results = NULL; } /** @@ -160,6 +1098,59 @@ brw_begin_perf_monitor(struct gl_context *ctx, reinitialize_perf_monitor(brw, monitor); + if (monitor_needs_oa(brw, m)) { + /* If the global OA bookend BO doesn't exist, allocate it. This should + * only happen once, but we delay until BeginPerfMonitor time to avoid + * wasting memory for contexts that don't use performance monitors. + */ + if (!brw->perfmon.bookend_bo) { + brw->perfmon.bookend_bo = drm_intel_bo_alloc(brw->bufmgr, + "OA bookend BO", + BOOKEND_BO_SIZE_BYTES, 64); + } + + monitor->oa_bo = + drm_intel_bo_alloc(brw->bufmgr, "perf. monitor OA bo", 4096, 64); +#ifdef DEBUG + /* Pre-filling the BO helps debug whether writes landed. */ + drm_intel_bo_map(monitor->oa_bo, true); + memset((char *) monitor->oa_bo->virtual, 0xff, 4096); + drm_intel_bo_unmap(monitor->oa_bo); +#endif + + /* Allocate storage for accumulated OA counter values. */ + monitor->oa_results = + calloc(brw->perfmon.entries_per_oa_snapshot, sizeof(uint32_t)); + + /* If the OA counters aren't already on, enable them. */ + if (brw->perfmon.oa_users == 0) { + /* Ensure the OACONTROL enable and snapshot land in the same batch. */ + int space = (MI_REPORT_PERF_COUNT_BATCH_DWORDS + 3) * 4; + intel_batchbuffer_require_space(brw, space, RENDER_RING); + start_oa_counters(brw); + } + + /* Take a starting OA counter snapshot. */ + emit_mi_report_perf_count(brw, monitor->oa_bo, 0, REPORT_ID); + + monitor->oa_head_end = brw->perfmon.bookend_snapshots; + monitor->oa_middle_start = brw->perfmon.bookend_snapshots + 1; + monitor->oa_tail_start = -1; + + /* Add the monitor to the unresolved list. */ + add_to_unresolved_monitor_list(brw, monitor); + + ++brw->perfmon.oa_users; + } + + if (monitor_needs_statistics_registers(brw, m)) { + monitor->pipeline_stats_bo = + drm_intel_bo_alloc(brw->bufmgr, "perf. monitor stats bo", 4096, 64); + + /* Take starting snapshots. 
*/ + snapshot_statistics_registers(brw, monitor, 0); + } + return true; } @@ -170,7 +1161,49 @@ static void brw_end_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m) { + struct brw_context *brw = brw_context(ctx); + struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + DBG("End(%d)\n", m->Name); + + if (monitor_needs_oa(brw, m)) { + /* Take an ending OA counter snapshot. */ + emit_mi_report_perf_count(brw, monitor->oa_bo, + SECOND_SNAPSHOT_OFFSET_IN_BYTES, REPORT_ID); + + --brw->perfmon.oa_users; + + if (brw->perfmon.oa_users == 0) + stop_oa_counters(brw); + + if (monitor->oa_head_end == brw->perfmon.bookend_snapshots) { + assert(monitor->oa_head_end != -1); + /* We never actually wrote the snapshot for the end of the first batch + * after BeginPerfMonitor. This means that monitoring was contained + * entirely within a single batch, so we can ignore bookend_bo and + * just compare the monitor's begin/end snapshots directly. + */ + monitor->oa_head_end = -1; + monitor->oa_middle_start = -1; + monitor->oa_tail_start = -1; + + /* We can also mark it resolved since it won't depend on bookend_bo. */ + DBG("Marking %d resolved - entirely in one batch\n", m->Name); + drop_from_unresolved_monitor_list(brw, monitor); + } else { + /* We've written at least one batch end snapshot, so the monitoring + * spanned multiple batches. Mark which snapshot corresponds to the + * start of the current batch. + */ + monitor->oa_tail_start = brw->perfmon.bookend_snapshots - 1; + } + } + + if (monitor_needs_statistics_registers(brw, m)) { + /* Take ending snapshots. */ + snapshot_statistics_registers(brw, monitor, + SECOND_SNAPSHOT_OFFSET_IN_BYTES); + } } /** @@ -197,9 +1230,25 @@ static GLboolean brw_is_perf_monitor_result_available(struct gl_context *ctx, struct gl_perf_monitor_object *m) { - /* ...need to actually check if counters are available, once we have some. */ + struct brw_context *brw = brw_context(ctx); + struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); - return true; + bool oa_available = true; + bool stats_available = true; + + if (monitor_needs_oa(brw, m)) { + oa_available = !monitor->oa_bo || + (!drm_intel_bo_references(brw->batch.bo, monitor->oa_bo) && + !drm_intel_bo_busy(monitor->oa_bo)); + } + + if (monitor_needs_statistics_registers(brw, m)) { + stats_available = !monitor->pipeline_stats_bo || + (!drm_intel_bo_references(brw->batch.bo, monitor->pipeline_stats_bo) && + !drm_intel_bo_busy(monitor->pipeline_stats_bo)); + } + + return oa_available && stats_available; } /** @@ -212,7 +1261,12 @@ brw_get_perf_monitor_result(struct gl_context *ctx, GLuint *data, GLint *bytes_written) { + struct brw_context *brw = brw_context(ctx); + struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + const GLuint *const data_end = (GLuint *)((uint8_t *) data + data_size); + DBG("GetResult(%d)\n", m->Name); + brw_dump_perf_monitors(brw); /* This hook should only be called when results are available. */ assert(m->Ended); @@ -224,7 +1278,74 @@ brw_get_perf_monitor_result(struct gl_context *ctx, */ GLsizei offset = 0; - /* ...but, we don't actually expose anything yet, so nothing to do here */ + if (monitor_needs_oa(brw, m)) { + /* Gather up the results from the BO, unless we already did due to the + * bookend BO wrapping. + */ + if (monitor->oa_bo) { + /* Since the result is available, all the necessary snapshots will + * have been written to the bookend BO. 
If other monitors are + * active, the bookend BO may be busy or referenced by the current + * batch, but only for writing snapshots beyond oa_tail_start, + * which we don't care about. + * + * Using an unsynchronized mapping avoids stalling for an + * indeterminate amount of time. + */ + drm_intel_gem_bo_map_unsynchronized(brw->perfmon.bookend_bo); + + gather_oa_results(brw, monitor, brw->perfmon.bookend_bo->virtual); + + drm_intel_bo_unmap(brw->perfmon.bookend_bo); + } + + for (int i = 0; i < brw->perfmon.entries_per_oa_snapshot; i++) { + int group = OA_COUNTERS; + int counter = brw->perfmon.oa_snapshot_layout[i]; + + /* We always capture all the OA counters, but the application may + * have only asked for a subset. Skip unwanted counters. + */ + if (counter < 0 || !BITSET_TEST(m->ActiveCounters[group], counter)) + continue; + + if (data + offset + 3 <= data_end) { + data[offset++] = group; + data[offset++] = counter; + data[offset++] = monitor->oa_results[i]; + } + } + + clean_bookend_bo(brw); + } + + if (monitor_needs_statistics_registers(brw, m)) { + const int num_counters = + ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters; + + if (!monitor->pipeline_stats_results) { + gather_statistics_results(brw, monitor); + + /* Check if we did really get the results */ + if (!monitor->pipeline_stats_results) { + if (bytes_written) { + *bytes_written = 0; + } + return; + } + } + + for (int i = 0; i < num_counters; i++) { + if (BITSET_TEST(m->ActiveCounters[PIPELINE_STATS_COUNTERS], i)) { + if (data + offset + 4 <= data_end) { + data[offset++] = PIPELINE_STATS_COUNTERS; + data[offset++] = i; + *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i]; + offset += 2; + } + } + } + } if (bytes_written) *bytes_written = offset * sizeof(uint32_t); @@ -236,6 +1357,7 @@ brw_get_perf_monitor_result(struct gl_context *ctx, static struct gl_perf_monitor_object * brw_new_perf_monitor(struct gl_context *ctx) { + (void) ctx; return calloc(1, sizeof(struct brw_perf_monitor_object)); } @@ -253,6 +1375,63 @@ brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m /******************************************************************************/ +/** + * Called at the start of every render ring batch. + * + * Enable OA counters and emit the "start of batchbuffer" bookend OA snapshot. + * Since it's a new batch, there will be plenty of space for the commands. + */ +void +brw_perf_monitor_new_batch(struct brw_context *brw) +{ + assert(brw->batch.ring == RENDER_RING); + assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0); + + if (brw->perfmon.oa_users == 0) + return; + + start_oa_counters(brw); + + /* Make sure bookend_bo has enough space for a pair of snapshots. + * If not, "wrap" the BO: gather up any results so far, and start from + * the beginning of the buffer. Reserving a pair guarantees that wrapping + * will only happen at the beginning of a batch, where it's safe to map BOs + * (as the batch is empty and can't refer to any of them yet). + */ + if (!has_space_for_bookend_snapshots(brw, 2)) + wrap_bookend_bo(brw); + + DBG("Bookend Begin Snapshot (%d)\n", brw->perfmon.bookend_snapshots); + emit_bookend_snapshot(brw); +} + +/** + * Called at the end of every render ring batch. + * + * Emit the "end of batchbuffer" bookend OA snapshot and disable the counters. + * + * This relies on there being enough space in BATCH_RESERVED. 
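+ * Space in bookend_bo itself was guaranteed by brw_perf_monitor_new_batch(),
+ * which reserved room for both the begin and end snapshots, so the assert
+ * below should never fire.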
+ */ +void +brw_perf_monitor_finish_batch(struct brw_context *brw) +{ + assert(brw->batch.ring == RENDER_RING); + + if (brw->perfmon.oa_users == 0) + return; + + DBG("Bookend End Snapshot (%d)\n", brw->perfmon.bookend_snapshots); + + /* Not safe to wrap; should've reserved space already. */ + assert(has_space_for_bookend_snapshots(brw, 1)); + + emit_bookend_snapshot(brw); + + stop_oa_counters(brw); +} + +/******************************************************************************/ + void brw_init_performance_monitors(struct brw_context *brw) { @@ -269,11 +1448,24 @@ brw_init_performance_monitors(struct brw_context *brw) if (brw->gen == 5) { ctx->PerfMonitor.Groups = gen5_groups; ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups); + brw->perfmon.oa_snapshot_layout = gen5_oa_snapshot_layout; + brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen5_oa_snapshot_layout); } else if (brw->gen == 6) { ctx->PerfMonitor.Groups = gen6_groups; ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen6_groups); + brw->perfmon.oa_snapshot_layout = gen6_oa_snapshot_layout; + brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen6_oa_snapshot_layout); + brw->perfmon.statistics_registers = gen6_statistics_register_addresses; } else if (brw->gen == 7) { ctx->PerfMonitor.Groups = gen7_groups; ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen7_groups); + brw->perfmon.oa_snapshot_layout = gen7_oa_snapshot_layout; + brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout); + brw->perfmon.statistics_registers = gen7_statistics_register_addresses; } + + brw->perfmon.unresolved = + ralloc_array(brw, struct brw_perf_monitor_object *, 1); + brw->perfmon.unresolved_elements = 0; + brw->perfmon.unresolved_array_size = 1; }