X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_performance_monitor.c;h=f8e50e10fa3a611c4bc6d9944870044513a18e18;hb=0e657b7b55bc7c83c8eb5258cd9522b0e5e581b7;hp=e5d214e968861feb74d1647249bdfdc0fefe2e27;hpb=093ecbfe3b8339ef846f46c2a04d32856273a2d7;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c index e5d214e9688..f8e50e10fa3 100644 --- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c +++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c @@ -44,13 +44,13 @@ #include -#include "main/bitset.h" +#include "util/bitset.h" #include "main/hash.h" #include "main/macros.h" #include "main/mtypes.h" #include "main/performance_monitor.h" -#include "glsl/ralloc.h" +#include "util/ralloc.h" #include "brw_context.h" #include "brw_defines.h" @@ -66,6 +66,26 @@ struct brw_perf_monitor_object /** The base class. */ struct gl_perf_monitor_object base; + /** + * BO containing OA counter snapshots at monitor Begin/End time. + */ + drm_intel_bo *oa_bo; + + /** Indexes into bookend_bo (snapshot numbers) for various segments. */ + int oa_head_end; + int oa_middle_start; + int oa_tail_start; + + /** + * Storage for OA results accumulated so far. + * + * An array indexed by the counter ID in the OA_COUNTERS group. + * + * When we run out of space in bookend_bo, we compute the results so far + * and add them to the value stored here. Then, we can discard bookend_bo. + */ + uint32_t *oa_results; + /** * BO containing starting and ending snapshots for any active pipeline * statistics counters. @@ -87,6 +107,9 @@ brw_perf_monitor(struct gl_perf_monitor_object *m) #define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048 +/* A random value used to ensure we're getting valid snapshots. */ +#define REPORT_ID 0xd2e9c607 + /******************************************************************************/ #define COUNTER(name) \ @@ -127,7 +150,7 @@ enum brw_counter_groups { * documentation, but is available by reading the source code for the * intel_perf_counters utility (shipped as part of intel-gpu-tools). */ -const static struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = { +static const struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = { COUNTER("cycles the CS unit is starved"), COUNTER("cycles the CS unit is stalled"), COUNTER("cycles the VF unit is starved"), @@ -159,7 +182,7 @@ const static struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = { COUNTER("cycles any EU is stalled for math"), }; -const static int gen5_oa_snapshot_layout[] = +static const int gen5_oa_snapshot_layout[] = { -1, /* Report ID */ -1, /* TIMESTAMP (64-bit) */ @@ -195,7 +218,7 @@ const static int gen5_oa_snapshot_layout[] = 28, /* cycles any EU is stalled for math */ }; -const static struct gl_perf_monitor_group gen5_groups[] = { +static const struct gl_perf_monitor_group gen5_groups[] = { [OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters), /* Our pipeline statistics counter handling requires hardware contexts. 
*/ }; @@ -214,7 +237,7 @@ const static struct gl_perf_monitor_group gen5_groups[] = { /** * Aggregating counters A0-A28: */ -const static struct gl_perf_monitor_counter gen6_raw_oa_counters[] = { +static const struct gl_perf_monitor_counter gen6_raw_oa_counters[] = { /* A0: 0 */ COUNTER("Aggregated Core Array Active"), /* A1: 1 */ COUNTER("Aggregated Core Array Stalled"), /* A2: 2 */ COUNTER("Vertex Shader Active Time"), @@ -255,7 +278,7 @@ const static struct gl_perf_monitor_counter gen6_raw_oa_counters[] = { * * (Yes, this is a strange order.) We also have to remap for missing counters. */ -const static int gen6_oa_snapshot_layout[] = +static const int gen6_oa_snapshot_layout[] = { -1, /* Report ID */ -1, /* TIMESTAMP (64-bit) */ @@ -291,7 +314,7 @@ const static int gen6_oa_snapshot_layout[] = 18, /* A21: Pixel Kill Count */ }; -const static struct gl_perf_monitor_counter gen6_statistics_counters[] = { +static const struct gl_perf_monitor_counter gen6_statistics_counters[] = { COUNTER64("IA_VERTICES_COUNT"), COUNTER64("IA_PRIMITIVES_COUNT"), COUNTER64("VS_INVOCATION_COUNT"), @@ -306,7 +329,7 @@ const static struct gl_perf_monitor_counter gen6_statistics_counters[] = { }; /** MMIO register addresses for each pipeline statistics counter. */ -const static int gen6_statistics_register_addresses[] = { +static const int gen6_statistics_register_addresses[] = { IA_VERTICES_COUNT, IA_PRIMITIVES_COUNT, VS_INVOCATION_COUNT, @@ -320,7 +343,7 @@ const static int gen6_statistics_register_addresses[] = { GEN6_SO_PRIM_STORAGE_NEEDED, }; -const static struct gl_perf_monitor_group gen6_groups[] = { +static const struct gl_perf_monitor_group gen6_groups[] = { GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters), GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters), }; @@ -330,7 +353,7 @@ const static struct gl_perf_monitor_group gen6_groups[] = { * Ivybridge/Baytrail/Haswell: * @{ */ -const static struct gl_perf_monitor_counter gen7_raw_oa_counters[] = { +static const struct gl_perf_monitor_counter gen7_raw_oa_counters[] = { COUNTER("Aggregated Core Array Active"), COUNTER("Aggregated Core Array Stalled"), COUNTER("Vertex Shader Active Time"), @@ -376,7 +399,7 @@ const static struct gl_perf_monitor_counter gen7_raw_oa_counters[] = { * B7 B6 B5 B4 B3 B2 B1 B0 * Rsv Rsv Rsv Rsv Rsv Rsv Rsv Rsv */ -const static int gen7_oa_snapshot_layout[] = +static const int gen7_oa_snapshot_layout[] = { -1, /* Report ID */ -1, /* TIMESTAMP (64-bit) */ @@ -444,7 +467,7 @@ const static int gen7_oa_snapshot_layout[] = -1, /* Reserved */ }; -const static struct gl_perf_monitor_counter gen7_statistics_counters[] = { +static const struct gl_perf_monitor_counter gen7_statistics_counters[] = { COUNTER64("IA_VERTICES_COUNT"), COUNTER64("IA_PRIMITIVES_COUNT"), COUNTER64("VS_INVOCATION_COUNT"), @@ -467,7 +490,7 @@ const static struct gl_perf_monitor_counter gen7_statistics_counters[] = { }; /** MMIO register addresses for each pipeline statistics counter. 
*/ -const static int gen7_statistics_register_addresses[] = { +static const int gen7_statistics_register_addresses[] = { IA_VERTICES_COUNT, IA_PRIMITIVES_COUNT, VS_INVOCATION_COUNT, @@ -489,7 +512,7 @@ const static int gen7_statistics_register_addresses[] = { GEN7_SO_PRIM_STORAGE_NEEDED(3), }; -const static struct gl_perf_monitor_group gen7_groups[] = { +static const struct gl_perf_monitor_group gen7_groups[] = { GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters), GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters), }; @@ -502,15 +525,29 @@ static GLboolean brw_is_perf_monitor_result_available(struct gl_context *, struc static void dump_perf_monitor_callback(GLuint name, void *monitor_void, void *brw_void) { + struct brw_context *brw = brw_void; struct gl_context *ctx = brw_void; struct gl_perf_monitor_object *m = monitor_void; struct brw_perf_monitor_object *monitor = monitor_void; - DBG("%4d %-7s %-6s %-11s %-9s\n", + const char *resolved = ""; + for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { + if (brw->perfmon.unresolved[i] == monitor) { + resolved = "Unresolved"; + break; + } + } + + DBG("%4d %-7s %-6s %-10s %-11s <%3d, %3d, %3d> %-6s %-9s\n", name, m->Active ? "Active" : "", m->Ended ? "Ended" : "", + resolved, brw_is_perf_monitor_result_available(ctx, m) ? "Available" : "", + monitor->oa_head_end, + monitor->oa_middle_start, + monitor->oa_tail_start, + monitor->oa_bo ? "OA BO" : "", monitor->pipeline_stats_bo ? "Stats BO" : ""); } @@ -537,14 +574,13 @@ monitor_needs_statistics_registers(struct brw_context *brw, static void snapshot_statistics_registers(struct brw_context *brw, struct brw_perf_monitor_object *monitor, - uint32_t offset_in_bytes) + uint32_t offset) { struct gl_context *ctx = &brw->ctx; - const int offset = offset_in_bytes / sizeof(uint64_t); const int group = PIPELINE_STATS_COUNTERS; const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters; - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); for (int i = 0; i < num_counters; i++) { if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) { @@ -553,7 +589,7 @@ snapshot_statistics_registers(struct brw_context *brw, brw_store_register_mem64(brw, monitor->pipeline_stats_bo, brw->perfmon.statistics_registers[i], - offset + i); + offset + i * sizeof(uint64_t)); } } } @@ -573,6 +609,10 @@ gather_statistics_results(struct brw_context *brw, ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters; monitor->pipeline_stats_results = calloc(num_counters, sizeof(uint64_t)); + if (monitor->pipeline_stats_results == NULL) { + _mesa_error_no_memory(__func__); + return; + } drm_intel_bo_map(monitor->pipeline_stats_bo, false); uint64_t *start = monitor->pipeline_stats_bo->virtual; @@ -595,16 +635,63 @@ monitor_needs_oa(struct brw_context *brw, return m->ActiveGroups[OA_COUNTERS]; } +/** + * Enable the Observability Architecture counters by whacking OACONTROL. + */ +static void +start_oa_counters(struct brw_context *brw) +{ + unsigned counter_format; + + /* Pick the counter format which gives us all the counters. */ + switch (brw->gen) { + case 5: + return; /* Ironlake counters are always running. 
*/ + case 6: + counter_format = 0b001; + break; + case 7: + counter_format = 0b101; + break; + default: + unreachable("Tried to enable OA counters on an unsupported generation."); + } + + BEGIN_BATCH(3); + OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); + OUT_BATCH(OACONTROL); + OUT_BATCH(counter_format << OACONTROL_COUNTER_SELECT_SHIFT | + OACONTROL_ENABLE_COUNTERS); + ADVANCE_BATCH(); +} + +/** + * Disable OA counters. + */ +static void +stop_oa_counters(struct brw_context *brw) +{ + /* Ironlake counters never stop. */ + if (brw->gen == 5) + return; + + BEGIN_BATCH(3); + OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); + OUT_BATCH(OACONTROL); + OUT_BATCH(0); + ADVANCE_BATCH(); +} + /** * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot, * including the required PIPE_CONTROL flushes. * - * Sandybridge is the worst case scenario: intel_batchbuffer_emit_mi_flush - * expands to three PIPE_CONTROLs which are 4 DWords each. We have to flush - * before and after MI_REPORT_PERF_COUNT, so multiply by two. Finally, add - * the 3 DWords for MI_REPORT_PERF_COUNT itself. + * Sandybridge is the worst case scenario: brw_emit_mi_flush expands to four + * PIPE_CONTROLs which are 5 DWords each. We have to flush before and after + * MI_REPORT_PERF_COUNT, so multiply by two. Finally, add the 3 DWords for + * MI_REPORT_PERF_COUNT itself. */ -#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (3 * 4) + 3) +#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (4 * 5) + 3) /** * Emit an MI_REPORT_PERF_COUNT command packet. @@ -622,10 +709,10 @@ emit_mi_report_perf_count(struct brw_context *brw, /* Make sure the commands to take a snapshot fits in a single batch. */ intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4, RENDER_RING); - int batch_used = brw->batch.used; + int batch_used = USED_BATCH(brw->batch); /* Reports apparently don't always get written unless we flush first. */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); if (brw->gen == 5) { /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all @@ -659,14 +746,307 @@ emit_mi_report_perf_count(struct brw_context *brw, OUT_BATCH(report_id); ADVANCE_BATCH(); } else { - assert(!"Unsupported generation for performance counters."); + unreachable("Unsupported generation for performance counters."); } /* Reports apparently don't always get written unless we flush after. */ - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); (void) batch_used; - assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4); + assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4); +} + +/** + * Add a monitor to the global list of "unresolved monitors." + * + * Monitors are "unresolved" if they refer to OA counter snapshots in + * bookend_bo. Results (even partial ones) must be gathered for all + * unresolved monitors before it's safe to discard bookend_bo. + */ +static void +add_to_unresolved_monitor_list(struct brw_context *brw, + struct brw_perf_monitor_object *monitor) +{ + if (brw->perfmon.unresolved_elements >= + brw->perfmon.unresolved_array_size) { + brw->perfmon.unresolved_array_size *= 2; + brw->perfmon.unresolved = reralloc(brw, brw->perfmon.unresolved, + struct brw_perf_monitor_object *, + brw->perfmon.unresolved_array_size); + } + + brw->perfmon.unresolved[brw->perfmon.unresolved_elements++] = monitor; +} + +/** + * If possible, throw away the contents of bookend BO. 
+ * + * When all monitoring stops, and no monitors need data from bookend_bo to + * compute results, we can discard it and start writing snapshots at the + * beginning again. This helps reduce the amount of buffer wraparound. + */ +static void +clean_bookend_bo(struct brw_context *brw) +{ + if (brw->perfmon.unresolved_elements == 0) { + DBG("***Resetting bookend snapshots to 0\n"); + brw->perfmon.bookend_snapshots = 0; + } +} + +/** + * Remove a monitor from the global list of "unresolved monitors." + * + * This can happen when: + * - We finish computing a completed monitor's results. + * - We discard unwanted monitor results. + * - A monitor's results can be computed without relying on bookend_bo. + */ +static void +drop_from_unresolved_monitor_list(struct brw_context *brw, + struct brw_perf_monitor_object *monitor) +{ + for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { + if (brw->perfmon.unresolved[i] == monitor) { + int last_elt = --brw->perfmon.unresolved_elements; + + if (i == last_elt) { + brw->perfmon.unresolved[i] = NULL; + } else { + brw->perfmon.unresolved[i] = brw->perfmon.unresolved[last_elt]; + } + + clean_bookend_bo(brw); + return; + } + } +} + +/** + * Given pointers to starting and ending OA snapshots, add the deltas for each + * counter to the results. + */ +static void +add_deltas(struct brw_context *brw, + struct brw_perf_monitor_object *monitor, + uint32_t *start, uint32_t *end) +{ + /* Look for expected report ID values to ensure data is present. */ + assert(start[0] == REPORT_ID); + assert(end[0] == REPORT_ID); + + /* Subtract each counter's ending and starting values, then add the + * difference to the counter's value so far. + */ + for (int i = 3; i < brw->perfmon.entries_per_oa_snapshot; i++) { + /* When debugging, it's useful to note when the ending value is less than + * the starting value; aggregating counters should always increase in + * value (or remain unchanged). This happens periodically due to + * wraparound, but can also indicate serious problems. + */ +#ifdef DEBUG + if (end[i] < start[i]) { + int counter = brw->perfmon.oa_snapshot_layout[i]; + if (counter >= 0) { + DBG("WARNING: \"%s\" ending value was less than the starting " + "value: %u < %u (end - start = %u)\n", + brw->ctx.PerfMonitor.Groups[0].Counters[counter].Name, + end[i], start[i], end[i] - start[i]); + } + } +#endif + monitor->oa_results[i] += end[i] - start[i]; + } +} + +/** + * Gather OA counter results (partial or full) from a series of snapshots. + * + * Monitoring can start or stop at any time, likely at some point mid-batch. + * We write snapshots for both events, storing them in monitor->oa_bo. + * + * Ideally, we would simply subtract those two snapshots to obtain the final + * counter results. Unfortunately, our hardware doesn't preserve their values + * across context switches or GPU sleep states. In order to support multiple + * concurrent OA clients, as well as reliable data across power management, + * we have to take snapshots at the start and end of batches as well. + * + * This results in a three-part sequence of (start, end) intervals: + * - The "head" is from the BeginPerfMonitor snapshot to the end of the first + * batchbuffer. + * - The "middle" is a series of (batch start, batch end) snapshots which + * bookend any batchbuffers between the ones which start/end monitoring. + * - The "tail" is from the start of the last batch where monitoring was + * active to the EndPerfMonitor snapshot. 
+ * + * Due to wrapping in the bookend BO, we may have to accumulate partial results. + * If so, we handle the "head" and any "middle" results so far. When monitoring + * eventually ends, we handle additional "middle" batches and the "tail." + */ +static void +gather_oa_results(struct brw_context *brw, + struct brw_perf_monitor_object *monitor, + uint32_t *bookend_buffer) +{ + struct gl_perf_monitor_object *m = &monitor->base; + assert(monitor->oa_bo != NULL); + + drm_intel_bo_map(monitor->oa_bo, false); + uint32_t *monitor_buffer = monitor->oa_bo->virtual; + + /* If monitoring was entirely contained within a single batch, then the + * bookend BO is irrelevant. Just subtract monitor->bo's two snapshots. + */ + if (monitor->oa_middle_start == -1) { + add_deltas(brw, monitor, + monitor_buffer, + monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / + sizeof(uint32_t))); + drm_intel_bo_unmap(monitor->oa_bo); + return; + } + + const ptrdiff_t snapshot_size = brw->perfmon.entries_per_oa_snapshot; + + /* First, add the contributions from the "head" interval: + * (snapshot taken at BeginPerfMonitor time, + * snapshot taken at the end of the first batch after monitoring began) + */ + if (monitor->oa_head_end != -1) { + assert(monitor->oa_head_end < brw->perfmon.bookend_snapshots); + add_deltas(brw, monitor, + monitor_buffer, + bookend_buffer + snapshot_size * monitor->oa_head_end); + + /* Make sure we don't count the "head" again in the future. */ + monitor->oa_head_end = -1; + } + + /* Next, count the contributions from the "middle" batches. These are + * (batch begin, batch end) deltas while monitoring was active. + */ + int last_snapshot; + if (m->Ended) + last_snapshot = monitor->oa_tail_start; + else + last_snapshot = brw->perfmon.bookend_snapshots; + + for (int s = monitor->oa_middle_start; s < last_snapshot; s += 2) { + add_deltas(brw, monitor, + bookend_buffer + snapshot_size * s, + bookend_buffer + snapshot_size * (s + 1)); + } + + /* Finally, if the monitor has ended, we need to count the contributions of + * the "tail" interval: + * (start of the batch where monitoring ended, EndPerfMonitor snapshot) + */ + if (m->Ended) { + assert(monitor->oa_tail_start != -1); + add_deltas(brw, monitor, + bookend_buffer + snapshot_size * monitor->oa_tail_start, + monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / + sizeof(uint32_t))); + } + + drm_intel_bo_unmap(monitor->oa_bo); + + /* If the monitor has ended, then we've gathered all the results, and + * can free the monitor's OA BO. + */ + if (m->Ended) { + drm_intel_bo_unreference(monitor->oa_bo); + monitor->oa_bo = NULL; + + /* The monitor's OA result is now resolved. */ + DBG("Marking %d resolved - results gathered\n", m->Name); + drop_from_unresolved_monitor_list(brw, monitor); + } +} + +/** + * Handle running out of space in the bookend BO. + * + * When we run out of space in the bookend BO, we need to gather up partial + * results for every unresolved monitor. This allows us to free the snapshot + * data in bookend_bo, freeing up the space for reuse. We call this "wrapping." + * + * This will completely compute the result for any unresolved monitors that + * have ended. + */ +static void +wrap_bookend_bo(struct brw_context *brw) +{ + DBG("****Wrap bookend BO****\n"); + /* Note that wrapping will only occur at the start of a batch, since that's + * where we reserve space. So the current batch won't reference bookend_bo + * or any monitor BOs. This means we don't need to worry about + * synchronization. 
+ * + * Also, EndPerfMonitor guarantees that only monitors which span multiple + * batches exist in the unresolved monitor list. + */ + assert(brw->perfmon.oa_users > 0); + + drm_intel_bo_map(brw->perfmon.bookend_bo, false); + uint32_t *bookend_buffer = brw->perfmon.bookend_bo->virtual; + for (int i = 0; i < brw->perfmon.unresolved_elements; i++) { + struct brw_perf_monitor_object *monitor = brw->perfmon.unresolved[i]; + struct gl_perf_monitor_object *m = &monitor->base; + + gather_oa_results(brw, monitor, bookend_buffer); + + if (m->Ended) { + /* gather_oa_results() dropped the monitor from the unresolved list, + * throwing our indices off by one. + */ + --i; + } else { + /* When we create the new bookend_bo, snapshot #0 will be the + * beginning of another "middle" BO. + */ + monitor->oa_middle_start = 0; + assert(monitor->oa_head_end == -1); + assert(monitor->oa_tail_start == -1); + } + } + drm_intel_bo_unmap(brw->perfmon.bookend_bo); + + brw->perfmon.bookend_snapshots = 0; +} + +/* This is fairly arbitrary; the trade off is memory usage vs. extra overhead + * from wrapping. On Gen7, 32768 should be enough for 128 snapshots before + * wrapping (since each is 256 bytes). + */ +#define BOOKEND_BO_SIZE_BYTES 32768 + +/** + * Check whether bookend_bo has space for a given number of snapshots. + */ +static bool +has_space_for_bookend_snapshots(struct brw_context *brw, int snapshots) +{ + int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t); + + /* There are brw->perfmon.bookend_snapshots - 1 existing snapshots. */ + int total_snapshots = (brw->perfmon.bookend_snapshots - 1) + snapshots; + + return total_snapshots * snapshot_bytes < BOOKEND_BO_SIZE_BYTES; +} + +/** + * Write an OA counter snapshot to bookend_bo. + */ +static void +emit_bookend_snapshot(struct brw_context *brw) +{ + int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t); + int offset_in_bytes = brw->perfmon.bookend_snapshots * snapshot_bytes; + + emit_mi_report_perf_count(brw, brw->perfmon.bookend_bo, offset_in_bytes, + REPORT_ID); + ++brw->perfmon.bookend_snapshots; } /******************************************************************************/ @@ -678,6 +1058,23 @@ static void reinitialize_perf_monitor(struct brw_context *brw, struct brw_perf_monitor_object *monitor) { + if (monitor->oa_bo) { + drm_intel_bo_unreference(monitor->oa_bo); + monitor->oa_bo = NULL; + } + + /* Since the results are now invalid, we don't need to hold on to any + * snapshots in bookend_bo. The monitor is effectively "resolved." + */ + drop_from_unresolved_monitor_list(brw, monitor); + + monitor->oa_head_end = -1; + monitor->oa_middle_start = -1; + monitor->oa_tail_start = -1; + + free(monitor->oa_results); + monitor->oa_results = NULL; + if (monitor->pipeline_stats_bo) { drm_intel_bo_unreference(monitor->pipeline_stats_bo); monitor->pipeline_stats_bo = NULL; @@ -702,6 +1099,47 @@ brw_begin_perf_monitor(struct gl_context *ctx, reinitialize_perf_monitor(brw, monitor); if (monitor_needs_oa(brw, m)) { + /* If the global OA bookend BO doesn't exist, allocate it. This should + * only happen once, but we delay until BeginPerfMonitor time to avoid + * wasting memory for contexts that don't use performance monitors. + */ + if (!brw->perfmon.bookend_bo) { + brw->perfmon.bookend_bo = drm_intel_bo_alloc(brw->bufmgr, + "OA bookend BO", + BOOKEND_BO_SIZE_BYTES, 64); + } + + monitor->oa_bo = + drm_intel_bo_alloc(brw->bufmgr, "perf. 
monitor OA bo", 4096, 64); +#ifdef DEBUG + /* Pre-filling the BO helps debug whether writes landed. */ + drm_intel_bo_map(monitor->oa_bo, true); + memset((char *) monitor->oa_bo->virtual, 0xff, 4096); + drm_intel_bo_unmap(monitor->oa_bo); +#endif + + /* Allocate storage for accumulated OA counter values. */ + monitor->oa_results = + calloc(brw->perfmon.entries_per_oa_snapshot, sizeof(uint32_t)); + + /* If the OA counters aren't already on, enable them. */ + if (brw->perfmon.oa_users == 0) { + /* Ensure the OACONTROL enable and snapshot land in the same batch. */ + int space = (MI_REPORT_PERF_COUNT_BATCH_DWORDS + 3) * 4; + intel_batchbuffer_require_space(brw, space, RENDER_RING); + start_oa_counters(brw); + } + + /* Take a starting OA counter snapshot. */ + emit_mi_report_perf_count(brw, monitor->oa_bo, 0, REPORT_ID); + + monitor->oa_head_end = brw->perfmon.bookend_snapshots; + monitor->oa_middle_start = brw->perfmon.bookend_snapshots + 1; + monitor->oa_tail_start = -1; + + /* Add the monitor to the unresolved list. */ + add_to_unresolved_monitor_list(brw, monitor); + ++brw->perfmon.oa_users; } @@ -729,7 +1167,36 @@ brw_end_perf_monitor(struct gl_context *ctx, DBG("End(%d)\n", m->Name); if (monitor_needs_oa(brw, m)) { + /* Take an ending OA counter snapshot. */ + emit_mi_report_perf_count(brw, monitor->oa_bo, + SECOND_SNAPSHOT_OFFSET_IN_BYTES, REPORT_ID); + --brw->perfmon.oa_users; + + if (brw->perfmon.oa_users == 0) + stop_oa_counters(brw); + + if (monitor->oa_head_end == brw->perfmon.bookend_snapshots) { + assert(monitor->oa_head_end != -1); + /* We never actually wrote the snapshot for the end of the first batch + * after BeginPerfMonitor. This means that monitoring was contained + * entirely within a single batch, so we can ignore bookend_bo and + * just compare the monitor's begin/end snapshots directly. + */ + monitor->oa_head_end = -1; + monitor->oa_middle_start = -1; + monitor->oa_tail_start = -1; + + /* We can also mark it resolved since it won't depend on bookend_bo. */ + DBG("Marking %d resolved - entirely in one batch\n", m->Name); + drop_from_unresolved_monitor_list(brw, monitor); + } else { + /* We've written at least one batch end snapshot, so the monitoring + * spanned multiple batches. Mark which snapshot corresponds to the + * start of the current batch. 
+ */ + monitor->oa_tail_start = brw->perfmon.bookend_snapshots - 1; + } } if (monitor_needs_statistics_registers(brw, m)) { @@ -766,15 +1233,22 @@ brw_is_perf_monitor_result_available(struct gl_context *ctx, struct brw_context *brw = brw_context(ctx); struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + bool oa_available = true; bool stats_available = true; + if (monitor_needs_oa(brw, m)) { + oa_available = !monitor->oa_bo || + (!drm_intel_bo_references(brw->batch.bo, monitor->oa_bo) && + !drm_intel_bo_busy(monitor->oa_bo)); + } + if (monitor_needs_statistics_registers(brw, m)) { stats_available = !monitor->pipeline_stats_bo || (!drm_intel_bo_references(brw->batch.bo, monitor->pipeline_stats_bo) && !drm_intel_bo_busy(monitor->pipeline_stats_bo)); } - return stats_available; + return oa_available && stats_available; } /** @@ -789,6 +1263,7 @@ brw_get_perf_monitor_result(struct gl_context *ctx, { struct brw_context *brw = brw_context(ctx); struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + const GLuint *const data_end = (GLuint *)((uint8_t *) data + data_size); DBG("GetResult(%d)\n", m->Name); brw_dump_perf_monitors(brw); @@ -803,19 +1278,71 @@ brw_get_perf_monitor_result(struct gl_context *ctx, */ GLsizei offset = 0; + if (monitor_needs_oa(brw, m)) { + /* Gather up the results from the BO, unless we already did due to the + * bookend BO wrapping. + */ + if (monitor->oa_bo) { + /* Since the result is available, all the necessary snapshots will + * have been written to the bookend BO. If other monitors are + * active, the bookend BO may be busy or referenced by the current + * batch, but only for writing snapshots beyond oa_tail_start, + * which we don't care about. + * + * Using an unsynchronized mapping avoids stalling for an + * indeterminate amount of time. + */ + drm_intel_gem_bo_map_unsynchronized(brw->perfmon.bookend_bo); + + gather_oa_results(brw, monitor, brw->perfmon.bookend_bo->virtual); + + drm_intel_bo_unmap(brw->perfmon.bookend_bo); + } + + for (int i = 0; i < brw->perfmon.entries_per_oa_snapshot; i++) { + int group = OA_COUNTERS; + int counter = brw->perfmon.oa_snapshot_layout[i]; + + /* We always capture all the OA counters, but the application may + * have only asked for a subset. Skip unwanted counters. 
+ */ + if (counter < 0 || !BITSET_TEST(m->ActiveCounters[group], counter)) + continue; + + if (data + offset + 3 <= data_end) { + data[offset++] = group; + data[offset++] = counter; + data[offset++] = monitor->oa_results[i]; + } + } + + clean_bookend_bo(brw); + } + if (monitor_needs_statistics_registers(brw, m)) { const int num_counters = ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters; - if (!monitor->pipeline_stats_results) + if (!monitor->pipeline_stats_results) { gather_statistics_results(brw, monitor); + /* Check if we did really get the results */ + if (!monitor->pipeline_stats_results) { + if (bytes_written) { + *bytes_written = 0; + } + return; + } + } + for (int i = 0; i < num_counters; i++) { if (BITSET_TEST(m->ActiveCounters[PIPELINE_STATS_COUNTERS], i)) { - data[offset++] = PIPELINE_STATS_COUNTERS; - data[offset++] = i; - *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i]; - offset += 2; + if (data + offset + 4 <= data_end) { + data[offset++] = PIPELINE_STATS_COUNTERS; + data[offset++] = i; + *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i]; + offset += 2; + } } } } @@ -830,6 +1357,7 @@ brw_get_perf_monitor_result(struct gl_context *ctx, static struct gl_perf_monitor_object * brw_new_perf_monitor(struct gl_context *ctx) { + (void) ctx; return calloc(1, sizeof(struct brw_perf_monitor_object)); } @@ -847,6 +1375,63 @@ brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m /******************************************************************************/ +/** + * Called at the start of every render ring batch. + * + * Enable OA counters and emit the "start of batchbuffer" bookend OA snapshot. + * Since it's a new batch, there will be plenty of space for the commands. + */ +void +brw_perf_monitor_new_batch(struct brw_context *brw) +{ + assert(brw->batch.ring == RENDER_RING); + assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0); + + if (brw->perfmon.oa_users == 0) + return; + + start_oa_counters(brw); + + /* Make sure bookend_bo has enough space for a pair of snapshots. + * If not, "wrap" the BO: gather up any results so far, and start from + * the beginning of the buffer. Reserving a pair guarantees that wrapping + * will only happen at the beginning of a batch, where it's safe to map BOs + * (as the batch is empty and can't refer to any of them yet). + */ + if (!has_space_for_bookend_snapshots(brw, 2)) + wrap_bookend_bo(brw); + + DBG("Bookend Begin Snapshot (%d)\n", brw->perfmon.bookend_snapshots); + emit_bookend_snapshot(brw); +} + +/** + * Called at the end of every render ring batch. + * + * Emit the "end of batchbuffer" bookend OA snapshot and disable the counters. + * + * This relies on there being enough space in BATCH_RESERVED. + */ +void +brw_perf_monitor_finish_batch(struct brw_context *brw) +{ + assert(brw->batch.ring == RENDER_RING); + + if (brw->perfmon.oa_users == 0) + return; + + DBG("Bookend End Snapshot (%d)\n", brw->perfmon.bookend_snapshots); + + /* Not safe to wrap; should've reserved space already. 
*/ + assert(has_space_for_bookend_snapshots(brw, 1)); + + emit_bookend_snapshot(brw); + + stop_oa_counters(brw); +} + +/******************************************************************************/ + void brw_init_performance_monitors(struct brw_context *brw) { @@ -878,4 +1463,9 @@ brw_init_performance_monitors(struct brw_context *brw) brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout); brw->perfmon.statistics_registers = gen7_statistics_register_addresses; } + + brw->perfmon.unresolved = + ralloc_array(brw, struct brw_perf_monitor_object *, 1); + brw->perfmon.unresolved_elements = 0; + brw->perfmon.unresolved_array_size = 1; }
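
For reference, below is a minimal client-side sketch (not part of the patch above) showing how the (group, counter, value) stream written by brw_get_perf_monitor_result() is consumed through the GL_AMD_performance_monitor API that this driver implements. It assumes a current GL context exposing the extension, that its entry points have been resolved (e.g. via GL_GLEXT_PROTOTYPES on Mesa), and that the monitor has already been configured with glSelectPerfMonitorCountersAMD() and bracketed by glBeginPerfMonitorAMD()/glEndPerfMonitorAMD(); read_monitor_results() is a hypothetical helper name, and the buffer size is arbitrary.

/* Client-side sketch only -- not part of this patch. */
#define GL_GLEXT_PROTOTYPES
#include <GL/gl.h>
#include <GL/glext.h>
#include <stdint.h>
#include <string.h>

static void
read_monitor_results(GLuint monitor)   /* hypothetical helper */
{
   /* brw_is_perf_monitor_result_available() backs this query. */
   GLuint available = 0;
   glGetPerfMonitorCounterDataAMD(monitor, GL_PERF_MONITOR_RESULT_AVAILABLE_AMD,
                                  sizeof(available), &available, NULL);
   if (!available)
      return;

   /* Ask how large the result stream is, then clamp to our scratch buffer. */
   GLuint result_size = 0;
   glGetPerfMonitorCounterDataAMD(monitor, GL_PERF_MONITOR_RESULT_SIZE_AMD,
                                  sizeof(result_size), &result_size, NULL);

   GLuint data[1024];
   GLsizei data_size = result_size < sizeof(data) ? result_size : sizeof(data);

   GLint bytes_written = 0;
   glGetPerfMonitorCounterDataAMD(monitor, GL_PERF_MONITOR_RESULT_AMD,
                                  data_size, data, &bytes_written);

   /* Walk the (group, counter, value) triples emitted by the driver. */
   GLint i = 0;
   while ((i + 3) * (GLint) sizeof(GLuint) <= bytes_written) {
      GLuint group = data[i++];
      GLuint counter = data[i++];

      GLenum type = 0;
      glGetPerfMonitorCounterInfoAMD(group, counter, GL_COUNTER_TYPE_AMD, &type);

      if (type == GL_UNSIGNED_INT64_AMD) {
         /* Pipeline statistics registers: two GLuints per value. */
         uint64_t value;
         memcpy(&value, &data[i], sizeof(value));
         i += 2;
         /* ... consume the 64-bit statistics value ... */
         (void) value;
      } else {
         /* OA counters in this driver are plain 32-bit GL_UNSIGNED_INT. */
         uint32_t value = data[i++];
         /* ... consume the 32-bit OA value ... */
         (void) value;
      }
   }
}

The two value widths mirror the driver's storage: OA results are accumulated in the 32-bit oa_results array, while pipeline statistics snapshots are read back from MMIO as 64-bit values, so a client must check GL_COUNTER_TYPE_AMD before advancing through the stream.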