#include <limits.h>
-#include "main/bitset.h"
+#include "util/bitset.h"
#include "main/hash.h"
#include "main/macros.h"
#include "main/mtypes.h"
#include "main/performance_monitor.h"
-#include "glsl/ralloc.h"
+#include "util/ralloc.h"
#include "brw_context.h"
#include "brw_defines.h"
*/
drm_intel_bo *oa_bo;
+ /** Indexes into bookend_bo (snapshot numbers) for various segments. */
+ int oa_head_end;
+ int oa_middle_start;
+ int oa_tail_start;
+
+ /**
+ * Storage for OA results accumulated so far.
+ *
+ * An array indexed by the counter ID in the OA_COUNTERS group.
+ *
+ * When we run out of space in bookend_bo, we compute the results so far
+ * and add them to the value stored here. Then, we can discard bookend_bo.
+ */
+ uint32_t *oa_results;
+
/**
* BO containing starting and ending snapshots for any active pipeline
* statistics counters.
* documentation, but is available by reading the source code for the
* intel_perf_counters utility (shipped as part of intel-gpu-tools).
*/
-const static struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = {
+static const struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = {
COUNTER("cycles the CS unit is starved"),
COUNTER("cycles the CS unit is stalled"),
COUNTER("cycles the VF unit is starved"),
COUNTER("cycles any EU is stalled for math"),
};
-const static int gen5_oa_snapshot_layout[] =
+static const int gen5_oa_snapshot_layout[] =
{
-1, /* Report ID */
-1, /* TIMESTAMP (64-bit) */
28, /* cycles any EU is stalled for math */
};
-const static struct gl_perf_monitor_group gen5_groups[] = {
+static const struct gl_perf_monitor_group gen5_groups[] = {
[OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters),
/* Our pipeline statistics counter handling requires hardware contexts. */
};
/**
* Aggregating counters A0-A28:
*/
-const static struct gl_perf_monitor_counter gen6_raw_oa_counters[] = {
+static const struct gl_perf_monitor_counter gen6_raw_oa_counters[] = {
/* A0: 0 */ COUNTER("Aggregated Core Array Active"),
/* A1: 1 */ COUNTER("Aggregated Core Array Stalled"),
/* A2: 2 */ COUNTER("Vertex Shader Active Time"),
*
* (Yes, this is a strange order.) We also have to remap for missing counters.
*/
-const static int gen6_oa_snapshot_layout[] =
+static const int gen6_oa_snapshot_layout[] =
{
-1, /* Report ID */
-1, /* TIMESTAMP (64-bit) */
18, /* A21: Pixel Kill Count */
};
-const static struct gl_perf_monitor_counter gen6_statistics_counters[] = {
+static const struct gl_perf_monitor_counter gen6_statistics_counters[] = {
COUNTER64("IA_VERTICES_COUNT"),
COUNTER64("IA_PRIMITIVES_COUNT"),
COUNTER64("VS_INVOCATION_COUNT"),
};
/** MMIO register addresses for each pipeline statistics counter. */
-const static int gen6_statistics_register_addresses[] = {
+static const int gen6_statistics_register_addresses[] = {
IA_VERTICES_COUNT,
IA_PRIMITIVES_COUNT,
VS_INVOCATION_COUNT,
GEN6_SO_PRIM_STORAGE_NEEDED,
};
-const static struct gl_perf_monitor_group gen6_groups[] = {
+static const struct gl_perf_monitor_group gen6_groups[] = {
GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters),
GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters),
};
* Ivybridge/Baytrail/Haswell:
* @{
*/
-const static struct gl_perf_monitor_counter gen7_raw_oa_counters[] = {
+static const struct gl_perf_monitor_counter gen7_raw_oa_counters[] = {
COUNTER("Aggregated Core Array Active"),
COUNTER("Aggregated Core Array Stalled"),
COUNTER("Vertex Shader Active Time"),
* B7 B6 B5 B4 B3 B2 B1 B0
* Rsv Rsv Rsv Rsv Rsv Rsv Rsv Rsv
*/
-const static int gen7_oa_snapshot_layout[] =
+static const int gen7_oa_snapshot_layout[] =
{
-1, /* Report ID */
-1, /* TIMESTAMP (64-bit) */
-1, /* Reserved */
};
-const static struct gl_perf_monitor_counter gen7_statistics_counters[] = {
+static const struct gl_perf_monitor_counter gen7_statistics_counters[] = {
COUNTER64("IA_VERTICES_COUNT"),
COUNTER64("IA_PRIMITIVES_COUNT"),
COUNTER64("VS_INVOCATION_COUNT"),
};
/** MMIO register addresses for each pipeline statistics counter. */
-const static int gen7_statistics_register_addresses[] = {
+static const int gen7_statistics_register_addresses[] = {
IA_VERTICES_COUNT,
IA_PRIMITIVES_COUNT,
VS_INVOCATION_COUNT,
GEN7_SO_PRIM_STORAGE_NEEDED(3),
};
-const static struct gl_perf_monitor_group gen7_groups[] = {
+static const struct gl_perf_monitor_group gen7_groups[] = {
GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters),
GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters),
};
static void
dump_perf_monitor_callback(GLuint name, void *monitor_void, void *brw_void)
{
+ struct brw_context *brw = brw_void;
struct gl_context *ctx = brw_void;
struct gl_perf_monitor_object *m = monitor_void;
struct brw_perf_monitor_object *monitor = monitor_void;
- DBG("%4d %-7s %-6s %-11s %-9s\n",
+ const char *resolved = "";
+ for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
+ if (brw->perfmon.unresolved[i] == monitor) {
+ resolved = "Unresolved";
+ break;
+ }
+ }
+
+ DBG("%4d %-7s %-6s %-10s %-11s <%3d, %3d, %3d> %-6s %-9s\n",
name,
m->Active ? "Active" : "",
m->Ended ? "Ended" : "",
+ resolved,
brw_is_perf_monitor_result_available(ctx, m) ? "Available" : "",
+ monitor->oa_head_end,
+ monitor->oa_middle_start,
+ monitor->oa_tail_start,
+ monitor->oa_bo ? "OA BO" : "",
monitor->pipeline_stats_bo ? "Stats BO" : "");
}
const int group = PIPELINE_STATS_COUNTERS;
const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters;
- intel_batchbuffer_emit_mi_flush(brw);
+ brw_emit_mi_flush(brw);
for (int i = 0; i < num_counters; i++) {
if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) {
ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;
monitor->pipeline_stats_results = calloc(num_counters, sizeof(uint64_t));
+ if (monitor->pipeline_stats_results == NULL) {
+ _mesa_error_no_memory(__func__);
+ return;
+ }
drm_intel_bo_map(monitor->pipeline_stats_bo, false);
uint64_t *start = monitor->pipeline_stats_bo->virtual;
/* Pick the counter format which gives us all the counters. */
switch (brw->gen) {
+ case 5:
+ return; /* Ironlake counters are always running. */
case 6:
- counter_format = 1; /* 0b001 */
+ counter_format = 0b001;
break;
case 7:
- counter_format = 5; /* 0b101 */
+ counter_format = 0b101;
break;
default:
- assert(!"Tried to enable OA counters on an unsupported generation.");
- return;
+ unreachable("Tried to enable OA counters on an unsupported generation.");
}
BEGIN_BATCH(3);
static void
stop_oa_counters(struct brw_context *brw)
{
+ /* Ironlake counters never stop. */
+ if (brw->gen == 5)
+ return;
+
BEGIN_BATCH(3);
OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
OUT_BATCH(OACONTROL);
* The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot,
* including the required PIPE_CONTROL flushes.
*
- * Sandybridge is the worst case scenario: intel_batchbuffer_emit_mi_flush
+ * Sandybridge is the worst case scenario: brw_emit_mi_flush
* expands to three PIPE_CONTROLs which are 4 DWords each. We have to flush
* before and after MI_REPORT_PERF_COUNT, so multiply by two. Finally, add
* the 3 DWords for MI_REPORT_PERF_COUNT itself.
/* Make sure the commands to take a snapshot fits in a single batch. */
intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
RENDER_RING);
- int batch_used = brw->batch.used;
+ int batch_used = USED_BATCH(brw->batch);
/* Reports apparently don't always get written unless we flush first. */
- intel_batchbuffer_emit_mi_flush(brw);
+ brw_emit_mi_flush(brw);
if (brw->gen == 5) {
/* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all
OUT_BATCH(report_id);
ADVANCE_BATCH();
} else {
- assert(!"Unsupported generation for performance counters.");
+ unreachable("Unsupported generation for performance counters.");
}
/* Reports apparently don't always get written unless we flush after. */
- intel_batchbuffer_emit_mi_flush(brw);
+ brw_emit_mi_flush(brw);
(void) batch_used;
- assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
+ assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
+}
+
+/**
+ * Add a monitor to the global list of "unresolved monitors."
+ *
+ * Monitors are "unresolved" if they refer to OA counter snapshots in
+ * bookend_bo. Results (even partial ones) must be gathered for all
+ * unresolved monitors before it's safe to discard bookend_bo.
+ */
+static void
+add_to_unresolved_monitor_list(struct brw_context *brw,
+ struct brw_perf_monitor_object *monitor)
+{
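+ /* Grow the array (by doubling its size) if it is full. */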
+ if (brw->perfmon.unresolved_elements >=
+ brw->perfmon.unresolved_array_size) {
+ brw->perfmon.unresolved_array_size *= 2;
+ brw->perfmon.unresolved = reralloc(brw, brw->perfmon.unresolved,
+ struct brw_perf_monitor_object *,
+ brw->perfmon.unresolved_array_size);
+ }
+
+ brw->perfmon.unresolved[brw->perfmon.unresolved_elements++] = monitor;
+}
+
+/**
+ * If possible, throw away the contents of bookend BO.
+ *
+ * When all monitoring stops, and no monitors need data from bookend_bo to
+ * compute results, we can discard it and start writing snapshots at the
+ * beginning again. This helps reduce the amount of buffer wraparound.
+ */
+static void
+clean_bookend_bo(struct brw_context *brw)
+{
+ if (brw->perfmon.unresolved_elements == 0) {
+ DBG("***Resetting bookend snapshots to 0\n");
+ brw->perfmon.bookend_snapshots = 0;
+ }
+}
+
+/**
+ * Remove a monitor from the global list of "unresolved monitors."
+ *
+ * This can happen when:
+ * - We finish computing a completed monitor's results.
+ * - We discard unwanted monitor results.
+ * - A monitor's results can be computed without relying on bookend_bo.
+ */
+static void
+drop_from_unresolved_monitor_list(struct brw_context *brw,
+ struct brw_perf_monitor_object *monitor)
+{
+ for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
+ if (brw->perfmon.unresolved[i] == monitor) {
+ int last_elt = --brw->perfmon.unresolved_elements;
+
+ if (i == last_elt) {
+ brw->perfmon.unresolved[i] = NULL;
+ } else {
+ brw->perfmon.unresolved[i] = brw->perfmon.unresolved[last_elt];
+ }
+
+ clean_bookend_bo(brw);
+ return;
+ }
+ }
+}
+
+/**
+ * Given pointers to starting and ending OA snapshots, add the deltas for each
+ * counter to the results.
+ */
+static void
+add_deltas(struct brw_context *brw,
+ struct brw_perf_monitor_object *monitor,
+ uint32_t *start, uint32_t *end)
+{
+ /* Look for expected report ID values to ensure data is present. */
+ assert(start[0] == REPORT_ID);
+ assert(end[0] == REPORT_ID);
+
+ /* Subtract each counter's ending and starting values, then add the
+ * difference to the counter's value so far.
+ */
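+ /* The first three DWords of each snapshot are the report ID and the
+ * 64-bit timestamp rather than counters, so counter data starts at
+ * index 3.
+ */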
+ for (int i = 3; i < brw->perfmon.entries_per_oa_snapshot; i++) {
+ /* When debugging, it's useful to note when the ending value is less than
+ * the starting value; aggregating counters should always increase in
+ * value (or remain unchanged). This happens periodically due to
+ * wraparound, but can also indicate serious problems.
+ */
+#ifdef DEBUG
+ if (end[i] < start[i]) {
+ int counter = brw->perfmon.oa_snapshot_layout[i];
+ if (counter >= 0) {
+ DBG("WARNING: \"%s\" ending value was less than the starting "
+ "value: %u < %u (end - start = %u)\n",
+ brw->ctx.PerfMonitor.Groups[0].Counters[counter].Name,
+ end[i], start[i], end[i] - start[i]);
+ }
+ }
+#endif
+ monitor->oa_results[i] += end[i] - start[i];
+ }
+}
+
+/**
+ * Gather OA counter results (partial or full) from a series of snapshots.
+ *
+ * Monitoring can start or stop at any time, likely at some point mid-batch.
+ * We write snapshots for both events, storing them in monitor->oa_bo.
+ *
+ * Ideally, we would simply subtract those two snapshots to obtain the final
+ * counter results. Unfortunately, our hardware doesn't preserve their values
+ * across context switches or GPU sleep states. In order to support multiple
+ * concurrent OA clients, as well as reliable data across power management,
+ * we have to take snapshots at the start and end of batches as well.
+ *
+ * This results in a three-part sequence of (start, end) intervals:
+ * - The "head" is from the BeginPerfMonitor snapshot to the end of the first
+ * batchbuffer.
+ * - The "middle" is a series of (batch start, batch end) snapshots which
+ * bookend any batchbuffers between the ones which start/end monitoring.
+ * - The "tail" is from the start of the last batch where monitoring was
+ * active to the EndPerfMonitor snapshot.
+ *
+ * Due to wrapping in the bookend BO, we may have to accumulate partial results.
+ * If so, we handle the "head" and any "middle" results so far. When monitoring
+ * eventually ends, we handle additional "middle" batches and the "tail."
+ */
+static void
+gather_oa_results(struct brw_context *brw,
+ struct brw_perf_monitor_object *monitor,
+ uint32_t *bookend_buffer)
+{
+ struct gl_perf_monitor_object *m = &monitor->base;
+ assert(monitor->oa_bo != NULL);
+
+ drm_intel_bo_map(monitor->oa_bo, false);
+ uint32_t *monitor_buffer = monitor->oa_bo->virtual;
+
+ /* If monitoring was entirely contained within a single batch, then the
+ * bookend BO is irrelevant. Just subtract monitor->oa_bo's two snapshots.
+ */
+ if (monitor->oa_middle_start == -1) {
+ add_deltas(brw, monitor,
+ monitor_buffer,
+ monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
+ sizeof(uint32_t)));
+ drm_intel_bo_unmap(monitor->oa_bo);
+ return;
+ }
+
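+ /* Snapshot stride in uint32_t entries (not bytes); bookend_buffer is a
+ * uint32_t pointer, so the pointer arithmetic below is in DWords.
+ */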
+ const ptrdiff_t snapshot_size = brw->perfmon.entries_per_oa_snapshot;
+
+ /* First, add the contributions from the "head" interval:
+ * (snapshot taken at BeginPerfMonitor time,
+ * snapshot taken at the end of the first batch after monitoring began)
+ */
+ if (monitor->oa_head_end != -1) {
+ assert(monitor->oa_head_end < brw->perfmon.bookend_snapshots);
+ add_deltas(brw, monitor,
+ monitor_buffer,
+ bookend_buffer + snapshot_size * monitor->oa_head_end);
+
+ /* Make sure we don't count the "head" again in the future. */
+ monitor->oa_head_end = -1;
+ }
+
+ /* Next, count the contributions from the "middle" batches. These are
+ * (batch begin, batch end) deltas while monitoring was active.
+ */
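+ /* If the monitor is still active, walk every snapshot written so far;
+ * otherwise, stop at the batch where monitoring ended (the "tail").
+ * Snapshots come in (batch begin, batch end) pairs, so step by two.
+ */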
+ int last_snapshot;
+ if (m->Ended)
+ last_snapshot = monitor->oa_tail_start;
+ else
+ last_snapshot = brw->perfmon.bookend_snapshots;
+
+ for (int s = monitor->oa_middle_start; s < last_snapshot; s += 2) {
+ add_deltas(brw, monitor,
+ bookend_buffer + snapshot_size * s,
+ bookend_buffer + snapshot_size * (s + 1));
+ }
+
+ /* Finally, if the monitor has ended, we need to count the contributions of
+ * the "tail" interval:
+ * (start of the batch where monitoring ended, EndPerfMonitor snapshot)
+ */
+ if (m->Ended) {
+ assert(monitor->oa_tail_start != -1);
+ add_deltas(brw, monitor,
+ bookend_buffer + snapshot_size * monitor->oa_tail_start,
+ monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
+ sizeof(uint32_t)));
+ }
+
+ drm_intel_bo_unmap(monitor->oa_bo);
+
+ /* If the monitor has ended, then we've gathered all the results, and
+ * can free the monitor's OA BO.
+ */
+ if (m->Ended) {
+ drm_intel_bo_unreference(monitor->oa_bo);
+ monitor->oa_bo = NULL;
+
+ /* The monitor's OA result is now resolved. */
+ DBG("Marking %d resolved - results gathered\n", m->Name);
+ drop_from_unresolved_monitor_list(brw, monitor);
+ }
+}
+
+/**
+ * Handle running out of space in the bookend BO.
+ *
+ * When we run out of space in the bookend BO, we need to gather up partial
+ * results for every unresolved monitor. This allows us to free the snapshot
+ * data in bookend_bo, freeing up the space for reuse. We call this "wrapping."
+ *
+ * This will completely compute the result for any unresolved monitors that
+ * have ended.
+ */
+static void
+wrap_bookend_bo(struct brw_context *brw)
+{
+ DBG("****Wrap bookend BO****\n");
+ /* Note that wrapping will only occur at the start of a batch, since that's
+ * where we reserve space. So the current batch won't reference bookend_bo
+ * or any monitor BOs. This means we don't need to worry about
+ * synchronization.
+ *
+ * Also, EndPerfMonitor guarantees that only monitors which span multiple
+ * batches exist in the unresolved monitor list.
+ */
+ assert(brw->perfmon.oa_users > 0);
+
+ drm_intel_bo_map(brw->perfmon.bookend_bo, false);
+ uint32_t *bookend_buffer = brw->perfmon.bookend_bo->virtual;
+ for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
+ struct brw_perf_monitor_object *monitor = brw->perfmon.unresolved[i];
+ struct gl_perf_monitor_object *m = &monitor->base;
+
+ gather_oa_results(brw, monitor, bookend_buffer);
+
+ if (m->Ended) {
+ /* gather_oa_results() dropped the monitor from the unresolved list,
+ * throwing our indices off by one.
+ */
+ --i;
+ } else {
+ /* After the wrap, snapshot #0 will be the beginning of another
+ * "middle" section.
+ */
+ monitor->oa_middle_start = 0;
+ assert(monitor->oa_head_end == -1);
+ assert(monitor->oa_tail_start == -1);
+ }
+ }
+ drm_intel_bo_unmap(brw->perfmon.bookend_bo);
+
+ brw->perfmon.bookend_snapshots = 0;
+}
+
+/* This is fairly arbitrary; the trade-off is memory usage vs. extra overhead
+ * from wrapping. On Gen7, 32768 should be enough for 128 snapshots before
+ * wrapping (since each is 256 bytes).
+ */
+#define BOOKEND_BO_SIZE_BYTES 32768
+
+/**
+ * Check whether bookend_bo has space for a given number of snapshots.
+ */
+static bool
+has_space_for_bookend_snapshots(struct brw_context *brw, int snapshots)
+{
+ int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t);
+
+ /* There are brw->perfmon.bookend_snapshots - 1 existing snapshots. */
+ int total_snapshots = (brw->perfmon.bookend_snapshots - 1) + snapshots;
+
+ return total_snapshots * snapshot_bytes < BOOKEND_BO_SIZE_BYTES;
+}
+
+/**
+ * Write an OA counter snapshot to bookend_bo.
+ */
+static void
+emit_bookend_snapshot(struct brw_context *brw)
+{
+ int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t);
+ int offset_in_bytes = brw->perfmon.bookend_snapshots * snapshot_bytes;
+
+ emit_mi_report_perf_count(brw, brw->perfmon.bookend_bo, offset_in_bytes,
+ REPORT_ID);
+ ++brw->perfmon.bookend_snapshots;
}
/******************************************************************************/
monitor->oa_bo = NULL;
}
+ /* Since the results are now invalid, we don't need to hold on to any
+ * snapshots in bookend_bo. The monitor is effectively "resolved."
+ */
+ drop_from_unresolved_monitor_list(brw, monitor);
+
+ monitor->oa_head_end = -1;
+ monitor->oa_middle_start = -1;
+ monitor->oa_tail_start = -1;
+
+ free(monitor->oa_results);
+ monitor->oa_results = NULL;
+
if (monitor->pipeline_stats_bo) {
drm_intel_bo_unreference(monitor->pipeline_stats_bo);
monitor->pipeline_stats_bo = NULL;
reinitialize_perf_monitor(brw, monitor);
if (monitor_needs_oa(brw, m)) {
+ /* If the global OA bookend BO doesn't exist, allocate it. This should
+ * only happen once, but we delay until BeginPerfMonitor time to avoid
+ * wasting memory for contexts that don't use performance monitors.
+ */
+ if (!brw->perfmon.bookend_bo) {
+ brw->perfmon.bookend_bo = drm_intel_bo_alloc(brw->bufmgr,
+ "OA bookend BO",
+ BOOKEND_BO_SIZE_BYTES, 64);
+ }
+
monitor->oa_bo =
drm_intel_bo_alloc(brw->bufmgr, "perf. monitor OA bo", 4096, 64);
#ifdef DEBUG
drm_intel_bo_unmap(monitor->oa_bo);
#endif
+ /* Allocate storage for accumulated OA counter values. */
+ monitor->oa_results =
+ calloc(brw->perfmon.entries_per_oa_snapshot, sizeof(uint32_t));
+
+ /* If the OA counters aren't already on, enable them. */
+ if (brw->perfmon.oa_users == 0) {
+ /* Ensure the OACONTROL enable and snapshot land in the same batch. */
+ int space = (MI_REPORT_PERF_COUNT_BATCH_DWORDS + 3) * 4;
+ intel_batchbuffer_require_space(brw, space, RENDER_RING);
+ start_oa_counters(brw);
+ }
+
/* Take a starting OA counter snapshot. */
emit_mi_report_perf_count(brw, monitor->oa_bo, 0, REPORT_ID);
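+ /* Record where this monitor's bookend snapshots will live: the snapshot
+ * written when the current batch ends will close the "head" interval,
+ * and the one after it begins the "middle" section.
+ */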
+ monitor->oa_head_end = brw->perfmon.bookend_snapshots;
+ monitor->oa_middle_start = brw->perfmon.bookend_snapshots + 1;
+ monitor->oa_tail_start = -1;
+
+ /* Add the monitor to the unresolved list. */
+ add_to_unresolved_monitor_list(brw, monitor);
+
++brw->perfmon.oa_users;
}
SECOND_SNAPSHOT_OFFSET_IN_BYTES, REPORT_ID);
--brw->perfmon.oa_users;
+
+ if (brw->perfmon.oa_users == 0)
+ stop_oa_counters(brw);
+
+ if (monitor->oa_head_end == brw->perfmon.bookend_snapshots) {
+ assert(monitor->oa_head_end != -1);
+ /* We never actually wrote the snapshot for the end of the first batch
+ * after BeginPerfMonitor. This means that monitoring was contained
+ * entirely within a single batch, so we can ignore bookend_bo and
+ * just compare the monitor's begin/end snapshots directly.
+ */
+ monitor->oa_head_end = -1;
+ monitor->oa_middle_start = -1;
+ monitor->oa_tail_start = -1;
+
+ /* We can also mark it resolved since it won't depend on bookend_bo. */
+ DBG("Marking %d resolved - entirely in one batch\n", m->Name);
+ drop_from_unresolved_monitor_list(brw, monitor);
+ } else {
+ /* We've written at least one batch end snapshot, so the monitoring
+ * spanned multiple batches. Mark which snapshot corresponds to the
+ * start of the current batch.
+ */
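+ /* brw_perf_monitor_new_batch() already wrote and counted the snapshot
+ * for the start of the current batch, hence the "- 1".
+ */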
+ monitor->oa_tail_start = brw->perfmon.bookend_snapshots - 1;
+ }
}
if (monitor_needs_statistics_registers(brw, m)) {
{
struct brw_context *brw = brw_context(ctx);
struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+ const GLuint *const data_end = (GLuint *)((uint8_t *) data + data_size);
DBG("GetResult(%d)\n", m->Name);
brw_dump_perf_monitors(brw);
*/
GLsizei offset = 0;
+ if (monitor_needs_oa(brw, m)) {
+ /* Gather up the results from the BO, unless we already did due to the
+ * bookend BO wrapping.
+ */
+ if (monitor->oa_bo) {
+ /* Since the result is available, all the necessary snapshots will
+ * have been written to the bookend BO. If other monitors are
+ * active, the bookend BO may be busy or referenced by the current
+ * batch, but only for writing snapshots beyond oa_tail_start,
+ * which we don't care about.
+ *
+ * Using an unsynchronized mapping avoids stalling for an
+ * indeterminate amount of time.
+ */
+ drm_intel_gem_bo_map_unsynchronized(brw->perfmon.bookend_bo);
+
+ gather_oa_results(brw, monitor, brw->perfmon.bookend_bo->virtual);
+
+ drm_intel_bo_unmap(brw->perfmon.bookend_bo);
+ }
+
+ for (int i = 0; i < brw->perfmon.entries_per_oa_snapshot; i++) {
+ int group = OA_COUNTERS;
+ int counter = brw->perfmon.oa_snapshot_layout[i];
+
+ /* We always capture all the OA counters, but the application may
+ * have only asked for a subset. Skip unwanted counters.
+ */
+ if (counter < 0 || !BITSET_TEST(m->ActiveCounters[group], counter))
+ continue;
+
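+ /* Each OA result takes three GLuints: group ID, counter ID, and the
+ * 32-bit counter value. Skip the write if it would overrun the
+ * caller's buffer.
+ */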
+ if (data + offset + 3 <= data_end) {
+ data[offset++] = group;
+ data[offset++] = counter;
+ data[offset++] = monitor->oa_results[i];
+ }
+ }
+
+ clean_bookend_bo(brw);
+ }
+
if (monitor_needs_statistics_registers(brw, m)) {
const int num_counters =
ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;
- if (!monitor->pipeline_stats_results)
+ if (!monitor->pipeline_stats_results) {
gather_statistics_results(brw, monitor);
+ /* Check whether we actually got the results. */
+ if (!monitor->pipeline_stats_results) {
+ if (bytes_written) {
+ *bytes_written = 0;
+ }
+ return;
+ }
+ }
+
for (int i = 0; i < num_counters; i++) {
if (BITSET_TEST(m->ActiveCounters[PIPELINE_STATS_COUNTERS], i)) {
- data[offset++] = PIPELINE_STATS_COUNTERS;
- data[offset++] = i;
- *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i];
- offset += 2;
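+ /* A pipeline statistics result takes four GLuints: group ID, counter
+ * ID, and a 64-bit value occupying two DWords.
+ */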
+ if (data + offset + 4 <= data_end) {
+ data[offset++] = PIPELINE_STATS_COUNTERS;
+ data[offset++] = i;
+ *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i];
+ offset += 2;
+ }
}
}
}
static struct gl_perf_monitor_object *
brw_new_perf_monitor(struct gl_context *ctx)
{
+ (void) ctx;
return calloc(1, sizeof(struct brw_perf_monitor_object));
}
/******************************************************************************/
+/**
+ * Called at the start of every render ring batch.
+ *
+ * Enable OA counters and emit the "start of batchbuffer" bookend OA snapshot.
+ * Since it's a new batch, there will be plenty of space for the commands.
+ */
+void
+brw_perf_monitor_new_batch(struct brw_context *brw)
+{
+ assert(brw->batch.ring == RENDER_RING);
+ assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0);
+
+ if (brw->perfmon.oa_users == 0)
+ return;
+
+ start_oa_counters(brw);
+
+ /* Make sure bookend_bo has enough space for a pair of snapshots.
+ * If not, "wrap" the BO: gather up any results so far, and start from
+ * the beginning of the buffer. Reserving a pair guarantees that wrapping
+ * will only happen at the beginning of a batch, where it's safe to map BOs
+ * (as the batch is empty and can't refer to any of them yet).
+ */
+ if (!has_space_for_bookend_snapshots(brw, 2))
+ wrap_bookend_bo(brw);
+
+ DBG("Bookend Begin Snapshot (%d)\n", brw->perfmon.bookend_snapshots);
+ emit_bookend_snapshot(brw);
+}
+
+/**
+ * Called at the end of every render ring batch.
+ *
+ * Emit the "end of batchbuffer" bookend OA snapshot and disable the counters.
+ *
+ * This relies on there being enough space in BATCH_RESERVED.
+ */
+void
+brw_perf_monitor_finish_batch(struct brw_context *brw)
+{
+ assert(brw->batch.ring == RENDER_RING);
+
+ if (brw->perfmon.oa_users == 0)
+ return;
+
+ DBG("Bookend End Snapshot (%d)\n", brw->perfmon.bookend_snapshots);
+
+ /* Not safe to wrap; should've reserved space already. */
+ assert(has_space_for_bookend_snapshots(brw, 1));
+
+ emit_bookend_snapshot(brw);
+
+ stop_oa_counters(brw);
+}
+
+/******************************************************************************/
+
void
brw_init_performance_monitors(struct brw_context *brw)
{
brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout);
brw->perfmon.statistics_registers = gen7_statistics_register_addresses;
}
+
+ brw->perfmon.unresolved =
+ ralloc_array(brw, struct brw_perf_monitor_object *, 1);
+ brw->perfmon.unresolved_elements = 0;
+ brw->perfmon.unresolved_array_size = 1;
}