From 0f2da773070c06b6d20ad264d3abb19c4dfd9761 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Thu, 11 Apr 2013 13:22:29 -0700 Subject: [PATCH] i965: Add support for GL_AMD_performance_monitor on Ironlake. Ironlake's counters are always enabled; userspace can simply send a MI_REPORT_PERF_COUNT packet to take a snapshot of them. This makes it easy to implement. The counters are documented in the source code for the intel-gpu-tools intel_perf_counters utility. v2: Adjust for core data structure changes. Add a table mapping buffer object offsets to exposed counters (which changes each generation). Finally, add report ID assertions to sanity check the BO layout (thanks to Carl Worth). v3: Update for core BeginPerfMonitor hook changes (requested by Brian). Signed-off-by: Kenneth Graunke Reviewed-by: Ian Romanick --- src/mesa/drivers/dri/i965/Makefile.sources | 1 + src/mesa/drivers/dri/i965/brw_context.c | 4 + src/mesa/drivers/dri/i965/brw_context.h | 14 + src/mesa/drivers/dri/i965/brw_defines.h | 7 + .../dri/i965/brw_performance_monitor.c | 391 ++++++++++++++++++ src/mesa/drivers/dri/i965/intel_extensions.c | 3 + 6 files changed, 420 insertions(+) create mode 100644 src/mesa/drivers/dri/i965/brw_performance_monitor.c diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index f521daaf719..e2193164382 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -69,6 +69,7 @@ i965_FILES = \ brw_lower_texture_gradients.cpp \ brw_misc_state.c \ brw_object_purgeable.c \ + brw_performance_monitor.c \ brw_program.c \ brw_primitive_restart.c \ brw_queryobj.c \ diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index f60d4df5d32..5f58a291d8d 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -503,6 +503,10 @@ brwCreateContext(int api, _mesa_initialize_dispatch_tables(ctx); _mesa_initialize_vbo_vtxfmt(ctx); + if (ctx->Extensions.AMD_performance_monitor) { + brw_init_performance_monitors(brw); + } + return true; } diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 656fb3c0504..0f88bad2475 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -128,6 +128,7 @@ struct brw_vs_prog_key; struct brw_vec4_prog_key; struct brw_wm_prog_key; struct brw_wm_prog_data; +struct brw_perf_bo_layout; enum brw_state_id { BRW_STATE_URB_FENCE, @@ -1313,6 +1314,16 @@ struct brw_context bool begin_emitted; } query; + struct { + /* A map describing which counters are stored at a particular 32-bit + * offset in the buffer object. + */ + const struct brw_perf_bo_layout *bo_layout; + + /* Number of 32-bit entries in the buffer object. */ + int entries_in_bo; + } perfmon; + int num_atoms; const struct brw_tracked_state **atoms; @@ -1485,6 +1496,9 @@ bool brw_is_hiz_depth_format(struct brw_context *ctx, gl_format format); bool brw_render_target_supported(struct brw_context *brw, struct gl_renderbuffer *rb); +/* brw_performance_monitor.c */ +void brw_init_performance_monitors(struct brw_context *brw); + /* gen6_sol.c */ void brw_begin_transform_feedback(struct gl_context *ctx, GLenum mode, diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index e9e0c4a0e3a..e2838059050 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1817,6 +1817,13 @@ enum brw_wm_barycentric_interp_mode { #define CMD_MI_FLUSH 0x0200 +#define GEN5_MI_REPORT_PERF_COUNT ((0x26 << 23) | (3 - 2)) +/* DW0 */ +# define GEN5_MI_COUNTER_SET_0 (0 << 6) +# define GEN5_MI_COUNTER_SET_1 (1 << 6) +/* DW1 */ +# define MI_COUNTER_ADDRESS_GTT (1 << 0) +/* DW2: a user-defined report ID (written to the buffer but can be anything) */ /* Bitfields for the URB_WRITE message, DW2 of message header: */ #define URB_WRITE_PRIM_END 0x1 diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c new file mode 100644 index 00000000000..87c4a633bfd --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c @@ -0,0 +1,391 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_performance_monitor.c + * + * Implementation of the GL_AMD_performance_monitor extension. + * + * Currently only for Ironlake. + */ + +#include + +#include "main/bitset.h" +#include "main/macros.h" +#include "main/mtypes.h" +#include "main/performance_monitor.h" + +#include "brw_context.h" +#include "brw_defines.h" +#include "intel_batchbuffer.h" + +/** + * i965 representation of a performance monitor object. + */ +struct brw_perf_monitor_object +{ + /** The base class. */ + struct gl_perf_monitor_object base; + + /** + * BO containing raw counter data in a hardware specific form. + */ + drm_intel_bo *bo; +}; + +/** Downcasting convenience macro. */ +static inline struct brw_perf_monitor_object * +brw_perf_monitor(struct gl_perf_monitor_object *m) +{ + return (struct brw_perf_monitor_object *) m; +} + +#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048 + +/* Two random values used to ensure we're getting valid snapshots. */ +#define FIRST_SNAPSHOT_REPORT_ID 0xd2e9c607 +#define SECOND_SNAPSHOT_REPORT_ID 0xad584b1d + +/******************************************************************************/ + +#define COUNTER(name) \ + { \ + .Name = name, \ + .Type = GL_UNSIGNED_INT, \ + .Minimum = { .u32 = 0 }, \ + .Maximum = { .u32 = ~0 }, \ + } + +#define GROUP(name, max_active, counter_list) \ + { \ + .Name = name, \ + .MaxActiveCounters = max_active, \ + .Counters = counter_list, \ + .NumCounters = ARRAY_SIZE(counter_list), \ + } + +struct brw_perf_bo_layout { + int group; + int counter; +}; + +/** + * Ironlake: + * @{ + */ +const static struct gl_perf_monitor_counter gen5_raw_aggregating_counters[] = { + COUNTER("cycles the CS unit is starved"), + COUNTER("cycles the CS unit is stalled"), + COUNTER("cycles the VF unit is starved"), + COUNTER("cycles the VF unit is stalled"), + COUNTER("cycles the VS unit is starved"), + COUNTER("cycles the VS unit is stalled"), + COUNTER("cycles the GS unit is starved"), + COUNTER("cycles the GS unit is stalled"), + COUNTER("cycles the CL unit is starved"), + COUNTER("cycles the CL unit is stalled"), + COUNTER("cycles the SF unit is starved"), + COUNTER("cycles the SF unit is stalled"), + COUNTER("cycles the WZ unit is starved"), + COUNTER("cycles the WZ unit is stalled"), + COUNTER("Z buffer read/write"), + COUNTER("cycles each EU was active"), + COUNTER("cycles each EU was suspended"), + COUNTER("cycles threads loaded all EUs"), + COUNTER("cycles filtering active"), + COUNTER("cycles PS threads executed"), + COUNTER("subspans written to RC"), + COUNTER("bytes read for texture reads"), + COUNTER("texels returned from sampler"), + COUNTER("polygons not culled"), + COUNTER("clocks MASF has valid message"), + COUNTER("64b writes/reads from RC"), + COUNTER("reads on dataport"), + COUNTER("clocks MASF has valid msg not consumed by sampler"), + COUNTER("cycles any EU is stalled for math"), +}; + +const static struct gl_perf_monitor_group gen5_groups[] = { + GROUP("Aggregating Counters", INT_MAX, gen5_raw_aggregating_counters), +}; + +const static struct brw_perf_bo_layout gen5_perf_bo_layout[] = +{ + { -1, -1, }, /* Report ID */ + { -1, -1, }, /* TIMESTAMP (64-bit) */ + { -1, -1, }, /* ...second half... */ + { 0, 0, }, /* cycles the CS unit is starved */ + { 0, 1, }, /* cycles the CS unit is stalled */ + { 0, 2, }, /* cycles the VF unit is starved */ + { 0, 3, }, /* cycles the VF unit is stalled */ + { 0, 4, }, /* cycles the VS unit is starved */ + { 0, 5, }, /* cycles the VS unit is stalled */ + { 0, 6, }, /* cycles the GS unit is starved */ + { 0, 7, }, /* cycles the GS unit is stalled */ + { 0, 8, }, /* cycles the CL unit is starved */ + { 0, 9, }, /* cycles the CL unit is stalled */ + { 0, 10, }, /* cycles the SF unit is starved */ + { 0, 11, }, /* cycles the SF unit is stalled */ + { 0, 12, }, /* cycles the WZ unit is starved */ + { 0, 13, }, /* cycles the WZ unit is stalled */ + { 0, 14, }, /* Z buffer read/write */ + { 0, 15, }, /* cycles each EU was active */ + { 0, 16, }, /* cycles each EU was suspended */ + { 0, 17, }, /* cycles threads loaded all EUs */ + { 0, 18, }, /* cycles filtering active */ + { 0, 19, }, /* cycles PS threads executed */ + { 0, 20, }, /* subspans written to RC */ + { 0, 21, }, /* bytes read for texture reads */ + { 0, 22, }, /* texels returned from sampler */ + { 0, 23, }, /* polygons not culled */ + { 0, 24, }, /* clocks MASF has valid message */ + { 0, 25, }, /* 64b writes/reads from RC */ + { 0, 26, }, /* reads on dataport */ + { 0, 27, }, /* clocks MASF has valid msg not consumed by sampler */ + { 0, 28, }, /* cycles any EU is stalled for math */ +}; + +/** @} */ + +/******************************************************************************/ + +static void +snapshot_aggregating_counters(struct brw_context *brw, + drm_intel_bo *bo, uint32_t offset_in_bytes) +{ + uint32_t report_id = offset_in_bytes == 0 ? FIRST_SNAPSHOT_REPORT_ID + : SECOND_SNAPSHOT_REPORT_ID; + + if (brw->gen == 5) { + /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all + * the counters. The report ID is ignored in the second set. + */ + BEGIN_BATCH(6); + OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0); + OUT_RELOC(bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + offset_in_bytes); + OUT_BATCH(report_id); + + OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1); + OUT_RELOC(bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + offset_in_bytes + 64); + OUT_BATCH(report_id); + ADVANCE_BATCH(); + } else { + assert(!"Unsupported generation for performance counters."); + } +} + +static bool +aggregating_counters_needed(struct brw_context *brw, + struct gl_perf_monitor_object *m) +{ + return m->ActiveGroups[0]; +} + +/******************************************************************************/ + +/** + * Create a new performance monitor object. + */ +static struct gl_perf_monitor_object * +brw_new_perf_monitor(struct gl_context *ctx) +{ + return calloc(1, sizeof(struct brw_perf_monitor_object)); +} + +/** + * Delete a performance monitor object. + */ +static void +brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m) +{ + struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + + if (monitor->bo) + drm_intel_bo_unreference(monitor->bo); + + free(monitor); +} + +/** + * Driver hook for glBeginPerformanceMonitorAMD(). + */ +static GLboolean +brw_begin_perf_monitor(struct gl_context *ctx, + struct gl_perf_monitor_object *m) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + + /* If the BO already exists, throw it away. It contains old results + * that we're not interested in any more. + */ + if (monitor->bo) + drm_intel_bo_unreference(monitor->bo); + + /* Create a new BO. */ + monitor->bo = + drm_intel_bo_alloc(brw->bufmgr, "performance monitor", 4096, 64); + drm_intel_bo_map(monitor->bo, true); + memset((char *) monitor->bo->virtual, 0xff, 4096); + drm_intel_bo_unmap(monitor->bo); + + /* Take a shapshot of all active counters */ + if (aggregating_counters_needed(brw, m)) { + snapshot_aggregating_counters(brw, monitor->bo, 0); + } + + return true; +} + +/** + * Driver hook for glEndPerformanceMonitorAMD(). + */ +static void +brw_end_perf_monitor(struct gl_context *ctx, + struct gl_perf_monitor_object *m) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + if (aggregating_counters_needed(brw, m)) { + snapshot_aggregating_counters(brw, monitor->bo, + SECOND_SNAPSHOT_OFFSET_IN_BYTES); + } +} + +/** + * Reset a performance monitor, throwing away any results. + */ +static void +brw_reset_perf_monitor(struct gl_context *ctx, + struct gl_perf_monitor_object *m) +{ + struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + + if (monitor->bo) { + drm_intel_bo_unreference(monitor->bo); + monitor->bo = NULL; + } + + if (m->Active) { + brw_begin_perf_monitor(ctx, m); + } +} + +/** + * Is a performance monitor result available? + */ +static GLboolean +brw_is_perf_monitor_result_available(struct gl_context *ctx, + struct gl_perf_monitor_object *m) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + return !m->Active && monitor->bo && + !drm_intel_bo_references(brw->batch.bo, monitor->bo) && + !drm_intel_bo_busy(monitor->bo); +} + +/** + * Get the performance monitor result. + */ +static void +brw_get_perf_monitor_result(struct gl_context *ctx, + struct gl_perf_monitor_object *m, + GLsizei data_size, + GLuint *data, + GLint *bytes_written) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + + /* This hook should only be called when results are available. */ + assert(monitor->bo != NULL); + + drm_intel_bo_map(monitor->bo, false); + unsigned *gpu_bo = monitor->bo->virtual; + + /* Copy data from the BO to the supplied array. + * + * The output data format is: for each + * active counter. The API allows counters to appear in any order. + */ + GLsizei offset = 0; + + /* Look for expected report ID values to ensure data is present. */ + assert(gpu_bo[0] == FIRST_SNAPSHOT_REPORT_ID); + assert(gpu_bo[SECOND_SNAPSHOT_OFFSET_IN_BYTES/4] == SECOND_SNAPSHOT_REPORT_ID); + + for (int i = 0; i < brw->perfmon.entries_in_bo; i++) { + int group = brw->perfmon.bo_layout[i].group; + int counter = brw->perfmon.bo_layout[i].counter; + + if (group < 0 || !BITSET_TEST(m->ActiveCounters[group], counter)) + continue; + + const struct gl_perf_monitor_group *group_obj = + &ctx->PerfMonitor.Groups[group]; + + const struct gl_perf_monitor_counter *c = &group_obj->Counters[counter]; + + data[offset++] = group; + data[offset++] = counter; + + uint32_t second_snapshot_index = + SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint32_t) + i; + + /* Won't work for uint64_t values, but we don't expose any yet. */ + data[offset] = gpu_bo[second_snapshot_index] - gpu_bo[i]; + offset += _mesa_perf_monitor_counter_size(c) / sizeof(uint32_t); + } + + drm_intel_bo_unmap(monitor->bo); + + if (bytes_written) + *bytes_written = offset * sizeof(uint32_t); +} + +void +brw_init_performance_monitors(struct brw_context *brw) +{ + struct gl_context *ctx = &brw->ctx; + + ctx->Driver.NewPerfMonitor = brw_new_perf_monitor; + ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor; + ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor; + ctx->Driver.EndPerfMonitor = brw_end_perf_monitor; + ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor; + ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available; + ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result; + + if (brw->gen == 5) { + ctx->PerfMonitor.Groups = gen5_groups; + ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups); + brw->perfmon.bo_layout = gen5_perf_bo_layout; + brw->perfmon.entries_in_bo = ARRAY_SIZE(gen5_perf_bo_layout); + } +} diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index aef78051e62..0fc5aad23c0 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -160,6 +160,9 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.EXT_shader_integer_mix = ctx->Const.GLSLVersion >= 130; } + if (brw->gen == 5) + ctx->Extensions.AMD_performance_monitor = true; + if (ctx->API == API_OPENGL_CORE) ctx->Extensions.ARB_base_instance = true; if (ctx->API != API_OPENGL_CORE) -- 2.30.2