2 * Copyright © 2013 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
25 * \file brw_performance_monitor.c
27 * Implementation of the GL_AMD_performance_monitor extension.
29 * On Gen5+ hardware, we have two sources of performance counter data:
30 * the Observability Architecture counters (MI_REPORT_PERF_COUNT), and
31 * the Pipeline Statistics Registers. We expose both sets of raw data,
32 * as well as some useful processed values.
34 * The Observability Architecture (OA) counters for Gen6+ are documented
35 * in a separate document from the rest of the PRMs. It is available at:
36 * https://01.org/linuxgraphics/documentation/driver-documentation-prms
37 * => 2013 Intel Core Processor Family => Observability Performance Counters
38 * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell.)
40 * On Ironlake, the OA counters were called "CHAPS" counters. Sadly, no public
41 * documentation exists; our implementation is based on the source code for the
42 * intel_perf_counters utility (which is available as part of intel-gpu-tools).
47 #include "main/bitset.h"
48 #include "main/hash.h"
49 #include "main/macros.h"
50 #include "main/mtypes.h"
51 #include "main/performance_monitor.h"
53 #include "glsl/ralloc.h"
55 #include "brw_context.h"
56 #include "brw_defines.h"
57 #include "intel_batchbuffer.h"
59 #define FILE_DEBUG_FLAG DEBUG_PERFMON
62 * i965 representation of a performance monitor object.
64 struct brw_perf_monitor_object
66 /** The base class. */
67 struct gl_perf_monitor_object base
;
70 * BO containing starting and ending snapshots for any active pipeline
71 * statistics counters.
73 drm_intel_bo
*pipeline_stats_bo
;
76 * Storage for final pipeline statistics counter results.
78 uint64_t *pipeline_stats_results
;
/** Downcasting convenience function (brw_perf_monitor_object embeds the
 * GL object as its first member, so the cast is safe). */
static inline struct brw_perf_monitor_object *
brw_perf_monitor(struct gl_perf_monitor_object *m)
{
   return (struct brw_perf_monitor_object *) m;
}
/* Byte offset of the second (ending) snapshot within a monitor's BO;
 * gather_statistics_results() reads start values at 0 and end values here. */
#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048
90 /******************************************************************************/
/** Describe a 32-bit unsigned raw counter spanning the full value range.
 * NOTE(review): the .Name initializer line was reconstructed — confirm
 * against the complete file. */
#define COUNTER(name)           \
   {                            \
      .Name = name,             \
      .Type = GL_UNSIGNED_INT,  \
      .Minimum = { .u32 = 0 },  \
      .Maximum = { .u32 = ~0 }, \
   }
/** Describe a 64-bit unsigned raw counter spanning the full value range.
 * NOTE(review): the .Name initializer line was reconstructed — confirm
 * against the complete file. */
#define COUNTER64(name)              \
   {                                 \
      .Name = name,                  \
      .Type = GL_UNSIGNED_INT64_AMD, \
      .Minimum = { .u64 = 0 },       \
      .Maximum = { .u64 = ~0 },      \
   }
/** Describe a counter group backed by the array `counter_list`.
 * NOTE(review): the .Name initializer line was reconstructed — confirm
 * against the complete file. */
#define GROUP(name, max_active, counter_list)  \
   {                                           \
      .Name = name,                            \
      .MaxActiveCounters = max_active,         \
      .Counters = counter_list,                \
      .NumCounters = ARRAY_SIZE(counter_list), \
   }
/** Performance Monitor Group IDs */
enum brw_counter_groups {
   OA_COUNTERS, /* Observability Architecture (MI_REPORT_PERF_COUNT) Counters */
   PIPELINE_STATS_COUNTERS, /* Pipeline Statistics Register Counters */
};
126 * The list of CHAPS counters unfortunately does not appear in any public
127 * documentation, but is available by reading the source code for the
128 * intel_perf_counters utility (shipped as part of intel-gpu-tools).
130 const static struct gl_perf_monitor_counter gen5_raw_chaps_counters
[] = {
131 COUNTER("cycles the CS unit is starved"),
132 COUNTER("cycles the CS unit is stalled"),
133 COUNTER("cycles the VF unit is starved"),
134 COUNTER("cycles the VF unit is stalled"),
135 COUNTER("cycles the VS unit is starved"),
136 COUNTER("cycles the VS unit is stalled"),
137 COUNTER("cycles the GS unit is starved"),
138 COUNTER("cycles the GS unit is stalled"),
139 COUNTER("cycles the CL unit is starved"),
140 COUNTER("cycles the CL unit is stalled"),
141 COUNTER("cycles the SF unit is starved"),
142 COUNTER("cycles the SF unit is stalled"),
143 COUNTER("cycles the WZ unit is starved"),
144 COUNTER("cycles the WZ unit is stalled"),
145 COUNTER("Z buffer read/write"),
146 COUNTER("cycles each EU was active"),
147 COUNTER("cycles each EU was suspended"),
148 COUNTER("cycles threads loaded all EUs"),
149 COUNTER("cycles filtering active"),
150 COUNTER("cycles PS threads executed"),
151 COUNTER("subspans written to RC"),
152 COUNTER("bytes read for texture reads"),
153 COUNTER("texels returned from sampler"),
154 COUNTER("polygons not culled"),
155 COUNTER("clocks MASF has valid message"),
156 COUNTER("64b writes/reads from RC"),
157 COUNTER("reads on dataport"),
158 COUNTER("clocks MASF has valid msg not consumed by sampler"),
159 COUNTER("cycles any EU is stalled for math"),
/**
 * Maps the position of each value in a Gen5 OA hardware snapshot to its
 * index in gen5_raw_chaps_counters (-1 marks slots that are not exposed,
 * such as the 64-bit timestamp).
 */
static const int gen5_oa_snapshot_layout[] =
{
   -1, /* TIMESTAMP (64-bit) */
   -1, /* ...second half... */
    0, /* cycles the CS unit is starved */
    1, /* cycles the CS unit is stalled */
    2, /* cycles the VF unit is starved */
    3, /* cycles the VF unit is stalled */
    4, /* cycles the VS unit is starved */
    5, /* cycles the VS unit is stalled */
    6, /* cycles the GS unit is starved */
    7, /* cycles the GS unit is stalled */
    8, /* cycles the CL unit is starved */
    9, /* cycles the CL unit is stalled */
   10, /* cycles the SF unit is starved */
   11, /* cycles the SF unit is stalled */
   12, /* cycles the WZ unit is starved */
   13, /* cycles the WZ unit is stalled */
   14, /* Z buffer read/write */
   15, /* cycles each EU was active */
   16, /* cycles each EU was suspended */
   17, /* cycles threads loaded all EUs */
   18, /* cycles filtering active */
   19, /* cycles PS threads executed */
   20, /* subspans written to RC */
   21, /* bytes read for texture reads */
   22, /* texels returned from sampler */
   23, /* polygons not culled */
   24, /* clocks MASF has valid message */
   25, /* 64b writes/reads from RC */
   26, /* reads on dataport */
   27, /* clocks MASF has valid msg not consumed by sampler */
   28, /* cycles any EU is stalled for math */
};
198 const static struct gl_perf_monitor_group gen5_groups
[] = {
199 [OA_COUNTERS
] = GROUP("CHAPS Counters", INT_MAX
, gen5_raw_chaps_counters
),
200 /* Our pipeline statistics counter handling requires hardware contexts. */
208 * A few of the counters here (A17-A20) are not included in the latest
209 * documentation, but are described in the Ironlake PRM (which strangely
210 * documents Sandybridge's performance counter system, not Ironlake's).
211 * It's unclear whether they work or not; empirically, they appear to.
215 * Aggregating counters A0-A28:
217 const static struct gl_perf_monitor_counter gen6_raw_oa_counters
[] = {
218 /* A0: 0 */ COUNTER("Aggregated Core Array Active"),
219 /* A1: 1 */ COUNTER("Aggregated Core Array Stalled"),
220 /* A2: 2 */ COUNTER("Vertex Shader Active Time"),
221 /* A3: Not actually hooked up on Sandybridge. */
222 /* A4: 3 */ COUNTER("Vertex Shader Stall Time - Core Stall"),
223 /* A5: 4 */ COUNTER("# VS threads loaded"),
224 /* A6: 5 */ COUNTER("Vertex Shader Ready but not running Time"),
225 /* A7: 6 */ COUNTER("Geometry Shader Active Time"),
226 /* A8: Not actually hooked up on Sandybridge. */
227 /* A9: 7 */ COUNTER("Geometry Shader Stall Time - Core Stall"),
228 /* A10: 8 */ COUNTER("# GS threads loaded"),
229 /* A11: 9 */ COUNTER("Geometry Shader Ready but not running Time"),
230 /* A12: 10 */ COUNTER("Pixel Shader Active Time"),
231 /* A13: Not actually hooked up on Sandybridge. */
232 /* A14: 11 */ COUNTER("Pixel Shader Stall Time - Core Stall"),
233 /* A15: 12 */ COUNTER("# PS threads loaded"),
234 /* A16: 13 */ COUNTER("Pixel Shader Ready but not running Time"),
235 /* A17: 14 */ COUNTER("Early Z Test Pixels Passing"),
236 /* A18: 15 */ COUNTER("Early Z Test Pixels Failing"),
237 /* A19: 16 */ COUNTER("Early Stencil Test Pixels Passing"),
238 /* A20: 17 */ COUNTER("Early Stencil Test Pixels Failing"),
239 /* A21: 18 */ COUNTER("Pixel Kill Count"),
240 /* A22: 19 */ COUNTER("Alpha Test Pixels Failed"),
241 /* A23: 20 */ COUNTER("Post PS Stencil Pixels Failed"),
242 /* A24: 21 */ COUNTER("Post PS Z buffer Pixels Failed"),
243 /* A25: 22 */ COUNTER("Pixels/samples Written in the frame buffer"),
244 /* A26: 23 */ COUNTER("GPU Busy"),
245 /* A27: 24 */ COUNTER("CL active and not stalled"),
246 /* A28: 25 */ COUNTER("SF active and stalled"),
/**
 * Sandybridge: Counter Select = 001
 * A0 A1 A2 A3 A4 TIMESTAMP RPT_ID
 * A5 A6 A7 A8 A9 A10 A11 A12
 * A13 A14 A15 A16 A17 A18 A19 A20
 * A21 A22 A23 A24 A25 A26 A27 A28
 *
 * (Yes, this is a strange order.) We also have to remap for missing counters.
 * Each entry maps a snapshot slot to its index in gen6_raw_oa_counters
 * (-1 = slot not exposed).
 */
static const int gen6_oa_snapshot_layout[] =
{
   -1, /* TIMESTAMP (64-bit) */
   -1, /* ...second half... */
    3, /* A4: Vertex Shader Stall Time - Core Stall */
   -1, /* A3: (not available) */
    2, /* A2: Vertex Shader Active Time */
    1, /* A1: Aggregated Core Array Stalled */
    0, /* A0: Aggregated Core Array Active */
   10, /* A12: Pixel Shader Active Time */
    9, /* A11: Geometry Shader ready but not running Time */
    8, /* A10: # GS threads loaded */
    7, /* A9: Geometry Shader Stall Time - Core Stall */
   -1, /* A8: (not available) */
    6, /* A7: Geometry Shader Active Time */
    5, /* A6: Vertex Shader ready but not running Time */
    4, /* A5: # VS Threads Loaded */
   17, /* A20: Early Stencil Test Pixels Failing */
   16, /* A19: Early Stencil Test Pixels Passing */
   15, /* A18: Early Z Test Pixels Failing */
   14, /* A17: Early Z Test Pixels Passing */
   13, /* A16: Pixel Shader ready but not running Time */
   12, /* A15: # PS threads loaded */
   11, /* A14: Pixel Shader Stall Time - Core Stall */
   -1, /* A13: (not available) */
   25, /* A28: SF active and stalled */
   24, /* A27: CL active and not stalled */
   23, /* A26: GPU Busy */
   22, /* A25: Pixels/samples Written in the frame buffer */
   21, /* A24: Post PS Z buffer Pixels Failed */
   20, /* A23: Post PS Stencil Pixels Failed */
   19, /* A22: Alpha Test Pixels Failed */
   18, /* A21: Pixel Kill Count */
};
294 const static struct gl_perf_monitor_counter gen6_statistics_counters
[] = {
295 COUNTER64("IA_VERTICES_COUNT"),
296 COUNTER64("IA_PRIMITIVES_COUNT"),
297 COUNTER64("VS_INVOCATION_COUNT"),
298 COUNTER64("GS_INVOCATION_COUNT"),
299 COUNTER64("GS_PRIMITIVES_COUNT"),
300 COUNTER64("CL_INVOCATION_COUNT"),
301 COUNTER64("CL_PRIMITIVES_COUNT"),
302 COUNTER64("PS_INVOCATION_COUNT"),
303 COUNTER64("PS_DEPTH_COUNT"),
304 COUNTER64("SO_NUM_PRIMS_WRITTEN"),
305 COUNTER64("SO_PRIM_STORAGE_NEEDED"),
308 /** MMIO register addresses for each pipeline statistics counter. */
/* NOTE(review): entries must be in exactly the same order as
 * gen6_statistics_counters above (snapshot_statistics_registers indexes both
 * arrays with the same i).  This excerpt appears to be missing the entries
 * before the SO registers — verify against the complete file before editing. */
309 const static int gen6_statistics_register_addresses
[] = {
319 GEN6_SO_NUM_PRIMS_WRITTEN
,
320 GEN6_SO_PRIM_STORAGE_NEEDED
,
323 const static struct gl_perf_monitor_group gen6_groups
[] = {
324 GROUP("Observability Architecture Counters", INT_MAX
, gen6_raw_oa_counters
),
325 GROUP("Pipeline Statistics Registers", INT_MAX
, gen6_statistics_counters
),
330 * Ivybridge/Baytrail/Haswell:
333 const static struct gl_perf_monitor_counter gen7_raw_oa_counters
[] = {
334 COUNTER("Aggregated Core Array Active"),
335 COUNTER("Aggregated Core Array Stalled"),
336 COUNTER("Vertex Shader Active Time"),
337 COUNTER("Vertex Shader Stall Time - Core Stall"),
338 COUNTER("# VS threads loaded"),
339 COUNTER("Hull Shader Active Time"),
340 COUNTER("Hull Shader Stall Time - Core Stall"),
341 COUNTER("# HS threads loaded"),
342 COUNTER("Domain Shader Active Time"),
343 COUNTER("Domain Shader Stall Time - Core Stall"),
344 COUNTER("# DS threads loaded"),
345 COUNTER("Compute Shader Active Time"),
346 COUNTER("Compute Shader Stall Time - Core Stall"),
347 COUNTER("# CS threads loaded"),
348 COUNTER("Geometry Shader Active Time"),
349 COUNTER("Geometry Shader Stall Time - Core Stall"),
350 COUNTER("# GS threads loaded"),
351 COUNTER("Pixel Shader Active Time"),
352 COUNTER("Pixel Shader Stall Time - Core Stall"),
353 COUNTER("# PS threads loaded"),
354 COUNTER("HiZ Fast Z Test Pixels Passing"),
355 COUNTER("HiZ Fast Z Test Pixels Failing"),
356 COUNTER("Slow Z Test Pixels Passing"),
357 COUNTER("Slow Z Test Pixels Failing"),
358 COUNTER("Pixel Kill Count"),
359 COUNTER("Alpha Test Pixels Failed"),
360 COUNTER("Post PS Stencil Pixels Failed"),
361 COUNTER("Post PS Z buffer Pixels Failed"),
362 COUNTER("3D/GPGPU Render Target Writes"),
363 COUNTER("Render Engine Busy"),
364 COUNTER("VS bottleneck"),
365 COUNTER("GS bottleneck"),
369 * Ivybridge/Baytrail/Haswell: Counter Select = 101
370 * A4 A3 A2 A1 A0 TIMESTAMP ReportID
371 * A12 A11 A10 A9 A8 A7 A6 A5
372 * A20 A19 A18 A17 A16 A15 A14 A13
373 * A28 A27 A26 A25 A24 A23 A22 A21
374 * A36 A35 A34 A33 A32 A31 A30 A29
375 * A44 A43 A42 A41 A40 A39 A38 A37
376 * B7 B6 B5 B4 B3 B2 B1 B0
377 * Rsv Rsv Rsv Rsv Rsv Rsv Rsv Rsv
/* NOTE(review): each entry maps a snapshot slot to its index in
 * gen7_raw_oa_counters (-1 = not exposed).  This excerpt ends at A44; the
 * B0-B7 and reserved rows shown in the grid above appear to be missing —
 * verify the array tail against the complete file before editing. */
379 const static int gen7_oa_snapshot_layout
[] =
382 -1, /* TIMESTAMP (64-bit) */
383 -1, /* ...second half... */
384 0, /* A0: Aggregated Core Array Active */
385 1, /* A1: Aggregated Core Array Stalled */
386 2, /* A2: Vertex Shader Active Time */
387 -1, /* A3: Reserved */
388 3, /* A4: Vertex Shader Stall Time - Core Stall */
389 4, /* A5: # VS threads loaded */
390 -1, /* A6: Reserved */
391 5, /* A7: Hull Shader Active Time */
392 -1, /* A8: Reserved */
393 6, /* A9: Hull Shader Stall Time - Core Stall */
394 7, /* A10: # HS threads loaded */
395 -1, /* A11: Reserved */
396 8, /* A12: Domain Shader Active Time */
397 -1, /* A13: Reserved */
398 9, /* A14: Domain Shader Stall Time - Core Stall */
399 10, /* A15: # DS threads loaded */
400 -1, /* A16: Reserved */
401 11, /* A17: Compute Shader Active Time */
402 -1, /* A18: Reserved */
403 12, /* A19: Compute Shader Stall Time - Core Stall */
404 13, /* A20: # CS threads loaded */
405 -1, /* A21: Reserved */
406 14, /* A22: Geometry Shader Active Time */
407 -1, /* A23: Reserved */
408 15, /* A24: Geometry Shader Stall Time - Core Stall */
409 16, /* A25: # GS threads loaded */
410 -1, /* A26: Reserved */
411 17, /* A27: Pixel Shader Active Time */
412 -1, /* A28: Reserved */
413 18, /* A29: Pixel Shader Stall Time - Core Stall */
414 19, /* A30: # PS threads loaded */
415 -1, /* A31: Reserved */
416 20, /* A32: HiZ Fast Z Test Pixels Passing */
417 21, /* A33: HiZ Fast Z Test Pixels Failing */
418 22, /* A34: Slow Z Test Pixels Passing */
419 23, /* A35: Slow Z Test Pixels Failing */
420 24, /* A36: Pixel Kill Count */
421 25, /* A37: Alpha Test Pixels Failed */
422 26, /* A38: Post PS Stencil Pixels Failed */
423 27, /* A39: Post PS Z buffer Pixels Failed */
424 28, /* A40: 3D/GPGPU Render Target Writes */
425 29, /* A41: Render Engine Busy */
426 30, /* A42: VS bottleneck */
427 31, /* A43: GS bottleneck */
428 -1, /* A44: Reserved */
447 const static struct gl_perf_monitor_counter gen7_statistics_counters
[] = {
448 COUNTER64("IA_VERTICES_COUNT"),
449 COUNTER64("IA_PRIMITIVES_COUNT"),
450 COUNTER64("VS_INVOCATION_COUNT"),
451 COUNTER64("HS_INVOCATION_COUNT"),
452 COUNTER64("DS_INVOCATION_COUNT"),
453 COUNTER64("GS_INVOCATION_COUNT"),
454 COUNTER64("GS_PRIMITIVES_COUNT"),
455 COUNTER64("CL_INVOCATION_COUNT"),
456 COUNTER64("CL_PRIMITIVES_COUNT"),
457 COUNTER64("PS_INVOCATION_COUNT"),
458 COUNTER64("PS_DEPTH_COUNT"),
459 COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 0)"),
460 COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 1)"),
461 COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 2)"),
462 COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 3)"),
463 COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 0)"),
464 COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 1)"),
465 COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 2)"),
466 COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 3)"),
469 /** MMIO register addresses for each pipeline statistics counter. */
/* NOTE(review): entries must be in exactly the same order as
 * gen7_statistics_counters above (snapshot_statistics_registers indexes both
 * arrays with the same i).  This excerpt appears to be missing the eleven
 * entries before the SO registers — verify against the complete file before
 * editing. */
470 const static int gen7_statistics_register_addresses
[] = {
482 GEN7_SO_NUM_PRIMS_WRITTEN(0),
483 GEN7_SO_NUM_PRIMS_WRITTEN(1),
484 GEN7_SO_NUM_PRIMS_WRITTEN(2),
485 GEN7_SO_NUM_PRIMS_WRITTEN(3),
486 GEN7_SO_PRIM_STORAGE_NEEDED(0),
487 GEN7_SO_PRIM_STORAGE_NEEDED(1),
488 GEN7_SO_PRIM_STORAGE_NEEDED(2),
489 GEN7_SO_PRIM_STORAGE_NEEDED(3),
492 const static struct gl_perf_monitor_group gen7_groups
[] = {
493 GROUP("Observability Architecture Counters", INT_MAX
, gen7_raw_oa_counters
),
494 GROUP("Pipeline Statistics Registers", INT_MAX
, gen7_statistics_counters
),
498 /******************************************************************************/
500 static GLboolean
brw_is_perf_monitor_result_available(struct gl_context
*, struct gl_perf_monitor_object
*);
503 dump_perf_monitor_callback(GLuint name
, void *monitor_void
, void *brw_void
)
505 struct gl_context
*ctx
= brw_void
;
506 struct gl_perf_monitor_object
*m
= monitor_void
;
507 struct brw_perf_monitor_object
*monitor
= monitor_void
;
509 DBG("%4d %-7s %-6s %-11s %-9s\n",
511 m
->Active
? "Active" : "",
512 m
->Ended
? "Ended" : "",
513 brw_is_perf_monitor_result_available(ctx
, m
) ? "Available" : "",
514 monitor
->pipeline_stats_bo
? "Stats BO" : "");
518 brw_dump_perf_monitors(struct brw_context
*brw
)
520 struct gl_context
*ctx
= &brw
->ctx
;
521 DBG("Monitors: (OA users = %d)\n", brw
->perfmon
.oa_users
);
522 _mesa_HashWalk(ctx
->PerfMonitor
.Monitors
, dump_perf_monitor_callback
, brw
);
525 /******************************************************************************/
528 monitor_needs_statistics_registers(struct brw_context
*brw
,
529 struct gl_perf_monitor_object
*m
)
531 return brw
->gen
>= 6 && m
->ActiveGroups
[PIPELINE_STATS_COUNTERS
];
535 * Take a snapshot of any monitored pipeline statistics counters.
538 snapshot_statistics_registers(struct brw_context
*brw
,
539 struct brw_perf_monitor_object
*monitor
,
540 uint32_t offset_in_bytes
)
542 struct gl_context
*ctx
= &brw
->ctx
;
543 const int offset
= offset_in_bytes
/ sizeof(uint64_t);
544 const int group
= PIPELINE_STATS_COUNTERS
;
545 const int num_counters
= ctx
->PerfMonitor
.Groups
[group
].NumCounters
;
547 intel_batchbuffer_emit_mi_flush(brw
);
549 for (int i
= 0; i
< num_counters
; i
++) {
550 if (BITSET_TEST(monitor
->base
.ActiveCounters
[group
], i
)) {
551 assert(ctx
->PerfMonitor
.Groups
[group
].Counters
[i
].Type
==
552 GL_UNSIGNED_INT64_AMD
);
554 brw_store_register_mem64(brw
, monitor
->pipeline_stats_bo
,
555 brw
->perfmon
.statistics_registers
[i
],
562 * Gather results from pipeline_stats_bo, storing the final values.
564 * This allows us to free pipeline_stats_bo (which is 4K) in favor of a much
565 * smaller array of final results.
568 gather_statistics_results(struct brw_context
*brw
,
569 struct brw_perf_monitor_object
*monitor
)
571 struct gl_context
*ctx
= &brw
->ctx
;
572 const int num_counters
=
573 ctx
->PerfMonitor
.Groups
[PIPELINE_STATS_COUNTERS
].NumCounters
;
575 monitor
->pipeline_stats_results
= calloc(num_counters
, sizeof(uint64_t));
577 drm_intel_bo_map(monitor
->pipeline_stats_bo
, false);
578 uint64_t *start
= monitor
->pipeline_stats_bo
->virtual;
579 uint64_t *end
= start
+ (SECOND_SNAPSHOT_OFFSET_IN_BYTES
/ sizeof(uint64_t));
581 for (int i
= 0; i
< num_counters
; i
++) {
582 monitor
->pipeline_stats_results
[i
] = end
[i
] - start
[i
];
584 drm_intel_bo_unmap(monitor
->pipeline_stats_bo
);
585 drm_intel_bo_unreference(monitor
->pipeline_stats_bo
);
586 monitor
->pipeline_stats_bo
= NULL
;
589 /******************************************************************************/
592 monitor_needs_oa(struct brw_context
*brw
,
593 struct gl_perf_monitor_object
*m
)
595 return m
->ActiveGroups
[OA_COUNTERS
];
/**
 * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot,
 * including the required PIPE_CONTROL flushes.
 *
 * Sandybridge is the worst case scenario: intel_batchbuffer_emit_mi_flush
 * expands to three PIPE_CONTROLs which are 4 DWords each.  We have to flush
 * before and after MI_REPORT_PERF_COUNT, so multiply by two.  Finally, add
 * the 3 DWords for MI_REPORT_PERF_COUNT itself.
 */
#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (3 * 4) + 3)
610 * Emit an MI_REPORT_PERF_COUNT command packet.
612 * This writes the current OA counter values to buffer.
615 emit_mi_report_perf_count(struct brw_context
*brw
,
617 uint32_t offset_in_bytes
,
620 assert(offset_in_bytes
% 64 == 0);
622 /* Make sure the commands to take a snapshot fits in a single batch. */
623 intel_batchbuffer_require_space(brw
, MI_REPORT_PERF_COUNT_BATCH_DWORDS
* 4,
625 int batch_used
= brw
->batch
.used
;
627 /* Reports apparently don't always get written unless we flush first. */
628 intel_batchbuffer_emit_mi_flush(brw
);
631 /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all
632 * the counters. The report ID is ignored in the second set.
635 OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT
| GEN5_MI_COUNTER_SET_0
);
637 I915_GEM_DOMAIN_INSTRUCTION
, I915_GEM_DOMAIN_INSTRUCTION
,
639 OUT_BATCH(report_id
);
641 OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT
| GEN5_MI_COUNTER_SET_1
);
643 I915_GEM_DOMAIN_INSTRUCTION
, I915_GEM_DOMAIN_INSTRUCTION
,
644 offset_in_bytes
+ 64);
645 OUT_BATCH(report_id
);
647 } else if (brw
->gen
== 6) {
649 OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT
);
650 OUT_RELOC(bo
, I915_GEM_DOMAIN_INSTRUCTION
, I915_GEM_DOMAIN_INSTRUCTION
,
651 offset_in_bytes
| MI_COUNTER_ADDRESS_GTT
);
652 OUT_BATCH(report_id
);
654 } else if (brw
->gen
== 7) {
656 OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT
);
657 OUT_RELOC(bo
, I915_GEM_DOMAIN_INSTRUCTION
, I915_GEM_DOMAIN_INSTRUCTION
,
659 OUT_BATCH(report_id
);
662 assert(!"Unsupported generation for performance counters.");
665 /* Reports apparently don't always get written unless we flush after. */
666 intel_batchbuffer_emit_mi_flush(brw
);
669 assert(brw
->batch
.used
- batch_used
<= MI_REPORT_PERF_COUNT_BATCH_DWORDS
* 4);
672 /******************************************************************************/
675 * Initialize a monitor to sane starting state; throw away old buffers.
678 reinitialize_perf_monitor(struct brw_context
*brw
,
679 struct brw_perf_monitor_object
*monitor
)
681 if (monitor
->pipeline_stats_bo
) {
682 drm_intel_bo_unreference(monitor
->pipeline_stats_bo
);
683 monitor
->pipeline_stats_bo
= NULL
;
686 free(monitor
->pipeline_stats_results
);
687 monitor
->pipeline_stats_results
= NULL
;
691 * Driver hook for glBeginPerformanceMonitorAMD().
694 brw_begin_perf_monitor(struct gl_context
*ctx
,
695 struct gl_perf_monitor_object
*m
)
697 struct brw_context
*brw
= brw_context(ctx
);
698 struct brw_perf_monitor_object
*monitor
= brw_perf_monitor(m
);
700 DBG("Begin(%d)\n", m
->Name
);
702 reinitialize_perf_monitor(brw
, monitor
);
704 if (monitor_needs_oa(brw
, m
)) {
705 ++brw
->perfmon
.oa_users
;
708 if (monitor_needs_statistics_registers(brw
, m
)) {
709 monitor
->pipeline_stats_bo
=
710 drm_intel_bo_alloc(brw
->bufmgr
, "perf. monitor stats bo", 4096, 64);
712 /* Take starting snapshots. */
713 snapshot_statistics_registers(brw
, monitor
, 0);
720 * Driver hook for glEndPerformanceMonitorAMD().
723 brw_end_perf_monitor(struct gl_context
*ctx
,
724 struct gl_perf_monitor_object
*m
)
726 struct brw_context
*brw
= brw_context(ctx
);
727 struct brw_perf_monitor_object
*monitor
= brw_perf_monitor(m
);
729 DBG("End(%d)\n", m
->Name
);
731 if (monitor_needs_oa(brw
, m
)) {
732 --brw
->perfmon
.oa_users
;
735 if (monitor_needs_statistics_registers(brw
, m
)) {
736 /* Take ending snapshots. */
737 snapshot_statistics_registers(brw
, monitor
,
738 SECOND_SNAPSHOT_OFFSET_IN_BYTES
);
743 * Reset a performance monitor, throwing away any results.
746 brw_reset_perf_monitor(struct gl_context
*ctx
,
747 struct gl_perf_monitor_object
*m
)
749 struct brw_context
*brw
= brw_context(ctx
);
750 struct brw_perf_monitor_object
*monitor
= brw_perf_monitor(m
);
752 reinitialize_perf_monitor(brw
, monitor
);
755 brw_begin_perf_monitor(ctx
, m
);
760 * Is a performance monitor result available?
763 brw_is_perf_monitor_result_available(struct gl_context
*ctx
,
764 struct gl_perf_monitor_object
*m
)
766 struct brw_context
*brw
= brw_context(ctx
);
767 struct brw_perf_monitor_object
*monitor
= brw_perf_monitor(m
);
769 bool stats_available
= true;
771 if (monitor_needs_statistics_registers(brw
, m
)) {
772 stats_available
= !monitor
->pipeline_stats_bo
||
773 (!drm_intel_bo_references(brw
->batch
.bo
, monitor
->pipeline_stats_bo
) &&
774 !drm_intel_bo_busy(monitor
->pipeline_stats_bo
));
777 return stats_available
;
781 * Get the performance monitor result.
784 brw_get_perf_monitor_result(struct gl_context
*ctx
,
785 struct gl_perf_monitor_object
*m
,
788 GLint
*bytes_written
)
790 struct brw_context
*brw
= brw_context(ctx
);
791 struct brw_perf_monitor_object
*monitor
= brw_perf_monitor(m
);
793 DBG("GetResult(%d)\n", m
->Name
);
794 brw_dump_perf_monitors(brw
);
796 /* This hook should only be called when results are available. */
799 /* Copy data to the supplied array (data).
801 * The output data format is: <group ID, counter ID, value> for each
802 * active counter. The API allows counters to appear in any order.
806 if (monitor_needs_statistics_registers(brw
, m
)) {
807 const int num_counters
=
808 ctx
->PerfMonitor
.Groups
[PIPELINE_STATS_COUNTERS
].NumCounters
;
810 if (!monitor
->pipeline_stats_results
)
811 gather_statistics_results(brw
, monitor
);
813 for (int i
= 0; i
< num_counters
; i
++) {
814 if (BITSET_TEST(m
->ActiveCounters
[PIPELINE_STATS_COUNTERS
], i
)) {
815 data
[offset
++] = PIPELINE_STATS_COUNTERS
;
817 *((uint64_t *) (&data
[offset
])) = monitor
->pipeline_stats_results
[i
];
824 *bytes_written
= offset
* sizeof(uint32_t);
828 * Create a new performance monitor object.
830 static struct gl_perf_monitor_object
*
831 brw_new_perf_monitor(struct gl_context
*ctx
)
833 return calloc(1, sizeof(struct brw_perf_monitor_object
));
837 * Delete a performance monitor object.
840 brw_delete_perf_monitor(struct gl_context
*ctx
, struct gl_perf_monitor_object
*m
)
842 struct brw_perf_monitor_object
*monitor
= brw_perf_monitor(m
);
843 DBG("Delete(%d)\n", m
->Name
);
844 reinitialize_perf_monitor(brw_context(ctx
), monitor
);
848 /******************************************************************************/
851 brw_init_performance_monitors(struct brw_context
*brw
)
853 struct gl_context
*ctx
= &brw
->ctx
;
855 ctx
->Driver
.NewPerfMonitor
= brw_new_perf_monitor
;
856 ctx
->Driver
.DeletePerfMonitor
= brw_delete_perf_monitor
;
857 ctx
->Driver
.BeginPerfMonitor
= brw_begin_perf_monitor
;
858 ctx
->Driver
.EndPerfMonitor
= brw_end_perf_monitor
;
859 ctx
->Driver
.ResetPerfMonitor
= brw_reset_perf_monitor
;
860 ctx
->Driver
.IsPerfMonitorResultAvailable
= brw_is_perf_monitor_result_available
;
861 ctx
->Driver
.GetPerfMonitorResult
= brw_get_perf_monitor_result
;
864 ctx
->PerfMonitor
.Groups
= gen5_groups
;
865 ctx
->PerfMonitor
.NumGroups
= ARRAY_SIZE(gen5_groups
);
866 brw
->perfmon
.oa_snapshot_layout
= gen5_oa_snapshot_layout
;
867 brw
->perfmon
.entries_per_oa_snapshot
= ARRAY_SIZE(gen5_oa_snapshot_layout
);
868 } else if (brw
->gen
== 6) {
869 ctx
->PerfMonitor
.Groups
= gen6_groups
;
870 ctx
->PerfMonitor
.NumGroups
= ARRAY_SIZE(gen6_groups
);
871 brw
->perfmon
.oa_snapshot_layout
= gen6_oa_snapshot_layout
;
872 brw
->perfmon
.entries_per_oa_snapshot
= ARRAY_SIZE(gen6_oa_snapshot_layout
);
873 brw
->perfmon
.statistics_registers
= gen6_statistics_register_addresses
;
874 } else if (brw
->gen
== 7) {
875 ctx
->PerfMonitor
.Groups
= gen7_groups
;
876 ctx
->PerfMonitor
.NumGroups
= ARRAY_SIZE(gen7_groups
);
877 brw
->perfmon
.oa_snapshot_layout
= gen7_oa_snapshot_layout
;
878 brw
->perfmon
.entries_per_oa_snapshot
= ARRAY_SIZE(gen7_oa_snapshot_layout
);
879 brw
->perfmon
.statistics_registers
= gen7_statistics_register_addresses
;