/*
 * Copyright © 2013 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * \file brw_performance_monitor.c
 *
 * Implementation of the GL_AMD_performance_monitor extension.
 *
 * On Gen5+ hardware, we have two sources of performance counter data:
 * the Observability Architecture counters (MI_REPORT_PERF_COUNT), and
 * the Pipeline Statistics Registers.  We expose both sets of raw data,
 * as well as some useful processed values.
 *
 * The Observability Architecture (OA) counters for Gen6+ are documented
 * in a separate document from the rest of the PRMs.  It is available at:
 * https://01.org/linuxgraphics/documentation/driver-documentation-prms
 * => 2013 Intel Core Processor Family => Observability Performance Counters
 * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell.)
 *
 * On Ironlake, the OA counters were called "CHAPS" counters.  Sadly, no public
 * documentation exists; our implementation is based on the source code for the
 * intel_perf_counters utility (which is available as part of intel-gpu-tools).
 */
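
/*
 * For context, an application reaches this driver code through the
 * GL_AMD_performance_monitor entry points.  A minimal sketch of the intended
 * usage (not part of the driver; error handling and the string/info queries
 * are omitted, and only the first group and counter are sampled):
 *
 *    GLint num_groups, num_counters, max_active;
 *    GLuint group, counter, monitor;
 *
 *    glGetPerfMonitorGroupsAMD(&num_groups, 1, &group);
 *    glGetPerfMonitorCountersAMD(group, &num_counters, &max_active,
 *                                1, &counter);
 *
 *    glGenPerfMonitorsAMD(1, &monitor);
 *    glSelectPerfMonitorCountersAMD(monitor, GL_TRUE, group, 1, &counter);
 *    glBeginPerfMonitorAMD(monitor);
 *    ...render...
 *    glEndPerfMonitorAMD(monitor);
 *
 *    GLuint ready = 0;
 *    glGetPerfMonitorCounterDataAMD(monitor, GL_PERFMON_RESULT_AVAILABLE_AMD,
 *                                   sizeof(ready), &ready, NULL);
 *    if (ready) {
 *       GLuint data[4]; / * group ID, counter ID, 64-bit value * /
 *       glGetPerfMonitorCounterDataAMD(monitor, GL_PERFMON_RESULT_AMD,
 *                                      sizeof(data), data, NULL);
 *    }
 */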

#include <limits.h>

#include "main/bitset.h"
#include "main/hash.h"
#include "main/macros.h"
#include "main/mtypes.h"
#include "main/performance_monitor.h"

#include "glsl/ralloc.h"

#include "brw_context.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"

#define FILE_DEBUG_FLAG DEBUG_PERFMON

/**
 * i965 representation of a performance monitor object.
 */
struct brw_perf_monitor_object
{
   /** The base class. */
   struct gl_perf_monitor_object base;

   /**
    * BO containing starting and ending snapshots for any active pipeline
    * statistics counters.
    */
   drm_intel_bo *pipeline_stats_bo;

   /**
    * Storage for final pipeline statistics counter results.
    */
   uint64_t *pipeline_stats_results;
};

/** Downcasting convenience macro. */
static inline struct brw_perf_monitor_object *
brw_perf_monitor(struct gl_perf_monitor_object *m)
{
   return (struct brw_perf_monitor_object *) m;
}

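/**
 * Offset of the second (ending) snapshot within a monitor's BO.
 *
 * Starting snapshots are written at offset 0; both snapshots fit comfortably
 * in the 4096-byte pipeline_stats_bo allocated in brw_begin_perf_monitor().
 */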
#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048

/******************************************************************************/

#define COUNTER(name)           \
   {                            \
      .Name = name,             \
      .Type = GL_UNSIGNED_INT,  \
      .Minimum = { .u32 = 0 },  \
      .Maximum = { .u32 = ~0 }, \
   }

#define COUNTER64(name)              \
   {                                 \
      .Name = name,                  \
      .Type = GL_UNSIGNED_INT64_AMD, \
      .Minimum = { .u64 = 0 },       \
      .Maximum = { .u64 = ~0 },      \
   }

#define GROUP(name, max_active, counter_list)  \
   {                                           \
      .Name = name,                            \
      .MaxActiveCounters = max_active,         \
      .Counters = counter_list,                \
      .NumCounters = ARRAY_SIZE(counter_list), \
   }
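
/*
 * For example (illustrative only), COUNTER("GPU Busy") expands to a
 * gl_perf_monitor_counter initializer describing an unsigned 32-bit counter
 * spanning the full [0, UINT32_MAX] range:
 *
 *    { .Name = "GPU Busy", .Type = GL_UNSIGNED_INT,
 *      .Minimum = { .u32 = 0 }, .Maximum = { .u32 = ~0 } }
 */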

/** Performance Monitor Group IDs */
enum brw_counter_groups {
   OA_COUNTERS, /* Observability Architecture (MI_REPORT_PERF_COUNT) Counters */
   PIPELINE_STATS_COUNTERS, /* Pipeline Statistics Register Counters */
};

/**
 * Ironlake:
 *  @{
 *
 * The list of CHAPS counters unfortunately does not appear in any public
 * documentation, but is available by reading the source code for the
 * intel_perf_counters utility (shipped as part of intel-gpu-tools).
 */
static const struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = {
   COUNTER("cycles the CS unit is starved"),
   COUNTER("cycles the CS unit is stalled"),
   COUNTER("cycles the VF unit is starved"),
   COUNTER("cycles the VF unit is stalled"),
   COUNTER("cycles the VS unit is starved"),
   COUNTER("cycles the VS unit is stalled"),
   COUNTER("cycles the GS unit is starved"),
   COUNTER("cycles the GS unit is stalled"),
   COUNTER("cycles the CL unit is starved"),
   COUNTER("cycles the CL unit is stalled"),
   COUNTER("cycles the SF unit is starved"),
   COUNTER("cycles the SF unit is stalled"),
   COUNTER("cycles the WZ unit is starved"),
   COUNTER("cycles the WZ unit is stalled"),
   COUNTER("Z buffer read/write"),
   COUNTER("cycles each EU was active"),
   COUNTER("cycles each EU was suspended"),
   COUNTER("cycles threads loaded all EUs"),
   COUNTER("cycles filtering active"),
   COUNTER("cycles PS threads executed"),
   COUNTER("subspans written to RC"),
   COUNTER("bytes read for texture reads"),
   COUNTER("texels returned from sampler"),
   COUNTER("polygons not culled"),
   COUNTER("clocks MASF has valid message"),
   COUNTER("64b writes/reads from RC"),
   COUNTER("reads on dataport"),
   COUNTER("clocks MASF has valid msg not consumed by sampler"),
   COUNTER("cycles any EU is stalled for math"),
};

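/*
 * The *_oa_snapshot_layout[] arrays below map each DWord of an
 * MI_REPORT_PERF_COUNT report to its index in the corresponding counter
 * group: entry i gives the counter index for DWord i of the snapshot, and
 * -1 marks DWords we don't expose (the report ID, the 64-bit timestamp, and
 * any unavailable or reserved slots).
 */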
static const int gen5_oa_snapshot_layout[] =
{
   -1, /* Report ID */
   -1, /* TIMESTAMP (64-bit) */
   -1, /* ...second half... */
    0, /* cycles the CS unit is starved */
    1, /* cycles the CS unit is stalled */
    2, /* cycles the VF unit is starved */
    3, /* cycles the VF unit is stalled */
    4, /* cycles the VS unit is starved */
    5, /* cycles the VS unit is stalled */
    6, /* cycles the GS unit is starved */
    7, /* cycles the GS unit is stalled */
    8, /* cycles the CL unit is starved */
    9, /* cycles the CL unit is stalled */
   10, /* cycles the SF unit is starved */
   11, /* cycles the SF unit is stalled */
   12, /* cycles the WZ unit is starved */
   13, /* cycles the WZ unit is stalled */
   14, /* Z buffer read/write */
   15, /* cycles each EU was active */
   16, /* cycles each EU was suspended */
   17, /* cycles threads loaded all EUs */
   18, /* cycles filtering active */
   19, /* cycles PS threads executed */
   20, /* subspans written to RC */
   21, /* bytes read for texture reads */
   22, /* texels returned from sampler */
   23, /* polygons not culled */
   24, /* clocks MASF has valid message */
   25, /* 64b writes/reads from RC */
   26, /* reads on dataport */
   27, /* clocks MASF has valid msg not consumed by sampler */
   28, /* cycles any EU is stalled for math */
};

static const struct gl_perf_monitor_group gen5_groups[] = {
   [OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters),
   /* Our pipeline statistics counter handling requires hardware contexts. */
};
/** @} */

/**
 * Sandybridge:
 *  @{
 *
 * A few of the counters here (A17-A20) are not included in the latest
 * documentation, but are described in the Ironlake PRM (which strangely
 * documents Sandybridge's performance counter system, not Ironlake's).
 * It's unclear whether they work or not; empirically, they appear to.
 */

/**
 * Aggregating counters A0-A28:
 */
static const struct gl_perf_monitor_counter gen6_raw_oa_counters[] = {
   /* A0:   0 */ COUNTER("Aggregated Core Array Active"),
   /* A1:   1 */ COUNTER("Aggregated Core Array Stalled"),
   /* A2:   2 */ COUNTER("Vertex Shader Active Time"),
   /* A3: Not actually hooked up on Sandybridge. */
   /* A4:   3 */ COUNTER("Vertex Shader Stall Time - Core Stall"),
   /* A5:   4 */ COUNTER("# VS threads loaded"),
   /* A6:   5 */ COUNTER("Vertex Shader Ready but not running Time"),
   /* A7:   6 */ COUNTER("Geometry Shader Active Time"),
   /* A8: Not actually hooked up on Sandybridge. */
   /* A9:   7 */ COUNTER("Geometry Shader Stall Time - Core Stall"),
   /* A10:  8 */ COUNTER("# GS threads loaded"),
   /* A11:  9 */ COUNTER("Geometry Shader Ready but not running Time"),
   /* A12: 10 */ COUNTER("Pixel Shader Active Time"),
   /* A13: Not actually hooked up on Sandybridge. */
   /* A14: 11 */ COUNTER("Pixel Shader Stall Time - Core Stall"),
   /* A15: 12 */ COUNTER("# PS threads loaded"),
   /* A16: 13 */ COUNTER("Pixel Shader Ready but not running Time"),
   /* A17: 14 */ COUNTER("Early Z Test Pixels Passing"),
   /* A18: 15 */ COUNTER("Early Z Test Pixels Failing"),
   /* A19: 16 */ COUNTER("Early Stencil Test Pixels Passing"),
   /* A20: 17 */ COUNTER("Early Stencil Test Pixels Failing"),
   /* A21: 18 */ COUNTER("Pixel Kill Count"),
   /* A22: 19 */ COUNTER("Alpha Test Pixels Failed"),
   /* A23: 20 */ COUNTER("Post PS Stencil Pixels Failed"),
   /* A24: 21 */ COUNTER("Post PS Z buffer Pixels Failed"),
   /* A25: 22 */ COUNTER("Pixels/samples Written in the frame buffer"),
   /* A26: 23 */ COUNTER("GPU Busy"),
   /* A27: 24 */ COUNTER("CL active and not stalled"),
   /* A28: 25 */ COUNTER("SF active and stalled"),
};

/**
 * Sandybridge: Counter Select = 001
 * A0   A1   A2   A3   A4   TIMESTAMP  RPT_ID
 * A5   A6   A7   A8   A9   A10  A11  A12
 * A13  A14  A15  A16  A17  A18  A19  A20
 * A21  A22  A23  A24  A25  A26  A27  A28
 *
 * (Yes, this is a strange order.)  We also have to remap for missing counters.
 */
static const int gen6_oa_snapshot_layout[] =
{
   -1, /* Report ID */
   -1, /* TIMESTAMP (64-bit) */
   -1, /* ...second half... */
    3, /* A4:  Vertex Shader Stall Time - Core Stall */
   -1, /* A3:  (not available) */
    2, /* A2:  Vertex Shader Active Time */
    1, /* A1:  Aggregated Core Array Stalled */
    0, /* A0:  Aggregated Core Array Active */
   10, /* A12: Pixel Shader Active Time */
    9, /* A11: Geometry Shader ready but not running Time */
    8, /* A10: # GS threads loaded */
    7, /* A9:  Geometry Shader Stall Time - Core Stall */
   -1, /* A8:  (not available) */
    6, /* A7:  Geometry Shader Active Time */
    5, /* A6:  Vertex Shader ready but not running Time */
    4, /* A5:  # VS Threads Loaded */
   17, /* A20: Early Stencil Test Pixels Failing */
   16, /* A19: Early Stencil Test Pixels Passing */
   15, /* A18: Early Z Test Pixels Failing */
   14, /* A17: Early Z Test Pixels Passing */
   13, /* A16: Pixel Shader ready but not running Time */
   12, /* A15: # PS threads loaded */
   11, /* A14: Pixel Shader Stall Time - Core Stall */
   -1, /* A13: (not available) */
   25, /* A28: SF active and stalled */
   24, /* A27: CL active and not stalled */
   23, /* A26: GPU Busy */
   22, /* A25: Pixels/samples Written in the frame buffer */
   21, /* A24: Post PS Z buffer Pixels Failed */
   20, /* A23: Post PS Stencil Pixels Failed */
   19, /* A22: Alpha Test Pixels Failed */
   18, /* A21: Pixel Kill Count */
};

static const struct gl_perf_monitor_counter gen6_statistics_counters[] = {
   COUNTER64("IA_VERTICES_COUNT"),
   COUNTER64("IA_PRIMITIVES_COUNT"),
   COUNTER64("VS_INVOCATION_COUNT"),
   COUNTER64("GS_INVOCATION_COUNT"),
   COUNTER64("GS_PRIMITIVES_COUNT"),
   COUNTER64("CL_INVOCATION_COUNT"),
   COUNTER64("CL_PRIMITIVES_COUNT"),
   COUNTER64("PS_INVOCATION_COUNT"),
   COUNTER64("PS_DEPTH_COUNT"),
   COUNTER64("SO_NUM_PRIMS_WRITTEN"),
   COUNTER64("SO_PRIM_STORAGE_NEEDED"),
};

/** MMIO register addresses for each pipeline statistics counter. */
static const int gen6_statistics_register_addresses[] = {
   IA_VERTICES_COUNT,
   IA_PRIMITIVES_COUNT,
   VS_INVOCATION_COUNT,
   GS_INVOCATION_COUNT,
   GS_PRIMITIVES_COUNT,
   CL_INVOCATION_COUNT,
   CL_PRIMITIVES_COUNT,
   PS_INVOCATION_COUNT,
   PS_DEPTH_COUNT,
   GEN6_SO_NUM_PRIMS_WRITTEN,
   GEN6_SO_PRIM_STORAGE_NEEDED,
};

static const struct gl_perf_monitor_group gen6_groups[] = {
   GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters),
   GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters),
};
/** @} */

/**
 * Ivybridge/Baytrail/Haswell:
 *  @{
 */
static const struct gl_perf_monitor_counter gen7_raw_oa_counters[] = {
   COUNTER("Aggregated Core Array Active"),
   COUNTER("Aggregated Core Array Stalled"),
   COUNTER("Vertex Shader Active Time"),
   COUNTER("Vertex Shader Stall Time - Core Stall"),
   COUNTER("# VS threads loaded"),
   COUNTER("Hull Shader Active Time"),
   COUNTER("Hull Shader Stall Time - Core Stall"),
   COUNTER("# HS threads loaded"),
   COUNTER("Domain Shader Active Time"),
   COUNTER("Domain Shader Stall Time - Core Stall"),
   COUNTER("# DS threads loaded"),
   COUNTER("Compute Shader Active Time"),
   COUNTER("Compute Shader Stall Time - Core Stall"),
   COUNTER("# CS threads loaded"),
   COUNTER("Geometry Shader Active Time"),
   COUNTER("Geometry Shader Stall Time - Core Stall"),
   COUNTER("# GS threads loaded"),
   COUNTER("Pixel Shader Active Time"),
   COUNTER("Pixel Shader Stall Time - Core Stall"),
   COUNTER("# PS threads loaded"),
   COUNTER("HiZ Fast Z Test Pixels Passing"),
   COUNTER("HiZ Fast Z Test Pixels Failing"),
   COUNTER("Slow Z Test Pixels Passing"),
   COUNTER("Slow Z Test Pixels Failing"),
   COUNTER("Pixel Kill Count"),
   COUNTER("Alpha Test Pixels Failed"),
   COUNTER("Post PS Stencil Pixels Failed"),
   COUNTER("Post PS Z buffer Pixels Failed"),
   COUNTER("3D/GPGPU Render Target Writes"),
   COUNTER("Render Engine Busy"),
   COUNTER("VS bottleneck"),
   COUNTER("GS bottleneck"),
};

/**
 * Ivybridge/Baytrail/Haswell: Counter Select = 101
 * A4   A3   A2   A1   A0   TIMESTAMP  ReportID
 * A12  A11  A10  A9   A8   A7   A6   A5
 * A20  A19  A18  A17  A16  A15  A14  A13
 * A28  A27  A26  A25  A24  A23  A22  A21
 * A36  A35  A34  A33  A32  A31  A30  A29
 * A44  A43  A42  A41  A40  A39  A38  A37
 * B7   B6   B5   B4   B3   B2   B1   B0
 * Rsv  Rsv  Rsv  Rsv  Rsv  Rsv  Rsv  Rsv
 */
static const int gen7_oa_snapshot_layout[] =
{
   -1, /* Report ID */
   -1, /* TIMESTAMP (64-bit) */
   -1, /* ...second half... */
    0, /* A0:  Aggregated Core Array Active */
    1, /* A1:  Aggregated Core Array Stalled */
    2, /* A2:  Vertex Shader Active Time */
   -1, /* A3:  Reserved */
    3, /* A4:  Vertex Shader Stall Time - Core Stall */
    4, /* A5:  # VS threads loaded */
   -1, /* A6:  Reserved */
    5, /* A7:  Hull Shader Active Time */
   -1, /* A8:  Reserved */
    6, /* A9:  Hull Shader Stall Time - Core Stall */
    7, /* A10: # HS threads loaded */
   -1, /* A11: Reserved */
    8, /* A12: Domain Shader Active Time */
   -1, /* A13: Reserved */
    9, /* A14: Domain Shader Stall Time - Core Stall */
   10, /* A15: # DS threads loaded */
   -1, /* A16: Reserved */
   11, /* A17: Compute Shader Active Time */
   -1, /* A18: Reserved */
   12, /* A19: Compute Shader Stall Time - Core Stall */
   13, /* A20: # CS threads loaded */
   -1, /* A21: Reserved */
   14, /* A22: Geometry Shader Active Time */
   -1, /* A23: Reserved */
   15, /* A24: Geometry Shader Stall Time - Core Stall */
   16, /* A25: # GS threads loaded */
   -1, /* A26: Reserved */
   17, /* A27: Pixel Shader Active Time */
   -1, /* A28: Reserved */
   18, /* A29: Pixel Shader Stall Time - Core Stall */
   19, /* A30: # PS threads loaded */
   -1, /* A31: Reserved */
   20, /* A32: HiZ Fast Z Test Pixels Passing */
   21, /* A33: HiZ Fast Z Test Pixels Failing */
   22, /* A34: Slow Z Test Pixels Passing */
   23, /* A35: Slow Z Test Pixels Failing */
   24, /* A36: Pixel Kill Count */
   25, /* A37: Alpha Test Pixels Failed */
   26, /* A38: Post PS Stencil Pixels Failed */
   27, /* A39: Post PS Z buffer Pixels Failed */
   28, /* A40: 3D/GPGPU Render Target Writes */
   29, /* A41: Render Engine Busy */
   30, /* A42: VS bottleneck */
   31, /* A43: GS bottleneck */
   -1, /* A44: Reserved */
   -1, /* B0 */
   -1, /* B1 */
   -1, /* B2 */
   -1, /* B3 */
   -1, /* B4 */
   -1, /* B5 */
   -1, /* B6 */
   -1, /* B7 */
   -1, /* Reserved */
   -1, /* Reserved */
   -1, /* Reserved */
   -1, /* Reserved */
   -1, /* Reserved */
   -1, /* Reserved */
   -1, /* Reserved */
   -1, /* Reserved */
};

static const struct gl_perf_monitor_counter gen7_statistics_counters[] = {
   COUNTER64("IA_VERTICES_COUNT"),
   COUNTER64("IA_PRIMITIVES_COUNT"),
   COUNTER64("VS_INVOCATION_COUNT"),
   COUNTER64("HS_INVOCATION_COUNT"),
   COUNTER64("DS_INVOCATION_COUNT"),
   COUNTER64("GS_INVOCATION_COUNT"),
   COUNTER64("GS_PRIMITIVES_COUNT"),
   COUNTER64("CL_INVOCATION_COUNT"),
   COUNTER64("CL_PRIMITIVES_COUNT"),
   COUNTER64("PS_INVOCATION_COUNT"),
   COUNTER64("PS_DEPTH_COUNT"),
   COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 0)"),
   COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 1)"),
   COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 2)"),
   COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 3)"),
   COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 0)"),
   COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 1)"),
   COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 2)"),
   COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 3)"),
};

/** MMIO register addresses for each pipeline statistics counter. */
static const int gen7_statistics_register_addresses[] = {
   IA_VERTICES_COUNT,
   IA_PRIMITIVES_COUNT,
   VS_INVOCATION_COUNT,
   HS_INVOCATION_COUNT,
   DS_INVOCATION_COUNT,
   GS_INVOCATION_COUNT,
   GS_PRIMITIVES_COUNT,
   CL_INVOCATION_COUNT,
   CL_PRIMITIVES_COUNT,
   PS_INVOCATION_COUNT,
   PS_DEPTH_COUNT,
   GEN7_SO_NUM_PRIMS_WRITTEN(0),
   GEN7_SO_NUM_PRIMS_WRITTEN(1),
   GEN7_SO_NUM_PRIMS_WRITTEN(2),
   GEN7_SO_NUM_PRIMS_WRITTEN(3),
   GEN7_SO_PRIM_STORAGE_NEEDED(0),
   GEN7_SO_PRIM_STORAGE_NEEDED(1),
   GEN7_SO_PRIM_STORAGE_NEEDED(2),
   GEN7_SO_PRIM_STORAGE_NEEDED(3),
};

static const struct gl_perf_monitor_group gen7_groups[] = {
   GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters),
   GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters),
};
/** @} */

/******************************************************************************/

static GLboolean brw_is_perf_monitor_result_available(struct gl_context *,
                                                      struct gl_perf_monitor_object *);

static void
dump_perf_monitor_callback(GLuint name, void *monitor_void, void *brw_void)
{
   struct gl_context *ctx = brw_void;
   struct gl_perf_monitor_object *m = monitor_void;
   struct brw_perf_monitor_object *monitor = monitor_void;

   DBG("%4d %-7s %-6s %-11s %-9s\n",
       name,
       m->Active ? "Active" : "",
       m->Ended ? "Ended" : "",
       brw_is_perf_monitor_result_available(ctx, m) ? "Available" : "",
       monitor->pipeline_stats_bo ? "Stats BO" : "");
}

void
brw_dump_perf_monitors(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   DBG("Monitors: (OA users = %d)\n", brw->perfmon.oa_users);
   _mesa_HashWalk(ctx->PerfMonitor.Monitors, dump_perf_monitor_callback, brw);
}

/******************************************************************************/

static bool
monitor_needs_statistics_registers(struct brw_context *brw,
                                   struct gl_perf_monitor_object *m)
{
   return brw->gen >= 6 && m->ActiveGroups[PIPELINE_STATS_COUNTERS];
}

/**
 * Take a snapshot of any monitored pipeline statistics counters.
 */
static void
snapshot_statistics_registers(struct brw_context *brw,
                              struct brw_perf_monitor_object *monitor,
                              uint32_t offset_in_bytes)
{
   struct gl_context *ctx = &brw->ctx;
   const int offset = offset_in_bytes / sizeof(uint64_t);
   const int group = PIPELINE_STATS_COUNTERS;
   const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters;

   intel_batchbuffer_emit_mi_flush(brw);

   for (int i = 0; i < num_counters; i++) {
      if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) {
         assert(ctx->PerfMonitor.Groups[group].Counters[i].Type ==
                GL_UNSIGNED_INT64_AMD);

         brw_store_register_mem64(brw, monitor->pipeline_stats_bo,
                                  brw->perfmon.statistics_registers[i],
                                  offset + i);
      }
   }
}

/**
 * Gather results from pipeline_stats_bo, storing the final values.
 *
 * This allows us to free pipeline_stats_bo (which is 4K) in favor of a much
 * smaller array of final results.
 */
static void
gather_statistics_results(struct brw_context *brw,
                          struct brw_perf_monitor_object *monitor)
{
   struct gl_context *ctx = &brw->ctx;
   const int num_counters =
      ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;

   monitor->pipeline_stats_results = calloc(num_counters, sizeof(uint64_t));

   drm_intel_bo_map(monitor->pipeline_stats_bo, false);
   uint64_t *start = monitor->pipeline_stats_bo->virtual;
   uint64_t *end = start + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint64_t));

   for (int i = 0; i < num_counters; i++) {
      monitor->pipeline_stats_results[i] = end[i] - start[i];
   }
   drm_intel_bo_unmap(monitor->pipeline_stats_bo);
   drm_intel_bo_unreference(monitor->pipeline_stats_bo);
   monitor->pipeline_stats_bo = NULL;
}

/******************************************************************************/

static bool
monitor_needs_oa(struct brw_context *brw,
                 struct gl_perf_monitor_object *m)
{
   return m->ActiveGroups[OA_COUNTERS];
}

/**
 * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot,
 * including the required PIPE_CONTROL flushes.
 *
 * Sandybridge is the worst case scenario: intel_batchbuffer_emit_mi_flush
 * expands to three PIPE_CONTROLs which are 4 DWords each.  We have to flush
 * before and after MI_REPORT_PERF_COUNT, so multiply by two.  Finally, add
 * the 3 DWords for MI_REPORT_PERF_COUNT itself.
 */
#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (3 * 4) + 3)
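/* That works out to 2 * 12 + 3 = 27 DWords, or 108 bytes of batch space. */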

/**
 * Emit an MI_REPORT_PERF_COUNT command packet.
 *
 * This writes the current OA counter values to the given buffer.
 */
static void
emit_mi_report_perf_count(struct brw_context *brw,
                          drm_intel_bo *bo,
                          uint32_t offset_in_bytes,
                          uint32_t report_id)
{
   assert(offset_in_bytes % 64 == 0);

   /* Make sure the commands to take a snapshot fit in a single batch. */
   intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
                                   RENDER_RING);
   int batch_used = brw->batch.used;

   /* Reports apparently don't always get written unless we flush first. */
   intel_batchbuffer_emit_mi_flush(brw);

   if (brw->gen == 5) {
      /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all
       * the counters.  The report ID is ignored in the second set.
       */
      BEGIN_BATCH(6);
      OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0);
      OUT_RELOC(bo,
                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset_in_bytes);
      OUT_BATCH(report_id);

      OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1);
      OUT_RELOC(bo,
                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset_in_bytes + 64);
      OUT_BATCH(report_id);
      ADVANCE_BATCH();
   } else if (brw->gen == 6) {
      BEGIN_BATCH(3);
      OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset_in_bytes | MI_COUNTER_ADDRESS_GTT);
      OUT_BATCH(report_id);
      ADVANCE_BATCH();
   } else if (brw->gen == 7) {
      BEGIN_BATCH(3);
      OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset_in_bytes);
      OUT_BATCH(report_id);
      ADVANCE_BATCH();
   } else {
      assert(!"Unsupported generation for performance counters.");
   }

   /* Reports apparently don't always get written unless we flush after. */
   intel_batchbuffer_emit_mi_flush(brw);

   (void) batch_used;
   assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
}

/******************************************************************************/

/**
 * Initialize a monitor to sane starting state; throw away old buffers.
 */
static void
reinitialize_perf_monitor(struct brw_context *brw,
                          struct brw_perf_monitor_object *monitor)
{
   if (monitor->pipeline_stats_bo) {
      drm_intel_bo_unreference(monitor->pipeline_stats_bo);
      monitor->pipeline_stats_bo = NULL;
   }

   free(monitor->pipeline_stats_results);
   monitor->pipeline_stats_results = NULL;
}

/**
 * Driver hook for glBeginPerformanceMonitorAMD().
 */
static GLboolean
brw_begin_perf_monitor(struct gl_context *ctx,
                       struct gl_perf_monitor_object *m)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);

   DBG("Begin(%d)\n", m->Name);

   reinitialize_perf_monitor(brw, monitor);

   if (monitor_needs_oa(brw, m)) {
      ++brw->perfmon.oa_users;
   }

   if (monitor_needs_statistics_registers(brw, m)) {
      monitor->pipeline_stats_bo =
         drm_intel_bo_alloc(brw->bufmgr, "perf. monitor stats bo", 4096, 64);

      /* Take starting snapshots. */
      snapshot_statistics_registers(brw, monitor, 0);
   }

   return true;
}

/**
 * Driver hook for glEndPerformanceMonitorAMD().
 */
static void
brw_end_perf_monitor(struct gl_context *ctx,
                     struct gl_perf_monitor_object *m)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);

   DBG("End(%d)\n", m->Name);

   if (monitor_needs_oa(brw, m)) {
      --brw->perfmon.oa_users;
   }

   if (monitor_needs_statistics_registers(brw, m)) {
      /* Take ending snapshots. */
      snapshot_statistics_registers(brw, monitor,
                                    SECOND_SNAPSHOT_OFFSET_IN_BYTES);
   }
}

/**
 * Reset a performance monitor, throwing away any results.
 */
static void
brw_reset_perf_monitor(struct gl_context *ctx,
                       struct gl_perf_monitor_object *m)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);

   reinitialize_perf_monitor(brw, monitor);

   if (m->Active) {
      brw_begin_perf_monitor(ctx, m);
   }
}

/**
 * Is a performance monitor result available?
 */
static GLboolean
brw_is_perf_monitor_result_available(struct gl_context *ctx,
                                     struct gl_perf_monitor_object *m)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);

   bool stats_available = true;

   if (monitor_needs_statistics_registers(brw, m)) {
      stats_available = !monitor->pipeline_stats_bo ||
         (!drm_intel_bo_references(brw->batch.bo, monitor->pipeline_stats_bo) &&
          !drm_intel_bo_busy(monitor->pipeline_stats_bo));
   }

   return stats_available;
}

/**
 * Get the performance monitor result.
 */
static void
brw_get_perf_monitor_result(struct gl_context *ctx,
                            struct gl_perf_monitor_object *m,
                            GLsizei data_size,
                            GLuint *data,
                            GLint *bytes_written)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);

   DBG("GetResult(%d)\n", m->Name);
   brw_dump_perf_monitors(brw);

   /* This hook should only be called when results are available. */
   assert(m->Ended);

   /* Copy data to the supplied array (data).
    *
    * The output data format is: <group ID, counter ID, value> for each
    * active counter.  The API allows counters to appear in any order.
    */
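   /* Each pipeline statistics counter is GL_UNSIGNED_INT64_AMD, so its value
    * occupies the two GLuints following the group and counter IDs; an enabled
    * counter therefore contributes four GLuints to the output.
    */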
   GLsizei offset = 0;

   if (monitor_needs_statistics_registers(brw, m)) {
      const int num_counters =
         ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;

      if (!monitor->pipeline_stats_results)
         gather_statistics_results(brw, monitor);

      for (int i = 0; i < num_counters; i++) {
         if (BITSET_TEST(m->ActiveCounters[PIPELINE_STATS_COUNTERS], i)) {
            data[offset++] = PIPELINE_STATS_COUNTERS;
            data[offset++] = i;
            *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i];
            offset += 2;
         }
      }
   }

   if (bytes_written)
      *bytes_written = offset * sizeof(uint32_t);
}

/**
 * Create a new performance monitor object.
 */
static struct gl_perf_monitor_object *
brw_new_perf_monitor(struct gl_context *ctx)
{
   return calloc(1, sizeof(struct brw_perf_monitor_object));
}

/**
 * Delete a performance monitor object.
 */
static void
brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
{
   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
   DBG("Delete(%d)\n", m->Name);
   reinitialize_perf_monitor(brw_context(ctx), monitor);
   free(monitor);
}

/******************************************************************************/

void
brw_init_performance_monitors(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   ctx->Driver.NewPerfMonitor = brw_new_perf_monitor;
   ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor;
   ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor;
   ctx->Driver.EndPerfMonitor = brw_end_perf_monitor;
   ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor;
   ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available;
   ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result;

   if (brw->gen == 5) {
      ctx->PerfMonitor.Groups = gen5_groups;
      ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups);
      brw->perfmon.oa_snapshot_layout = gen5_oa_snapshot_layout;
      brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen5_oa_snapshot_layout);
   } else if (brw->gen == 6) {
      ctx->PerfMonitor.Groups = gen6_groups;
      ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen6_groups);
      brw->perfmon.oa_snapshot_layout = gen6_oa_snapshot_layout;
      brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen6_oa_snapshot_layout);
      brw->perfmon.statistics_registers = gen6_statistics_register_addresses;
   } else if (brw->gen == 7) {
      ctx->PerfMonitor.Groups = gen7_groups;
      ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen7_groups);
      brw->perfmon.oa_snapshot_layout = gen7_oa_snapshot_layout;
      brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout);
      brw->perfmon.statistics_registers = gen7_statistics_register_addresses;
   }
}