1 /*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
25 * \file brw_performance_monitor.c
26 *
27 * Implementation of the GL_AMD_performance_monitor extension.
28 *
29 * On Gen5+ hardware, we have two sources of performance counter data:
30 * the Observability Architecture counters (MI_REPORT_PERF_COUNT), and
31 * the Pipeline Statistics Registers. We expose both sets of raw data,
32 * as well as some useful processed values.
33 *
34 * The Observability Architecture (OA) counters for Gen6+ are documented
35 * separately from the rest of the PRMs; that documentation is available at:
36 * https://01.org/linuxgraphics/documentation/driver-documentation-prms
37 * => 2013 Intel Core Processor Family => Observability Performance Counters
38 * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell.)
39 *
40 * On Ironlake, the OA counters were called "CHAPS" counters. Sadly, no public
41 * documentation exists; our implementation is based on the source code for the
42 * intel_perf_counters utility (which is available as part of intel-gpu-tools).
43 */
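/*
 * For reference, a rough sketch of how an application might drive the
 * GL_AMD_performance_monitor API that this file implements.  This is not code
 * from this driver or from any particular application; the entry points and
 * enums come from the extension spec, and counter discovery and error
 * handling are omitted for brevity:
 *
 *    GLuint m;
 *    glGenPerfMonitorsAMD(1, &m);
 *
 *    GLuint counter = 0;  // e.g. the first counter of group 0
 *    glSelectPerfMonitorCountersAMD(m, GL_TRUE, 0, 1, &counter);
 *
 *    glBeginPerfMonitorAMD(m);
 *    // ... issue rendering ...
 *    glEndPerfMonitorAMD(m);
 *
 *    GLuint available = 0;
 *    glGetPerfMonitorCounterDataAMD(m, GL_PERF_MONITOR_RESULT_AVAILABLE_AMD,
 *                                   sizeof(available), &available, NULL);
 *    if (available) {
 *       GLuint data[64];
 *       GLint bytes;
 *       glGetPerfMonitorCounterDataAMD(m, GL_PERF_MONITOR_RESULT_AMD,
 *                                      sizeof(data), data, &bytes);
 *       // data contains <group ID, counter ID, value> tuples; see
 *       // brw_get_perf_monitor_result() below for the exact layout.
 *    }
 */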
44
45 #include <limits.h>
46
47 #include "util/bitset.h"
48 #include "main/hash.h"
49 #include "main/macros.h"
50 #include "main/mtypes.h"
51 #include "main/performance_monitor.h"
52
53 #include "util/ralloc.h"
54
55 #include "brw_context.h"
56 #include "brw_defines.h"
57 #include "intel_batchbuffer.h"
58
59 #define FILE_DEBUG_FLAG DEBUG_PERFMON
60
61 /**
62 * i965 representation of a performance monitor object.
63 */
64 struct brw_perf_monitor_object
65 {
66 /** The base class. */
67 struct gl_perf_monitor_object base;
68
69 /**
70 * BO containing OA counter snapshots at monitor Begin/End time.
71 */
72 drm_intel_bo *oa_bo;
73
74 /** Indexes into bookend_bo (snapshot numbers) for various segments. */
75 int oa_head_end;
76 int oa_middle_start;
77 int oa_tail_start;
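/* A value of -1 in the fields above means that segment is not applicable:
 * it either hasn't been recorded yet, has already been accumulated into
 * oa_results, or isn't needed because monitoring was contained within a
 * single batch (see gather_oa_results()).
 */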
78
79 /**
80 * Storage for OA results accumulated so far.
81 *
82 * An array indexed by the counter ID in the OA_COUNTERS group.
83 *
84 * When we run out of space in bookend_bo, we compute the results so far
85 * and add them to the value stored here. Then, we can discard bookend_bo.
86 */
87 uint32_t *oa_results;
88
89 /**
90 * BO containing starting and ending snapshots for any active pipeline
91 * statistics counters.
92 */
93 drm_intel_bo *pipeline_stats_bo;
94
95 /**
96 * Storage for final pipeline statistics counter results.
97 */
98 uint64_t *pipeline_stats_results;
99 };
100
101 /** Downcasting convenience macro. */
102 static inline struct brw_perf_monitor_object *
103 brw_perf_monitor(struct gl_perf_monitor_object *m)
104 {
105 return (struct brw_perf_monitor_object *) m;
106 }
107
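/* Each monitor BO (oa_bo / pipeline_stats_bo) stores its Begin-time snapshot
 * at offset 0 and its End-time snapshot at the offset below.
 */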
108 #define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048
109
110 /* A random value used to ensure we're getting valid snapshots. */
111 #define REPORT_ID 0xd2e9c607
112
113 /******************************************************************************/
114
115 #define COUNTER(name) \
116 { \
117 .Name = name, \
118 .Type = GL_UNSIGNED_INT, \
119 .Minimum = { .u32 = 0 }, \
120 .Maximum = { .u32 = ~0 }, \
121 }
122
123 #define COUNTER64(name) \
124 { \
125 .Name = name, \
126 .Type = GL_UNSIGNED_INT64_AMD, \
127 .Minimum = { .u64 = 0 }, \
128 .Maximum = { .u64 = ~0 }, \
129 }
130
131 #define GROUP(name, max_active, counter_list) \
132 { \
133 .Name = name, \
134 .MaxActiveCounters = max_active, \
135 .Counters = counter_list, \
136 .NumCounters = ARRAY_SIZE(counter_list), \
137 }
138
139 /** Performance Monitor Group IDs */
140 enum brw_counter_groups {
141 OA_COUNTERS, /* Observability Architecture (MI_REPORT_PERF_COUNT) Counters */
142 PIPELINE_STATS_COUNTERS, /* Pipeline Statistics Register Counters */
143 };
144
145 /**
146 * Ironlake:
147 * @{
148 *
149 * The list of CHAPS counters unfortunately does not appear in any public
150 * documentation, but is available by reading the source code for the
151 * intel_perf_counters utility (shipped as part of intel-gpu-tools).
152 */
153 static const struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = {
154 COUNTER("cycles the CS unit is starved"),
155 COUNTER("cycles the CS unit is stalled"),
156 COUNTER("cycles the VF unit is starved"),
157 COUNTER("cycles the VF unit is stalled"),
158 COUNTER("cycles the VS unit is starved"),
159 COUNTER("cycles the VS unit is stalled"),
160 COUNTER("cycles the GS unit is starved"),
161 COUNTER("cycles the GS unit is stalled"),
162 COUNTER("cycles the CL unit is starved"),
163 COUNTER("cycles the CL unit is stalled"),
164 COUNTER("cycles the SF unit is starved"),
165 COUNTER("cycles the SF unit is stalled"),
166 COUNTER("cycles the WZ unit is starved"),
167 COUNTER("cycles the WZ unit is stalled"),
168 COUNTER("Z buffer read/write"),
169 COUNTER("cycles each EU was active"),
170 COUNTER("cycles each EU was suspended"),
171 COUNTER("cycles threads loaded all EUs"),
172 COUNTER("cycles filtering active"),
173 COUNTER("cycles PS threads executed"),
174 COUNTER("subspans written to RC"),
175 COUNTER("bytes read for texture reads"),
176 COUNTER("texels returned from sampler"),
177 COUNTER("polygons not culled"),
178 COUNTER("clocks MASF has valid message"),
179 COUNTER("64b writes/reads from RC"),
180 COUNTER("reads on dataport"),
181 COUNTER("clocks MASF has valid msg not consumed by sampler"),
182 COUNTER("cycles any EU is stalled for math"),
183 };
184
185 static const int gen5_oa_snapshot_layout[] =
186 {
187 -1, /* Report ID */
188 -1, /* TIMESTAMP (64-bit) */
189 -1, /* ...second half... */
190 0, /* cycles the CS unit is starved */
191 1, /* cycles the CS unit is stalled */
192 2, /* cycles the VF unit is starved */
193 3, /* cycles the VF unit is stalled */
194 4, /* cycles the VS unit is starved */
195 5, /* cycles the VS unit is stalled */
196 6, /* cycles the GS unit is starved */
197 7, /* cycles the GS unit is stalled */
198 8, /* cycles the CL unit is starved */
199 9, /* cycles the CL unit is stalled */
200 10, /* cycles the SF unit is starved */
201 11, /* cycles the SF unit is stalled */
202 12, /* cycles the WZ unit is starved */
203 13, /* cycles the WZ unit is stalled */
204 14, /* Z buffer read/write */
205 15, /* cycles each EU was active */
206 16, /* cycles each EU was suspended */
207 17, /* cycles threads loaded all EUs */
208 18, /* cycles filtering active */
209 19, /* cycles PS threads executed */
210 20, /* subspans written to RC */
211 21, /* bytes read for texture reads */
212 22, /* texels returned from sampler */
213 23, /* polygons not culled */
214 24, /* clocks MASF has valid message */
215 25, /* 64b writes/reads from RC */
216 26, /* reads on dataport */
217 27, /* clocks MASF has valid msg not consumed by sampler */
218 28, /* cycles any EU is stalled for math */
219 };
220
221 static const struct gl_perf_monitor_group gen5_groups[] = {
222 [OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters),
223 /* Our pipeline statistics counter handling requires hardware contexts. */
224 };
225 /** @} */
226
227 /**
228 * Sandybridge:
229 * @{
230 *
231 * A few of the counters here (A17-A20) are not included in the latest
232 * documentation, but are described in the Ironlake PRM (which strangely
233 * documents Sandybridge's performance counter system, not Ironlake's).
234 * It's unclear whether they work or not; empirically, they appear to.
235 */
236
237 /**
238 * Aggregating counters A0-A28:
239 */
240 static const struct gl_perf_monitor_counter gen6_raw_oa_counters[] = {
241 /* A0: 0 */ COUNTER("Aggregated Core Array Active"),
242 /* A1: 1 */ COUNTER("Aggregated Core Array Stalled"),
243 /* A2: 2 */ COUNTER("Vertex Shader Active Time"),
244 /* A3: Not actually hooked up on Sandybridge. */
245 /* A4: 3 */ COUNTER("Vertex Shader Stall Time - Core Stall"),
246 /* A5: 4 */ COUNTER("# VS threads loaded"),
247 /* A6: 5 */ COUNTER("Vertex Shader Ready but not running Time"),
248 /* A7: 6 */ COUNTER("Geometry Shader Active Time"),
249 /* A8: Not actually hooked up on Sandybridge. */
250 /* A9: 7 */ COUNTER("Geometry Shader Stall Time - Core Stall"),
251 /* A10: 8 */ COUNTER("# GS threads loaded"),
252 /* A11: 9 */ COUNTER("Geometry Shader Ready but not running Time"),
253 /* A12: 10 */ COUNTER("Pixel Shader Active Time"),
254 /* A13: Not actually hooked up on Sandybridge. */
255 /* A14: 11 */ COUNTER("Pixel Shader Stall Time - Core Stall"),
256 /* A15: 12 */ COUNTER("# PS threads loaded"),
257 /* A16: 13 */ COUNTER("Pixel Shader Ready but not running Time"),
258 /* A17: 14 */ COUNTER("Early Z Test Pixels Passing"),
259 /* A18: 15 */ COUNTER("Early Z Test Pixels Failing"),
260 /* A19: 16 */ COUNTER("Early Stencil Test Pixels Passing"),
261 /* A20: 17 */ COUNTER("Early Stencil Test Pixels Failing"),
262 /* A21: 18 */ COUNTER("Pixel Kill Count"),
263 /* A22: 19 */ COUNTER("Alpha Test Pixels Failed"),
264 /* A23: 20 */ COUNTER("Post PS Stencil Pixels Failed"),
265 /* A24: 21 */ COUNTER("Post PS Z buffer Pixels Failed"),
266 /* A25: 22 */ COUNTER("Pixels/samples Written in the frame buffer"),
267 /* A26: 23 */ COUNTER("GPU Busy"),
268 /* A27: 24 */ COUNTER("CL active and not stalled"),
269 /* A28: 25 */ COUNTER("SF active and stalled"),
270 };
271
272 /**
273 * Sandybridge: Counter Select = 001
274 * A0 A1 A2 A3 A4 TIMESTAMP RPT_ID
275 * A5 A6 A7 A8 A9 A10 A11 A12
276 * A13 A14 A15 A16 A17 A18 A19 A20
277 * A21 A22 A23 A24 A25 A26 A27 A28
278 *
279 * (Yes, this is a strange order.) We also have to remap for missing counters.
280 */
281 static const int gen6_oa_snapshot_layout[] =
282 {
283 -1, /* Report ID */
284 -1, /* TIMESTAMP (64-bit) */
285 -1, /* ...second half... */
286 3, /* A4: Vertex Shader Stall Time - Core Stall */
287 -1, /* A3: (not available) */
288 2, /* A2: Vertex Shader Active Time */
289 1, /* A1: Aggregated Core Array Stalled */
290 0, /* A0: Aggregated Core Array Active */
291 10, /* A12: Pixel Shader Active Time */
292 9, /* A11: Geometry Shader ready but not running Time */
293 8, /* A10: # GS threads loaded */
294 7, /* A9: Geometry Shader Stall Time - Core Stall */
295 -1, /* A8: (not available) */
296 6, /* A7: Geometry Shader Active Time */
297 5, /* A6: Vertex Shader ready but not running Time */
298 4, /* A5: # VS Threads Loaded */
299 17, /* A20: Early Stencil Test Pixels Failing */
300 16, /* A19: Early Stencil Test Pixels Passing */
301 15, /* A18: Early Z Test Pixels Failing */
302 14, /* A17: Early Z Test Pixels Passing */
303 13, /* A16: Pixel Shader ready but not running Time */
304 12, /* A15: # PS threads loaded */
305 11, /* A14: Pixel Shader Stall Time - Core Stall */
306 -1, /* A13: (not available) */
307 25, /* A28: SF active and stalled */
308 24, /* A27: CL active and not stalled */
309 23, /* A26: GPU Busy */
310 22, /* A25: Pixels/samples Written in the frame buffer */
311 21, /* A24: Post PS Z buffer Pixels Failed */
312 20, /* A23: Post PS Stencil Pixels Failed */
313 19, /* A22: Alpha Test Pixels Failed */
314 18, /* A21: Pixel Kill Count */
315 };
316
317 static const struct gl_perf_monitor_counter gen6_statistics_counters[] = {
318 COUNTER64("IA_VERTICES_COUNT"),
319 COUNTER64("IA_PRIMITIVES_COUNT"),
320 COUNTER64("VS_INVOCATION_COUNT"),
321 COUNTER64("GS_INVOCATION_COUNT"),
322 COUNTER64("GS_PRIMITIVES_COUNT"),
323 COUNTER64("CL_INVOCATION_COUNT"),
324 COUNTER64("CL_PRIMITIVES_COUNT"),
325 COUNTER64("PS_INVOCATION_COUNT"),
326 COUNTER64("PS_DEPTH_COUNT"),
327 COUNTER64("SO_NUM_PRIMS_WRITTEN"),
328 COUNTER64("SO_PRIM_STORAGE_NEEDED"),
329 };
330
331 /** MMIO register addresses for each pipeline statistics counter. */
332 static const int gen6_statistics_register_addresses[] = {
333 IA_VERTICES_COUNT,
334 IA_PRIMITIVES_COUNT,
335 VS_INVOCATION_COUNT,
336 GS_INVOCATION_COUNT,
337 GS_PRIMITIVES_COUNT,
338 CL_INVOCATION_COUNT,
339 CL_PRIMITIVES_COUNT,
340 PS_INVOCATION_COUNT,
341 PS_DEPTH_COUNT,
342 GEN6_SO_NUM_PRIMS_WRITTEN,
343 GEN6_SO_PRIM_STORAGE_NEEDED,
344 };
345
346 static const struct gl_perf_monitor_group gen6_groups[] = {
347 GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters),
348 GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters),
349 };
350 /** @} */
351
352 /**
353 * Ivybridge/Baytrail/Haswell:
354 * @{
355 */
356 static const struct gl_perf_monitor_counter gen7_raw_oa_counters[] = {
357 COUNTER("Aggregated Core Array Active"),
358 COUNTER("Aggregated Core Array Stalled"),
359 COUNTER("Vertex Shader Active Time"),
360 COUNTER("Vertex Shader Stall Time - Core Stall"),
361 COUNTER("# VS threads loaded"),
362 COUNTER("Hull Shader Active Time"),
363 COUNTER("Hull Shader Stall Time - Core Stall"),
364 COUNTER("# HS threads loaded"),
365 COUNTER("Domain Shader Active Time"),
366 COUNTER("Domain Shader Stall Time - Core Stall"),
367 COUNTER("# DS threads loaded"),
368 COUNTER("Compute Shader Active Time"),
369 COUNTER("Compute Shader Stall Time - Core Stall"),
370 COUNTER("# CS threads loaded"),
371 COUNTER("Geometry Shader Active Time"),
372 COUNTER("Geometry Shader Stall Time - Core Stall"),
373 COUNTER("# GS threads loaded"),
374 COUNTER("Pixel Shader Active Time"),
375 COUNTER("Pixel Shader Stall Time - Core Stall"),
376 COUNTER("# PS threads loaded"),
377 COUNTER("HiZ Fast Z Test Pixels Passing"),
378 COUNTER("HiZ Fast Z Test Pixels Failing"),
379 COUNTER("Slow Z Test Pixels Passing"),
380 COUNTER("Slow Z Test Pixels Failing"),
381 COUNTER("Pixel Kill Count"),
382 COUNTER("Alpha Test Pixels Failed"),
383 COUNTER("Post PS Stencil Pixels Failed"),
384 COUNTER("Post PS Z buffer Pixels Failed"),
385 COUNTER("3D/GPGPU Render Target Writes"),
386 COUNTER("Render Engine Busy"),
387 COUNTER("VS bottleneck"),
388 COUNTER("GS bottleneck"),
389 };
390
391 /**
392 * Ivybridge/Baytrail/Haswell: Counter Select = 101
393 * A4 A3 A2 A1 A0 TIMESTAMP ReportID
394 * A12 A11 A10 A9 A8 A7 A6 A5
395 * A20 A19 A18 A17 A16 A15 A14 A13
396 * A28 A27 A26 A25 A24 A23 A22 A21
397 * A36 A35 A34 A33 A32 A31 A30 A29
398 * A44 A43 A42 A41 A40 A39 A38 A37
399 * B7 B6 B5 B4 B3 B2 B1 B0
400 * Rsv Rsv Rsv Rsv Rsv Rsv Rsv Rsv
401 */
402 static const int gen7_oa_snapshot_layout[] =
403 {
404 -1, /* Report ID */
405 -1, /* TIMESTAMP (64-bit) */
406 -1, /* ...second half... */
407 0, /* A0: Aggregated Core Array Active */
408 1, /* A1: Aggregated Core Array Stalled */
409 2, /* A2: Vertex Shader Active Time */
410 -1, /* A3: Reserved */
411 3, /* A4: Vertex Shader Stall Time - Core Stall */
412 4, /* A5: # VS threads loaded */
413 -1, /* A6: Reserved */
414 5, /* A7: Hull Shader Active Time */
415 -1, /* A8: Reserved */
416 6, /* A9: Hull Shader Stall Time - Core Stall */
417 7, /* A10: # HS threads loaded */
418 -1, /* A11: Reserved */
419 8, /* A12: Domain Shader Active Time */
420 -1, /* A13: Reserved */
421 9, /* A14: Domain Shader Stall Time - Core Stall */
422 10, /* A15: # DS threads loaded */
423 -1, /* A16: Reserved */
424 11, /* A17: Compute Shader Active Time */
425 -1, /* A18: Reserved */
426 12, /* A19: Compute Shader Stall Time - Core Stall */
427 13, /* A20: # CS threads loaded */
428 -1, /* A21: Reserved */
429 14, /* A22: Geometry Shader Active Time */
430 -1, /* A23: Reserved */
431 15, /* A24: Geometry Shader Stall Time - Core Stall */
432 16, /* A25: # GS threads loaded */
433 -1, /* A26: Reserved */
434 17, /* A27: Pixel Shader Active Time */
435 -1, /* A28: Reserved */
436 18, /* A29: Pixel Shader Stall Time - Core Stall */
437 19, /* A30: # PS threads loaded */
438 -1, /* A31: Reserved */
439 20, /* A32: HiZ Fast Z Test Pixels Passing */
440 21, /* A33: HiZ Fast Z Test Pixels Failing */
441 22, /* A34: Slow Z Test Pixels Passing */
442 23, /* A35: Slow Z Test Pixels Failing */
443 24, /* A36: Pixel Kill Count */
444 25, /* A37: Alpha Test Pixels Failed */
445 26, /* A38: Post PS Stencil Pixels Failed */
446 27, /* A39: Post PS Z buffer Pixels Failed */
447 28, /* A40: 3D/GPGPU Render Target Writes */
448 29, /* A41: Render Engine Busy */
449 30, /* A42: VS bottleneck */
450 31, /* A43: GS bottleneck */
451 -1, /* A44: Reserved */
452 -1, /* B0 */
453 -1, /* B1 */
454 -1, /* B2 */
455 -1, /* B3 */
456 -1, /* B4 */
457 -1, /* B5 */
458 -1, /* B6 */
459 -1, /* B7 */
460 -1, /* Reserved */
461 -1, /* Reserved */
462 -1, /* Reserved */
463 -1, /* Reserved */
464 -1, /* Reserved */
465 -1, /* Reserved */
466 -1, /* Reserved */
467 -1, /* Reserved */
468 };
469
470 static const struct gl_perf_monitor_counter gen7_statistics_counters[] = {
471 COUNTER64("IA_VERTICES_COUNT"),
472 COUNTER64("IA_PRIMITIVES_COUNT"),
473 COUNTER64("VS_INVOCATION_COUNT"),
474 COUNTER64("HS_INVOCATION_COUNT"),
475 COUNTER64("DS_INVOCATION_COUNT"),
476 COUNTER64("GS_INVOCATION_COUNT"),
477 COUNTER64("GS_PRIMITIVES_COUNT"),
478 COUNTER64("CL_INVOCATION_COUNT"),
479 COUNTER64("CL_PRIMITIVES_COUNT"),
480 COUNTER64("PS_INVOCATION_COUNT"),
481 COUNTER64("PS_DEPTH_COUNT"),
482 COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 0)"),
483 COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 1)"),
484 COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 2)"),
485 COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 3)"),
486 COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 0)"),
487 COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 1)"),
488 COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 2)"),
489 COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 3)"),
490 };
491
492 /** MMIO register addresses for each pipeline statistics counter. */
493 static const int gen7_statistics_register_addresses[] = {
494 IA_VERTICES_COUNT,
495 IA_PRIMITIVES_COUNT,
496 VS_INVOCATION_COUNT,
497 HS_INVOCATION_COUNT,
498 DS_INVOCATION_COUNT,
499 GS_INVOCATION_COUNT,
500 GS_PRIMITIVES_COUNT,
501 CL_INVOCATION_COUNT,
502 CL_PRIMITIVES_COUNT,
503 PS_INVOCATION_COUNT,
504 PS_DEPTH_COUNT,
505 GEN7_SO_NUM_PRIMS_WRITTEN(0),
506 GEN7_SO_NUM_PRIMS_WRITTEN(1),
507 GEN7_SO_NUM_PRIMS_WRITTEN(2),
508 GEN7_SO_NUM_PRIMS_WRITTEN(3),
509 GEN7_SO_PRIM_STORAGE_NEEDED(0),
510 GEN7_SO_PRIM_STORAGE_NEEDED(1),
511 GEN7_SO_PRIM_STORAGE_NEEDED(2),
512 GEN7_SO_PRIM_STORAGE_NEEDED(3),
513 };
514
515 static const struct gl_perf_monitor_group gen7_groups[] = {
516 GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters),
517 GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters),
518 };
519 /** @} */
520
521 /******************************************************************************/
522
523 static GLboolean brw_is_perf_monitor_result_available(struct gl_context *, struct gl_perf_monitor_object *);
524
525 static void
526 dump_perf_monitor_callback(GLuint name, void *monitor_void, void *brw_void)
527 {
528 struct brw_context *brw = brw_void;
529 struct gl_context *ctx = brw_void;
530 struct gl_perf_monitor_object *m = monitor_void;
531 struct brw_perf_monitor_object *monitor = monitor_void;
532
533 const char *resolved = "";
534 for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
535 if (brw->perfmon.unresolved[i] == monitor) {
536 resolved = "Unresolved";
537 break;
538 }
539 }
540
541 DBG("%4d %-7s %-6s %-10s %-11s <%3d, %3d, %3d> %-6s %-9s\n",
542 name,
543 m->Active ? "Active" : "",
544 m->Ended ? "Ended" : "",
545 resolved,
546 brw_is_perf_monitor_result_available(ctx, m) ? "Available" : "",
547 monitor->oa_head_end,
548 monitor->oa_middle_start,
549 monitor->oa_tail_start,
550 monitor->oa_bo ? "OA BO" : "",
551 monitor->pipeline_stats_bo ? "Stats BO" : "");
552 }
553
554 void
555 brw_dump_perf_monitors(struct brw_context *brw)
556 {
557 struct gl_context *ctx = &brw->ctx;
558 DBG("Monitors: (OA users = %d)\n", brw->perfmon.oa_users);
559 _mesa_HashWalk(ctx->PerfMonitor.Monitors, dump_perf_monitor_callback, brw);
560 }
561
562 /******************************************************************************/
563
564 static bool
565 monitor_needs_statistics_registers(struct brw_context *brw,
566 struct gl_perf_monitor_object *m)
567 {
568 return brw->gen >= 6 && m->ActiveGroups[PIPELINE_STATS_COUNTERS];
569 }
570
571 /**
572 * Take a snapshot of any monitored pipeline statistics counters.
573 */
574 static void
575 snapshot_statistics_registers(struct brw_context *brw,
576 struct brw_perf_monitor_object *monitor,
577 uint32_t offset_in_bytes)
578 {
579 struct gl_context *ctx = &brw->ctx;
580 const int offset = offset_in_bytes / sizeof(uint64_t);
581 const int group = PIPELINE_STATS_COUNTERS;
582 const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters;
583
584 brw_emit_mi_flush(brw);
585
586 for (int i = 0; i < num_counters; i++) {
587 if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) {
588 assert(ctx->PerfMonitor.Groups[group].Counters[i].Type ==
589 GL_UNSIGNED_INT64_AMD);
590
591 brw_store_register_mem64(brw, monitor->pipeline_stats_bo,
592 brw->perfmon.statistics_registers[i],
593 offset + i);
594 }
595 }
596 }
597
598 /**
599 * Gather results from pipeline_stats_bo, storing the final values.
600 *
601 * This allows us to free pipeline_stats_bo (which is 4K) in favor of a much
602 * smaller array of final results.
603 */
604 static void
605 gather_statistics_results(struct brw_context *brw,
606 struct brw_perf_monitor_object *monitor)
607 {
608 struct gl_context *ctx = &brw->ctx;
609 const int num_counters =
610 ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;
611
612 monitor->pipeline_stats_results = calloc(num_counters, sizeof(uint64_t));
613 if (monitor->pipeline_stats_results == NULL) {
614 _mesa_error_no_memory(__func__);
615 return;
616 }
617
618 drm_intel_bo_map(monitor->pipeline_stats_bo, false);
619 uint64_t *start = monitor->pipeline_stats_bo->virtual;
620 uint64_t *end = start + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint64_t));
621
622 for (int i = 0; i < num_counters; i++) {
623 monitor->pipeline_stats_results[i] = end[i] - start[i];
624 }
625 drm_intel_bo_unmap(monitor->pipeline_stats_bo);
626 drm_intel_bo_unreference(monitor->pipeline_stats_bo);
627 monitor->pipeline_stats_bo = NULL;
628 }
629
630 /******************************************************************************/
631
632 static bool
633 monitor_needs_oa(struct brw_context *brw,
634 struct gl_perf_monitor_object *m)
635 {
636 return m->ActiveGroups[OA_COUNTERS];
637 }
638
639 /**
640 * Enable the Observability Architecture counters by whacking OACONTROL.
641 */
642 static void
643 start_oa_counters(struct brw_context *brw)
644 {
645 unsigned counter_format;
646
647 /* Pick the counter format which gives us all the counters. */
648 switch (brw->gen) {
649 case 5:
650 return; /* Ironlake counters are always running. */
651 case 6:
652 counter_format = 0b001;
653 break;
654 case 7:
655 counter_format = 0b101;
656 break;
657 default:
658 unreachable("Tried to enable OA counters on an unsupported generation.");
659 }
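/* These select values correspond to the "Counter Select = 001" (Gen6) and
 * "Counter Select = 101" (Gen7) notes above the gen6/gen7 OA snapshot
 * layout tables.
 */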
660
661 BEGIN_BATCH(3);
662 OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
663 OUT_BATCH(OACONTROL);
664 OUT_BATCH(counter_format << OACONTROL_COUNTER_SELECT_SHIFT |
665 OACONTROL_ENABLE_COUNTERS);
666 ADVANCE_BATCH();
667 }
668
669 /**
670 * Disable OA counters.
671 */
672 static void
673 stop_oa_counters(struct brw_context *brw)
674 {
675 /* Ironlake counters never stop. */
676 if (brw->gen == 5)
677 return;
678
679 BEGIN_BATCH(3);
680 OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
681 OUT_BATCH(OACONTROL);
682 OUT_BATCH(0);
683 ADVANCE_BATCH();
684 }
685
686 /**
687 * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot,
688 * including the required PIPE_CONTROL flushes.
689 *
690 * Sandybridge is the worst case scenario: brw_emit_mi_flush
691 * expands to three PIPE_CONTROLs which are 4 DWords each. We have to flush
692 * before and after MI_REPORT_PERF_COUNT, so multiply by two. Finally, add
693 * the 3 DWords for MI_REPORT_PERF_COUNT itself.
694 */
695 #define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (3 * 4) + 3)
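/* That is, for the Sandybridge worst case: 2 flushes * 3 PIPE_CONTROLs
 * * 4 DWords, plus 3 DWords for the report itself = 27 DWords.
 */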
696
697 /**
698 * Emit an MI_REPORT_PERF_COUNT command packet.
699 *
700 * This writes the current OA counter values to buffer.
701 */
702 static void
703 emit_mi_report_perf_count(struct brw_context *brw,
704 drm_intel_bo *bo,
705 uint32_t offset_in_bytes,
706 uint32_t report_id)
707 {
708 assert(offset_in_bytes % 64 == 0);
709
710 /* Make sure the commands to take a snapshot fits in a single batch. */
711 intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
712 RENDER_RING);
713 int batch_used = USED_BATCH(brw->batch);
714
715 /* Reports apparently don't always get written unless we flush first. */
716 brw_emit_mi_flush(brw);
717
718 if (brw->gen == 5) {
719 /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all
720 * the counters. The report ID is ignored in the second set.
721 */
722 BEGIN_BATCH(6);
723 OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0);
724 OUT_RELOC(bo,
725 I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
726 offset_in_bytes);
727 OUT_BATCH(report_id);
728
729 OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1);
730 OUT_RELOC(bo,
731 I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
732 offset_in_bytes + 64);
733 OUT_BATCH(report_id);
734 ADVANCE_BATCH();
735 } else if (brw->gen == 6) {
736 BEGIN_BATCH(3);
737 OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
738 OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
739 offset_in_bytes | MI_COUNTER_ADDRESS_GTT);
740 OUT_BATCH(report_id);
741 ADVANCE_BATCH();
742 } else if (brw->gen == 7) {
743 BEGIN_BATCH(3);
744 OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
745 OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
746 offset_in_bytes);
747 OUT_BATCH(report_id);
748 ADVANCE_BATCH();
749 } else {
750 unreachable("Unsupported generation for performance counters.");
751 }
752
753 /* Reports apparently don't always get written unless we flush after. */
754 brw_emit_mi_flush(brw);
755
756 (void) batch_used;
757 assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
758 }
759
760 /**
761 * Add a monitor to the global list of "unresolved monitors."
762 *
763 * Monitors are "unresolved" if they refer to OA counter snapshots in
764 * bookend_bo. Results (even partial ones) must be gathered for all
765 * unresolved monitors before it's safe to discard bookend_bo.
766 */
767 static void
768 add_to_unresolved_monitor_list(struct brw_context *brw,
769 struct brw_perf_monitor_object *monitor)
770 {
771 if (brw->perfmon.unresolved_elements >=
772 brw->perfmon.unresolved_array_size) {
773 brw->perfmon.unresolved_array_size *= 2;
774 brw->perfmon.unresolved = reralloc(brw, brw->perfmon.unresolved,
775 struct brw_perf_monitor_object *,
776 brw->perfmon.unresolved_array_size);
777 }
778
779 brw->perfmon.unresolved[brw->perfmon.unresolved_elements++] = monitor;
780 }
781
782 /**
783 * If possible, throw away the contents of bookend BO.
784 *
785 * When all monitoring stops, and no monitors need data from bookend_bo to
786 * compute results, we can discard it and start writing snapshots at the
787 * beginning again. This helps reduce the amount of buffer wraparound.
788 */
789 static void
790 clean_bookend_bo(struct brw_context *brw)
791 {
792 if (brw->perfmon.unresolved_elements == 0) {
793 DBG("***Resetting bookend snapshots to 0\n");
794 brw->perfmon.bookend_snapshots = 0;
795 }
796 }
797
798 /**
799 * Remove a monitor from the global list of "unresolved monitors."
800 *
801 * This can happen when:
802 * - We finish computing a completed monitor's results.
803 * - We discard unwanted monitor results.
804 * - A monitor's results can be computed without relying on bookend_bo.
805 */
806 static void
807 drop_from_unresolved_monitor_list(struct brw_context *brw,
808 struct brw_perf_monitor_object *monitor)
809 {
810 for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
811 if (brw->perfmon.unresolved[i] == monitor) {
812 int last_elt = --brw->perfmon.unresolved_elements;
813
814 if (i == last_elt) {
815 brw->perfmon.unresolved[i] = NULL;
816 } else {
817 brw->perfmon.unresolved[i] = brw->perfmon.unresolved[last_elt];
818 }
819
820 clean_bookend_bo(brw);
821 return;
822 }
823 }
824 }
825
826 /**
827 * Given pointers to starting and ending OA snapshots, add the deltas for each
828 * counter to the results.
829 */
830 static void
831 add_deltas(struct brw_context *brw,
832 struct brw_perf_monitor_object *monitor,
833 uint32_t *start, uint32_t *end)
834 {
835 /* Look for expected report ID values to ensure data is present. */
836 assert(start[0] == REPORT_ID);
837 assert(end[0] == REPORT_ID);
838
839 /* Subtract each counter's ending and starting values, then add the
840 * difference to the counter's value so far.
841 */
842 for (int i = 3; i < brw->perfmon.entries_per_oa_snapshot; i++) {
843 /* When debugging, it's useful to note when the ending value is less than
844 * the starting value; aggregating counters should always increase in
845 * value (or remain unchanged). This happens periodically due to
846 * wraparound, but can also indicate serious problems.
847 */
848 #ifdef DEBUG
849 if (end[i] < start[i]) {
850 int counter = brw->perfmon.oa_snapshot_layout[i];
851 if (counter >= 0) {
852 DBG("WARNING: \"%s\" ending value was less than the starting "
853 "value: %u < %u (end - start = %u)\n",
854 brw->ctx.PerfMonitor.Groups[0].Counters[counter].Name,
855 end[i], start[i], end[i] - start[i]);
856 }
857 }
858 #endif
859 monitor->oa_results[i] += end[i] - start[i];
860 }
861 }
862
863 /**
864 * Gather OA counter results (partial or full) from a series of snapshots.
865 *
866 * Monitoring can start or stop at any time, likely at some point mid-batch.
867 * We write snapshots for both events, storing them in monitor->oa_bo.
868 *
869 * Ideally, we would simply subtract those two snapshots to obtain the final
870 * counter results. Unfortunately, our hardware doesn't preserve their values
871 * across context switches or GPU sleep states. In order to support multiple
872 * concurrent OA clients, as well as reliable data across power management,
873 * we have to take snapshots at the start and end of batches as well.
874 *
875 * This results in a three-part sequence of (start, end) intervals:
876 * - The "head" is from the BeginPerfMonitor snapshot to the end of the first
877 * batchbuffer.
878 * - The "middle" is a series of (batch start, batch end) snapshots which
879 * bookend any batchbuffers between the ones which start/end monitoring.
880 * - The "tail" is from the start of the last batch where monitoring was
881 * active to the EndPerfMonitor snapshot.
882 *
883 * Due to wrapping in the bookend BO, we may have to accumulate partial results.
884 * If so, we handle the "head" and any "middle" results so far. When monitoring
885 * eventually ends, we handle additional "middle" batches and the "tail."
886 */
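/*
 * Schematically, with B(n) denoting snapshot n in bookend_bo (this is just a
 * restatement of the intervals described above, not additional behavior):
 *
 *    head:   oa_bo Begin snapshot -> B(oa_head_end)   [end of the first batch]
 *    middle: B(s) -> B(s+1), for s = oa_middle_start, oa_middle_start + 2, ...
 *    tail:   B(oa_tail_start) -> oa_bo End snapshot   [start of the last batch
 *                                                      through EndPerfMonitor]
 *
 * The indices are maintained by brw_begin_perf_monitor(),
 * brw_perf_monitor_new_batch()/brw_perf_monitor_finish_batch(), and
 * brw_end_perf_monitor().
 */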
887 static void
888 gather_oa_results(struct brw_context *brw,
889 struct brw_perf_monitor_object *monitor,
890 uint32_t *bookend_buffer)
891 {
892 struct gl_perf_monitor_object *m = &monitor->base;
893 assert(monitor->oa_bo != NULL);
894
895 drm_intel_bo_map(monitor->oa_bo, false);
896 uint32_t *monitor_buffer = monitor->oa_bo->virtual;
897
898 /* If monitoring was entirely contained within a single batch, then the
899 * bookend BO is irrelevant. Just subtract monitor->oa_bo's two snapshots.
900 */
901 if (monitor->oa_middle_start == -1) {
902 add_deltas(brw, monitor,
903 monitor_buffer,
904 monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
905 sizeof(uint32_t)));
906 drm_intel_bo_unmap(monitor->oa_bo);
907 return;
908 }
909
910 const ptrdiff_t snapshot_size = brw->perfmon.entries_per_oa_snapshot;
911
912 /* First, add the contributions from the "head" interval:
913 * (snapshot taken at BeginPerfMonitor time,
914 * snapshot taken at the end of the first batch after monitoring began)
915 */
916 if (monitor->oa_head_end != -1) {
917 assert(monitor->oa_head_end < brw->perfmon.bookend_snapshots);
918 add_deltas(brw, monitor,
919 monitor_buffer,
920 bookend_buffer + snapshot_size * monitor->oa_head_end);
921
922 /* Make sure we don't count the "head" again in the future. */
923 monitor->oa_head_end = -1;
924 }
925
926 /* Next, count the contributions from the "middle" batches. These are
927 * (batch begin, batch end) deltas while monitoring was active.
928 */
929 int last_snapshot;
930 if (m->Ended)
931 last_snapshot = monitor->oa_tail_start;
932 else
933 last_snapshot = brw->perfmon.bookend_snapshots;
934
935 for (int s = monitor->oa_middle_start; s < last_snapshot; s += 2) {
936 add_deltas(brw, monitor,
937 bookend_buffer + snapshot_size * s,
938 bookend_buffer + snapshot_size * (s + 1));
939 }
940
941 /* Finally, if the monitor has ended, we need to count the contributions of
942 * the "tail" interval:
943 * (start of the batch where monitoring ended, EndPerfMonitor snapshot)
944 */
945 if (m->Ended) {
946 assert(monitor->oa_tail_start != -1);
947 add_deltas(brw, monitor,
948 bookend_buffer + snapshot_size * monitor->oa_tail_start,
949 monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
950 sizeof(uint32_t)));
951 }
952
953 drm_intel_bo_unmap(monitor->oa_bo);
954
955 /* If the monitor has ended, then we've gathered all the results, and
956 * can free the monitor's OA BO.
957 */
958 if (m->Ended) {
959 drm_intel_bo_unreference(monitor->oa_bo);
960 monitor->oa_bo = NULL;
961
962 /* The monitor's OA result is now resolved. */
963 DBG("Marking %d resolved - results gathered\n", m->Name);
964 drop_from_unresolved_monitor_list(brw, monitor);
965 }
966 }
967
968 /**
969 * Handle running out of space in the bookend BO.
970 *
971 * When we run out of space in the bookend BO, we need to gather up partial
972 * results for every unresolved monitor. This allows us to free the snapshot
973 * data in bookend_bo, freeing up the space for reuse. We call this "wrapping."
974 *
975 * This will completely compute the result for any unresolved monitors that
976 * have ended.
977 */
978 static void
979 wrap_bookend_bo(struct brw_context *brw)
980 {
981 DBG("****Wrap bookend BO****\n");
982 /* Note that wrapping will only occur at the start of a batch, since that's
983 * where we reserve space. So the current batch won't reference bookend_bo
984 * or any monitor BOs. This means we don't need to worry about
985 * synchronization.
986 *
987 * Also, EndPerfMonitor guarantees that only monitors which span multiple
988 * batches exist in the unresolved monitor list.
989 */
990 assert(brw->perfmon.oa_users > 0);
991
992 drm_intel_bo_map(brw->perfmon.bookend_bo, false);
993 uint32_t *bookend_buffer = brw->perfmon.bookend_bo->virtual;
994 for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
995 struct brw_perf_monitor_object *monitor = brw->perfmon.unresolved[i];
996 struct gl_perf_monitor_object *m = &monitor->base;
997
998 gather_oa_results(brw, monitor, bookend_buffer);
999
1000 if (m->Ended) {
1001 /* gather_oa_results() dropped the monitor from the unresolved list,
1002 * throwing our indices off by one.
1003 */
1004 --i;
1005 } else {
1006 /* When we create the new bookend_bo, snapshot #0 will be the
1007 * beginning of another "middle" BO.
1008 */
1009 monitor->oa_middle_start = 0;
1010 assert(monitor->oa_head_end == -1);
1011 assert(monitor->oa_tail_start == -1);
1012 }
1013 }
1014 drm_intel_bo_unmap(brw->perfmon.bookend_bo);
1015
1016 brw->perfmon.bookend_snapshots = 0;
1017 }
1018
1019 /* This is fairly arbitrary; the trade-off is memory usage vs. extra overhead
1020 * from wrapping. On Gen7, 32768 should be enough for 128 snapshots before
1021 * wrapping (since each is 256 bytes).
1022 */
1023 #define BOOKEND_BO_SIZE_BYTES 32768
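/* On Gen7, a snapshot is 64 counters * 4 bytes = 256 bytes, so this gives
 * 128 snapshots, i.e. 64 begin/end bookend pairs, between wraps.
 */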
1024
1025 /**
1026 * Check whether bookend_bo has space for a given number of snapshots.
1027 */
1028 static bool
1029 has_space_for_bookend_snapshots(struct brw_context *brw, int snapshots)
1030 {
1031 int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t);
1032
1033 /* There are brw->perfmon.bookend_snapshots - 1 existing snapshots. */
1034 int total_snapshots = (brw->perfmon.bookend_snapshots - 1) + snapshots;
1035
1036 return total_snapshots * snapshot_bytes < BOOKEND_BO_SIZE_BYTES;
1037 }
1038
1039 /**
1040 * Write an OA counter snapshot to bookend_bo.
1041 */
1042 static void
1043 emit_bookend_snapshot(struct brw_context *brw)
1044 {
1045 int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t);
1046 int offset_in_bytes = brw->perfmon.bookend_snapshots * snapshot_bytes;
1047
1048 emit_mi_report_perf_count(brw, brw->perfmon.bookend_bo, offset_in_bytes,
1049 REPORT_ID);
1050 ++brw->perfmon.bookend_snapshots;
1051 }
1052
1053 /******************************************************************************/
1054
1055 /**
1056 * Initialize a monitor to sane starting state; throw away old buffers.
1057 */
1058 static void
1059 reinitialize_perf_monitor(struct brw_context *brw,
1060 struct brw_perf_monitor_object *monitor)
1061 {
1062 if (monitor->oa_bo) {
1063 drm_intel_bo_unreference(monitor->oa_bo);
1064 monitor->oa_bo = NULL;
1065 }
1066
1067 /* Since the results are now invalid, we don't need to hold on to any
1068 * snapshots in bookend_bo. The monitor is effectively "resolved."
1069 */
1070 drop_from_unresolved_monitor_list(brw, monitor);
1071
1072 monitor->oa_head_end = -1;
1073 monitor->oa_middle_start = -1;
1074 monitor->oa_tail_start = -1;
1075
1076 free(monitor->oa_results);
1077 monitor->oa_results = NULL;
1078
1079 if (monitor->pipeline_stats_bo) {
1080 drm_intel_bo_unreference(monitor->pipeline_stats_bo);
1081 monitor->pipeline_stats_bo = NULL;
1082 }
1083
1084 free(monitor->pipeline_stats_results);
1085 monitor->pipeline_stats_results = NULL;
1086 }
1087
1088 /**
1089 * Driver hook for glBeginPerformanceMonitorAMD().
1090 */
1091 static GLboolean
1092 brw_begin_perf_monitor(struct gl_context *ctx,
1093 struct gl_perf_monitor_object *m)
1094 {
1095 struct brw_context *brw = brw_context(ctx);
1096 struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1097
1098 DBG("Begin(%d)\n", m->Name);
1099
1100 reinitialize_perf_monitor(brw, monitor);
1101
1102 if (monitor_needs_oa(brw, m)) {
1103 /* If the global OA bookend BO doesn't exist, allocate it. This should
1104 * only happen once, but we delay until BeginPerfMonitor time to avoid
1105 * wasting memory for contexts that don't use performance monitors.
1106 */
1107 if (!brw->perfmon.bookend_bo) {
1108 brw->perfmon.bookend_bo = drm_intel_bo_alloc(brw->bufmgr,
1109 "OA bookend BO",
1110 BOOKEND_BO_SIZE_BYTES, 64);
1111 }
1112
1113 monitor->oa_bo =
1114 drm_intel_bo_alloc(brw->bufmgr, "perf. monitor OA bo", 4096, 64);
1115 #ifdef DEBUG
1116 /* Pre-filling the BO helps debug whether writes landed. */
1117 drm_intel_bo_map(monitor->oa_bo, true);
1118 memset((char *) monitor->oa_bo->virtual, 0xff, 4096);
1119 drm_intel_bo_unmap(monitor->oa_bo);
1120 #endif
1121
1122 /* Allocate storage for accumulated OA counter values. */
1123 monitor->oa_results =
1124 calloc(brw->perfmon.entries_per_oa_snapshot, sizeof(uint32_t));
1125
1126 /* If the OA counters aren't already on, enable them. */
1127 if (brw->perfmon.oa_users == 0) {
1128 /* Ensure the OACONTROL enable and snapshot land in the same batch. */
1129 int space = (MI_REPORT_PERF_COUNT_BATCH_DWORDS + 3) * 4;
1130 intel_batchbuffer_require_space(brw, space, RENDER_RING);
1131 start_oa_counters(brw);
1132 }
1133
1134 /* Take a starting OA counter snapshot. */
1135 emit_mi_report_perf_count(brw, monitor->oa_bo, 0, REPORT_ID);
1136
1137 monitor->oa_head_end = brw->perfmon.bookend_snapshots;
1138 monitor->oa_middle_start = brw->perfmon.bookend_snapshots + 1;
1139 monitor->oa_tail_start = -1;
1140
1141 /* Add the monitor to the unresolved list. */
1142 add_to_unresolved_monitor_list(brw, monitor);
1143
1144 ++brw->perfmon.oa_users;
1145 }
1146
1147 if (monitor_needs_statistics_registers(brw, m)) {
1148 monitor->pipeline_stats_bo =
1149 drm_intel_bo_alloc(brw->bufmgr, "perf. monitor stats bo", 4096, 64);
1150
1151 /* Take starting snapshots. */
1152 snapshot_statistics_registers(brw, monitor, 0);
1153 }
1154
1155 return true;
1156 }
1157
1158 /**
1159 * Driver hook for glEndPerformanceMonitorAMD().
1160 */
1161 static void
1162 brw_end_perf_monitor(struct gl_context *ctx,
1163 struct gl_perf_monitor_object *m)
1164 {
1165 struct brw_context *brw = brw_context(ctx);
1166 struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1167
1168 DBG("End(%d)\n", m->Name);
1169
1170 if (monitor_needs_oa(brw, m)) {
1171 /* Take an ending OA counter snapshot. */
1172 emit_mi_report_perf_count(brw, monitor->oa_bo,
1173 SECOND_SNAPSHOT_OFFSET_IN_BYTES, REPORT_ID);
1174
1175 --brw->perfmon.oa_users;
1176
1177 if (brw->perfmon.oa_users == 0)
1178 stop_oa_counters(brw);
1179
1180 if (monitor->oa_head_end == brw->perfmon.bookend_snapshots) {
1181 assert(monitor->oa_head_end != -1);
1182 /* We never actually wrote the snapshot for the end of the first batch
1183 * after BeginPerfMonitor. This means that monitoring was contained
1184 * entirely within a single batch, so we can ignore bookend_bo and
1185 * just compare the monitor's begin/end snapshots directly.
1186 */
1187 monitor->oa_head_end = -1;
1188 monitor->oa_middle_start = -1;
1189 monitor->oa_tail_start = -1;
1190
1191 /* We can also mark it resolved since it won't depend on bookend_bo. */
1192 DBG("Marking %d resolved - entirely in one batch\n", m->Name);
1193 drop_from_unresolved_monitor_list(brw, monitor);
1194 } else {
1195 /* We've written at least one batch end snapshot, so the monitoring
1196 * spanned multiple batches. Mark which snapshot corresponds to the
1197 * start of the current batch.
1198 */
1199 monitor->oa_tail_start = brw->perfmon.bookend_snapshots - 1;
1200 }
1201 }
1202
1203 if (monitor_needs_statistics_registers(brw, m)) {
1204 /* Take ending snapshots. */
1205 snapshot_statistics_registers(brw, monitor,
1206 SECOND_SNAPSHOT_OFFSET_IN_BYTES);
1207 }
1208 }
1209
1210 /**
1211 * Reset a performance monitor, throwing away any results.
1212 */
1213 static void
1214 brw_reset_perf_monitor(struct gl_context *ctx,
1215 struct gl_perf_monitor_object *m)
1216 {
1217 struct brw_context *brw = brw_context(ctx);
1218 struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1219
1220 reinitialize_perf_monitor(brw, monitor);
1221
1222 if (m->Active) {
1223 brw_begin_perf_monitor(ctx, m);
1224 }
1225 }
1226
1227 /**
1228 * Is a performance monitor result available?
1229 */
1230 static GLboolean
1231 brw_is_perf_monitor_result_available(struct gl_context *ctx,
1232 struct gl_perf_monitor_object *m)
1233 {
1234 struct brw_context *brw = brw_context(ctx);
1235 struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1236
1237 bool oa_available = true;
1238 bool stats_available = true;
1239
1240 if (monitor_needs_oa(brw, m)) {
1241 oa_available = !monitor->oa_bo ||
1242 (!drm_intel_bo_references(brw->batch.bo, monitor->oa_bo) &&
1243 !drm_intel_bo_busy(monitor->oa_bo));
1244 }
1245
1246 if (monitor_needs_statistics_registers(brw, m)) {
1247 stats_available = !monitor->pipeline_stats_bo ||
1248 (!drm_intel_bo_references(brw->batch.bo, monitor->pipeline_stats_bo) &&
1249 !drm_intel_bo_busy(monitor->pipeline_stats_bo));
1250 }
1251
1252 return oa_available && stats_available;
1253 }
1254
1255 /**
1256 * Get the performance monitor result.
1257 */
1258 static void
1259 brw_get_perf_monitor_result(struct gl_context *ctx,
1260 struct gl_perf_monitor_object *m,
1261 GLsizei data_size,
1262 GLuint *data,
1263 GLint *bytes_written)
1264 {
1265 struct brw_context *brw = brw_context(ctx);
1266 struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1267 const GLuint *const data_end = (GLuint *)((uint8_t *) data + data_size);
1268
1269 DBG("GetResult(%d)\n", m->Name);
1270 brw_dump_perf_monitors(brw);
1271
1272 /* This hook should only be called when results are available. */
1273 assert(m->Ended);
1274
1275 /* Copy data to the supplied array (data).
1276 *
1277 * The output data format is: <group ID, counter ID, value> for each
1278 * active counter. The API allows counters to appear in any order.
1279 */
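   /* Each entry is 3 GLuints for a GL_UNSIGNED_INT counter (all the OA
    * counters) or 4 GLuints for a GL_UNSIGNED_INT64_AMD counter: group ID,
    * counter ID, then the value (in two GLuints for the 64-bit case).
    */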
1280 GLsizei offset = 0;
1281
1282 if (monitor_needs_oa(brw, m)) {
1283 /* Gather up the results from the BO, unless we already did due to the
1284 * bookend BO wrapping.
1285 */
1286 if (monitor->oa_bo) {
1287 /* Since the result is available, all the necessary snapshots will
1288 * have been written to the bookend BO. If other monitors are
1289 * active, the bookend BO may be busy or referenced by the current
1290 * batch, but only for writing snapshots beyond oa_tail_start,
1291 * which we don't care about.
1292 *
1293 * Using an unsynchronized mapping avoids stalling for an
1294 * indeterminate amount of time.
1295 */
1296 drm_intel_gem_bo_map_unsynchronized(brw->perfmon.bookend_bo);
1297
1298 gather_oa_results(brw, monitor, brw->perfmon.bookend_bo->virtual);
1299
1300 drm_intel_bo_unmap(brw->perfmon.bookend_bo);
1301 }
1302
1303 for (int i = 0; i < brw->perfmon.entries_per_oa_snapshot; i++) {
1304 int group = OA_COUNTERS;
1305 int counter = brw->perfmon.oa_snapshot_layout[i];
1306
1307 /* We always capture all the OA counters, but the application may
1308 * have only asked for a subset. Skip unwanted counters.
1309 */
1310 if (counter < 0 || !BITSET_TEST(m->ActiveCounters[group], counter))
1311 continue;
1312
1313 if (data + offset + 3 <= data_end) {
1314 data[offset++] = group;
1315 data[offset++] = counter;
1316 data[offset++] = monitor->oa_results[i];
1317 }
1318 }
1319
1320 clean_bookend_bo(brw);
1321 }
1322
1323 if (monitor_needs_statistics_registers(brw, m)) {
1324 const int num_counters =
1325 ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;
1326
1327 if (!monitor->pipeline_stats_results) {
1328 gather_statistics_results(brw, monitor);
1329
1330 /* Check if we did really get the results */
1331 if (!monitor->pipeline_stats_results) {
1332 if (bytes_written) {
1333 *bytes_written = 0;
1334 }
1335 return;
1336 }
1337 }
1338
1339 for (int i = 0; i < num_counters; i++) {
1340 if (BITSET_TEST(m->ActiveCounters[PIPELINE_STATS_COUNTERS], i)) {
1341 if (data + offset + 4 <= data_end) {
1342 data[offset++] = PIPELINE_STATS_COUNTERS;
1343 data[offset++] = i;
1344 *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i];
1345 offset += 2;
1346 }
1347 }
1348 }
1349 }
1350
1351 if (bytes_written)
1352 *bytes_written = offset * sizeof(uint32_t);
1353 }
1354
1355 /**
1356 * Create a new performance monitor object.
1357 */
1358 static struct gl_perf_monitor_object *
1359 brw_new_perf_monitor(struct gl_context *ctx)
1360 {
1361 (void) ctx;
1362 return calloc(1, sizeof(struct brw_perf_monitor_object));
1363 }
1364
1365 /**
1366 * Delete a performance monitor object.
1367 */
1368 static void
1369 brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
1370 {
1371 struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1372 DBG("Delete(%d)\n", m->Name);
1373 reinitialize_perf_monitor(brw_context(ctx), monitor);
1374 free(monitor);
1375 }
1376
1377 /******************************************************************************/
1378
1379 /**
1380 * Called at the start of every render ring batch.
1381 *
1382 * Enable OA counters and emit the "start of batchbuffer" bookend OA snapshot.
1383 * Since it's a new batch, there will be plenty of space for the commands.
1384 */
1385 void
1386 brw_perf_monitor_new_batch(struct brw_context *brw)
1387 {
1388 assert(brw->batch.ring == RENDER_RING);
1389 assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0);
1390
1391 if (brw->perfmon.oa_users == 0)
1392 return;
1393
1394 start_oa_counters(brw);
1395
1396 /* Make sure bookend_bo has enough space for a pair of snapshots.
1397 * If not, "wrap" the BO: gather up any results so far, and start from
1398 * the beginning of the buffer. Reserving a pair guarantees that wrapping
1399 * will only happen at the beginning of a batch, where it's safe to map BOs
1400 * (as the batch is empty and can't refer to any of them yet).
1401 */
1402 if (!has_space_for_bookend_snapshots(brw, 2))
1403 wrap_bookend_bo(brw);
1404
1405 DBG("Bookend Begin Snapshot (%d)\n", brw->perfmon.bookend_snapshots);
1406 emit_bookend_snapshot(brw);
1407 }
1408
1409 /**
1410 * Called at the end of every render ring batch.
1411 *
1412 * Emit the "end of batchbuffer" bookend OA snapshot and disable the counters.
1413 *
1414 * This relies on there being enough space in BATCH_RESERVED.
1415 */
1416 void
1417 brw_perf_monitor_finish_batch(struct brw_context *brw)
1418 {
1419 assert(brw->batch.ring == RENDER_RING);
1420
1421 if (brw->perfmon.oa_users == 0)
1422 return;
1423
1424 DBG("Bookend End Snapshot (%d)\n", brw->perfmon.bookend_snapshots);
1425
1426 /* Not safe to wrap; should've reserved space already. */
1427 assert(has_space_for_bookend_snapshots(brw, 1));
1428
1429 emit_bookend_snapshot(brw);
1430
1431 stop_oa_counters(brw);
1432 }
1433
1434 /******************************************************************************/
1435
1436 void
1437 brw_init_performance_monitors(struct brw_context *brw)
1438 {
1439 struct gl_context *ctx = &brw->ctx;
1440
1441 ctx->Driver.NewPerfMonitor = brw_new_perf_monitor;
1442 ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor;
1443 ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor;
1444 ctx->Driver.EndPerfMonitor = brw_end_perf_monitor;
1445 ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor;
1446 ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available;
1447 ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result;
1448
1449 if (brw->gen == 5) {
1450 ctx->PerfMonitor.Groups = gen5_groups;
1451 ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups);
1452 brw->perfmon.oa_snapshot_layout = gen5_oa_snapshot_layout;
1453 brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen5_oa_snapshot_layout);
1454 } else if (brw->gen == 6) {
1455 ctx->PerfMonitor.Groups = gen6_groups;
1456 ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen6_groups);
1457 brw->perfmon.oa_snapshot_layout = gen6_oa_snapshot_layout;
1458 brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen6_oa_snapshot_layout);
1459 brw->perfmon.statistics_registers = gen6_statistics_register_addresses;
1460 } else if (brw->gen == 7) {
1461 ctx->PerfMonitor.Groups = gen7_groups;
1462 ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen7_groups);
1463 brw->perfmon.oa_snapshot_layout = gen7_oa_snapshot_layout;
1464 brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout);
1465 brw->perfmon.statistics_registers = gen7_statistics_register_addresses;
1466 }
1467
1468 brw->perfmon.unresolved =
1469 ralloc_array(brw, struct brw_perf_monitor_object *, 1);
1470 brw->perfmon.unresolved_elements = 0;
1471 brw->perfmon.unresolved_array_size = 1;
1472 }