src/mesa/drivers/dri/i965/brw_performance_monitor.c

   1 /*
   2  * Copyright © 2013 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 /**
  25  * \file brw_performance_monitor.c
  26  *
  27  * Implementation of the GL_AMD_performance_monitor extension.
  28  *
  29  * On Gen5+ hardware, we have two sources of performance counter data:
  30  * the Observability Architecture counters (MI_REPORT_PERF_COUNT), and
  31  * the Pipeline Statistics Registers.  We expose both sets of raw data,
  32  * as well as some useful processed values.
  33  *
  34  * The Observability Architecture (OA) counters for Gen6+ are documented
  35  * in a separate document from the rest of the PRMs.  It is available at:
  36  * https://01.org/linuxgraphics/documentation/driver-documentation-prms
  37  * => 2013 Intel Core Processor Family => Observability Performance Counters
  38  * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell.)
  39  *
  40  * On Ironlake, the OA counters were called "CHAPS" counters.  Sadly, no public
  41  * documentation exists; our implementation is based on the source code for the
  42  * intel_perf_counters utility (which is available as part of intel-gpu-tools).
  43  */
  44
  45 #include <limits.h>
  46
  47 #include "util/bitset.h"
  48 #include "main/hash.h"
  49 #include "main/macros.h"
  50 #include "main/mtypes.h"
  51 #include "main/performance_monitor.h"
  52
  53 #include "util/ralloc.h"
  54
  55 #include "brw_context.h"
  56 #include "brw_defines.h"
  57 #include "intel_batchbuffer.h"
  58
  59 #define FILE_DEBUG_FLAG DEBUG_PERFMON
  60
/**
 * i965 representation of a performance monitor object.
 *
 * Extends the core gl_perf_monitor_object with the buffer objects and result
 * arrays used for the OA counters and the pipeline statistics counters.
 */
struct brw_perf_monitor_object
{
   /**
    * The base class.
    *
    * Must remain the first member: brw_perf_monitor() converts a
    * gl_perf_monitor_object pointer to this type with a plain cast.
    */
   struct gl_perf_monitor_object base;

   /**
    * BO containing OA counter snapshots at monitor Begin/End time.
    */
   drm_intel_bo *oa_bo;

   /** Indexes into bookend_bo (snapshot numbers) for various segments. */
   int oa_head_end;
   int oa_middle_start;
   int oa_tail_start;

   /**
    * Storage for OA results accumulated so far.
    *
    * An array indexed by the counter ID in the OA_COUNTERS group.
    *
    * When we run out of space in bookend_bo, we compute the results so far
    * and add them to the value stored here.  Then, we can discard bookend_bo.
    */
   uint32_t *oa_results;

   /**
    * BO containing starting and ending snapshots for any active pipeline
    * statistics counters.
    *
    * gather_statistics_results() reads the starting snapshot from offset 0
    * and the ending snapshot from SECOND_SNAPSHOT_OFFSET_IN_BYTES, then
    * releases this BO and resets the pointer to NULL.
    */
   drm_intel_bo *pipeline_stats_bo;

   /**
    * Storage for final pipeline statistics counter results.
    *
    * Allocated by gather_statistics_results() with one uint64_t per counter
    * in the PIPELINE_STATS_COUNTERS group; each element holds the ending
    * value minus the starting value.
    */
   uint64_t *pipeline_stats_results;
};
 100
 101 /** Downcasting convenience macro. */
 102 static inline struct brw_perf_monitor_object *
 103 brw_perf_monitor(struct gl_perf_monitor_object *m)
 104 {
 105    return (struct brw_perf_monitor_object *) m;
 106 }
 107
/*
 * Byte offset of the second (ending) snapshot within pipeline_stats_bo, as
 * read back by gather_statistics_results(); the first (starting) snapshot
 * begins at offset 0.
 */
#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048

/*
 * A random value used to ensure we're getting valid snapshots.
 *
 * add_deltas() asserts that the first DWord of every OA snapshot it is given
 * equals this value.
 */
#define REPORT_ID 0xd2e9c607
 112
 113 /******************************************************************************/
 114
 115 #define COUNTER(name)           \
 116    {                            \
 117       .Name = name,             \
 118       .Type = GL_UNSIGNED_INT,  \
 119       .Minimum = { .u32 =  0 }, \
 120       .Maximum = { .u32 = ~0 }, \
 121    }
 122
 123 #define COUNTER64(name)              \
 124    {                                 \
 125       .Name = name,                  \
 126       .Type = GL_UNSIGNED_INT64_AMD, \
 127       .Minimum = { .u64 =  0 },      \
 128       .Maximum = { .u64 = ~0 },      \
 129    }
 130
 131 #define GROUP(name, max_active, counter_list)  \
 132    {                                           \
 133       .Name = name,                            \
 134       .MaxActiveCounters = max_active,         \
 135       .Counters = counter_list,                \
 136       .NumCounters = ARRAY_SIZE(counter_list), \
 137    }
 138
 139 /** Performance Monitor Group IDs */
 140 enum brw_counter_groups {
 141    OA_COUNTERS, /* Observability Architecture (MI_REPORT_PERF_COUNT) Counters */
 142    PIPELINE_STATS_COUNTERS, /* Pipeline Statistics Register Counters */
 143 };
 144
 145 /**
 146  * Ironlake:
 147  *  @{
 148  *
 149  * The list of CHAPS counters unfortunately does not appear in any public
 150  * documentation, but is available by reading the source code for the
 151  * intel_perf_counters utility (shipped as part of intel-gpu-tools).
 152  */
/*
 * Names of the 29 Ironlake CHAPS counters.  The leading comment on each
 * entry is its array index, which is the value gen5_oa_snapshot_layout
 * stores for the corresponding snapshot DWord.
 */
static const struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = {
   /*  0 */ COUNTER("cycles the CS unit is starved"),
   /*  1 */ COUNTER("cycles the CS unit is stalled"),
   /*  2 */ COUNTER("cycles the VF unit is starved"),
   /*  3 */ COUNTER("cycles the VF unit is stalled"),
   /*  4 */ COUNTER("cycles the VS unit is starved"),
   /*  5 */ COUNTER("cycles the VS unit is stalled"),
   /*  6 */ COUNTER("cycles the GS unit is starved"),
   /*  7 */ COUNTER("cycles the GS unit is stalled"),
   /*  8 */ COUNTER("cycles the CL unit is starved"),
   /*  9 */ COUNTER("cycles the CL unit is stalled"),
   /* 10 */ COUNTER("cycles the SF unit is starved"),
   /* 11 */ COUNTER("cycles the SF unit is stalled"),
   /* 12 */ COUNTER("cycles the WZ unit is starved"),
   /* 13 */ COUNTER("cycles the WZ unit is stalled"),
   /* 14 */ COUNTER("Z buffer read/write"),
   /* 15 */ COUNTER("cycles each EU was active"),
   /* 16 */ COUNTER("cycles each EU was suspended"),
   /* 17 */ COUNTER("cycles threads loaded all EUs"),
   /* 18 */ COUNTER("cycles filtering active"),
   /* 19 */ COUNTER("cycles PS threads executed"),
   /* 20 */ COUNTER("subspans written to RC"),
   /* 21 */ COUNTER("bytes read for texture reads"),
   /* 22 */ COUNTER("texels returned from sampler"),
   /* 23 */ COUNTER("polygons not culled"),
   /* 24 */ COUNTER("clocks MASF has valid message"),
   /* 25 */ COUNTER("64b writes/reads from RC"),
   /* 26 */ COUNTER("reads on dataport"),
   /* 27 */ COUNTER("clocks MASF has valid msg not consumed by sampler"),
   /* 28 */ COUNTER("cycles any EU is stalled for math"),
};
 184
 185 static const int gen5_oa_snapshot_layout[] =
 186 {
 187    -1, /* Report ID */
 188    -1, /* TIMESTAMP (64-bit) */
 189    -1, /* ...second half... */
 190     0, /* cycles the CS unit is starved */
 191     1, /* cycles the CS unit is stalled */
 192     2, /* cycles the VF unit is starved */
 193     3, /* cycles the VF unit is stalled */
 194     4, /* cycles the VS unit is starved */
 195     5, /* cycles the VS unit is stalled */
 196     6, /* cycles the GS unit is starved */
 197     7, /* cycles the GS unit is stalled */
 198     8, /* cycles the CL unit is starved */
 199     9, /* cycles the CL unit is stalled */
 200    10, /* cycles the SF unit is starved */
 201    11, /* cycles the SF unit is stalled */
 202    12, /* cycles the WZ unit is starved */
 203    13, /* cycles the WZ unit is stalled */
 204    14, /* Z buffer read/write */
 205    15, /* cycles each EU was active */
 206    16, /* cycles each EU was suspended */
 207    17, /* cycles threads loaded all EUs */
 208    18, /* cycles filtering active */
 209    19, /* cycles PS threads executed */
 210    20, /* subspans written to RC */
 211    21, /* bytes read for texture reads */
 212    22, /* texels returned from sampler */
 213    23, /* polygons not culled */
 214    24, /* clocks MASF has valid message */
 215    25, /* 64b writes/reads from RC */
 216    26, /* reads on dataport */
 217    27, /* clocks MASF has valid msg not consumed by sampler */
 218    28, /* cycles any EU is stalled for math */
 219 };
 220
 221 static const struct gl_perf_monitor_group gen5_groups[] = {
 222    [OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters),
 223    /* Our pipeline statistics counter handling requires hardware contexts. */
 224 };
 225 /** @} */
 226
 227 /**
 228  * Sandybridge:
 229  *  @{
 230  *
 231  * A few of the counters here (A17-A20) are not included in the latest
 232  * documentation, but are described in the Ironlake PRM (which strangely
 233  * documents Sandybridge's performance counter system, not Ironlake's).
 234  * It's unclear whether they work or not; empirically, they appear to.
 235  */
 236
/**
 * Aggregating counters A0-A28.
 *
 * Each entry is annotated "<hardware counter>: <array index>".  A3, A8 and
 * A13 have no entry, so array indices fall behind the hardware counter
 * numbers; gen6_oa_snapshot_layout refers to entries by array index.
 */
static const struct gl_perf_monitor_counter gen6_raw_oa_counters[] = {
   /* A0:   0 */ COUNTER("Aggregated Core Array Active"),
   /* A1:   1 */ COUNTER("Aggregated Core Array Stalled"),
   /* A2:   2 */ COUNTER("Vertex Shader Active Time"),
   /* A3: Not actually hooked up on Sandybridge. */
   /* A4:   3 */ COUNTER("Vertex Shader Stall Time - Core Stall"),
   /* A5:   4 */ COUNTER("# VS threads loaded"),
   /* A6:   5 */ COUNTER("Vertex Shader Ready but not running Time"),
   /* A7:   6 */ COUNTER("Geometry Shader Active Time"),
   /* A8: Not actually hooked up on Sandybridge. */
   /* A9:   7 */ COUNTER("Geometry Shader Stall Time - Core Stall"),
   /* A10:  8 */ COUNTER("# GS threads loaded"),
   /* A11:  9 */ COUNTER("Geometry Shader Ready but not running Time"),
   /* A12: 10 */ COUNTER("Pixel Shader Active Time"),
   /* A13: Not actually hooked up on Sandybridge. */
   /* A14: 11 */ COUNTER("Pixel Shader Stall Time - Core Stall"),
   /* A15: 12 */ COUNTER("# PS threads loaded"),
   /* A16: 13 */ COUNTER("Pixel Shader Ready but not running Time"),
   /* A17: 14 */ COUNTER("Early Z Test Pixels Passing"),
   /* A18: 15 */ COUNTER("Early Z Test Pixels Failing"),
   /* A19: 16 */ COUNTER("Early Stencil Test Pixels Passing"),
   /* A20: 17 */ COUNTER("Early Stencil Test Pixels Failing"),
   /* A21: 18 */ COUNTER("Pixel Kill Count"),
   /* A22: 19 */ COUNTER("Alpha Test Pixels Failed"),
   /* A23: 20 */ COUNTER("Post PS Stencil Pixels Failed"),
   /* A24: 21 */ COUNTER("Post PS Z buffer Pixels Failed"),
   /* A25: 22 */ COUNTER("Pixels/samples Written in the frame buffer"),
   /* A26: 23 */ COUNTER("GPU Busy"),
   /* A27: 24 */ COUNTER("CL active and not stalled"),
   /* A28: 25 */ COUNTER("SF active and stalled"),
};
 271
 272 /**
 273  * Sandybridge: Counter Select = 001
 274  * A0   A1   A2   A3   A4   TIMESTAMP RPT_ID
 275  * A5   A6   A7   A8   A9   A10  A11  A12
 276  * A13  A14  A15  A16  A17  A18  A19  A20
 277  * A21  A22  A23  A24  A25  A26  A27  A28
 278  *
 279  * (Yes, this is a strange order.)  We also have to remap for missing counters.
 280  */
 281 static const int gen6_oa_snapshot_layout[] =
 282 {
 283    -1, /* Report ID */
 284    -1, /* TIMESTAMP (64-bit) */
 285    -1, /* ...second half... */
 286     3, /* A4:  Vertex Shader Stall Time - Core Stall */
 287    -1, /* A3:  (not available) */
 288     2, /* A2:  Vertex Shader Active Time */
 289     1, /* A1:  Aggregated Core Array Stalled */
 290     0, /* A0:  Aggregated Core Array Active */
 291    10, /* A12: Pixel Shader Active Time */
 292     9, /* A11: Geometry Shader ready but not running Time */
 293     8, /* A10: # GS threads loaded */
 294     7, /* A9:  Geometry Shader Stall Time - Core Stall */
 295    -1, /* A8:  (not available) */
 296     6, /* A7:  Geometry Shader Active Time */
 297     5, /* A6:  Vertex Shader ready but not running Time */
 298     4, /* A5:  # VS Threads Loaded */
 299    17, /* A20: Early Stencil Test Pixels Failing */
 300    16, /* A19: Early Stencil Test Pixels Passing */
 301    15, /* A18: Early Z Test Pixels Failing */
 302    14, /* A17: Early Z Test Pixels Passing */
 303    13, /* A16: Pixel Shader ready but not running Time */
 304    12, /* A15: # PS threads loaded */
 305    11, /* A14: Pixel Shader Stall Time - Core Stall */
 306    -1, /* A13: (not available) */
 307    25, /* A28: SF active and stalled */
 308    24, /* A27: CL active and not stalled */
 309    23, /* A26: GPU Busy */
 310    22, /* A25: Pixels/samples Written in the frame buffer */
 311    21, /* A24: Post PS Z buffer Pixels Failed */
 312    20, /* A23: Post PS Stencil Pixels Failed */
 313    19, /* A22: Alpha Test Pixels Failed */
 314    18, /* A21: Pixel Kill Count */
 315 };
 316
/*
 * Names of the Sandybridge pipeline statistics counters, all 64-bit.
 *
 * Entries are listed in the same order as
 * gen6_statistics_register_addresses; keep the two tables in step.
 */
static const struct gl_perf_monitor_counter gen6_statistics_counters[] = {
   COUNTER64("IA_VERTICES_COUNT"),
   COUNTER64("IA_PRIMITIVES_COUNT"),
   COUNTER64("VS_INVOCATION_COUNT"),
   COUNTER64("GS_INVOCATION_COUNT"),
   COUNTER64("GS_PRIMITIVES_COUNT"),
   COUNTER64("CL_INVOCATION_COUNT"),
   COUNTER64("CL_PRIMITIVES_COUNT"),
   COUNTER64("PS_INVOCATION_COUNT"),
   COUNTER64("PS_DEPTH_COUNT"),
   COUNTER64("SO_NUM_PRIMS_WRITTEN"),
   COUNTER64("SO_PRIM_STORAGE_NEEDED"),
};
 330
/**
 * MMIO register addresses for each pipeline statistics counter.
 *
 * Listed in the same order as gen6_statistics_counters, one address per
 * counter name.
 */
static const int gen6_statistics_register_addresses[] = {
   IA_VERTICES_COUNT,
   IA_PRIMITIVES_COUNT,
   VS_INVOCATION_COUNT,
   GS_INVOCATION_COUNT,
   GS_PRIMITIVES_COUNT,
   CL_INVOCATION_COUNT,
   CL_PRIMITIVES_COUNT,
   PS_INVOCATION_COUNT,
   PS_DEPTH_COUNT,
   GEN6_SO_NUM_PRIMS_WRITTEN,
   GEN6_SO_PRIM_STORAGE_NEEDED,
};
 345
 346 static const struct gl_perf_monitor_group gen6_groups[] = {
 347    GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters),
 348    GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters),
 349 };
 350 /** @} */
 351
 352 /**
 353  * Ivybridge/Baytrail/Haswell:
 354  *  @{
 355  */
/*
 * Names of the 32 OA counters exposed on Ivybridge/Baytrail/Haswell.  The
 * leading comment on each entry is its array index, which is the value
 * gen7_oa_snapshot_layout stores for the corresponding snapshot DWord.
 */
static const struct gl_perf_monitor_counter gen7_raw_oa_counters[] = {
   /*  0 */ COUNTER("Aggregated Core Array Active"),
   /*  1 */ COUNTER("Aggregated Core Array Stalled"),
   /*  2 */ COUNTER("Vertex Shader Active Time"),
   /*  3 */ COUNTER("Vertex Shader Stall Time - Core Stall"),
   /*  4 */ COUNTER("# VS threads loaded"),
   /*  5 */ COUNTER("Hull Shader Active Time"),
   /*  6 */ COUNTER("Hull Shader Stall Time - Core Stall"),
   /*  7 */ COUNTER("# HS threads loaded"),
   /*  8 */ COUNTER("Domain Shader Active Time"),
   /*  9 */ COUNTER("Domain Shader Stall Time - Core Stall"),
   /* 10 */ COUNTER("# DS threads loaded"),
   /* 11 */ COUNTER("Compute Shader Active Time"),
   /* 12 */ COUNTER("Compute Shader Stall Time - Core Stall"),
   /* 13 */ COUNTER("# CS threads loaded"),
   /* 14 */ COUNTER("Geometry Shader Active Time"),
   /* 15 */ COUNTER("Geometry Shader Stall Time - Core Stall"),
   /* 16 */ COUNTER("# GS threads loaded"),
   /* 17 */ COUNTER("Pixel Shader Active Time"),
   /* 18 */ COUNTER("Pixel Shader Stall Time - Core Stall"),
   /* 19 */ COUNTER("# PS threads loaded"),
   /* 20 */ COUNTER("HiZ Fast Z Test Pixels Passing"),
   /* 21 */ COUNTER("HiZ Fast Z Test Pixels Failing"),
   /* 22 */ COUNTER("Slow Z Test Pixels Passing"),
   /* 23 */ COUNTER("Slow Z Test Pixels Failing"),
   /* 24 */ COUNTER("Pixel Kill Count"),
   /* 25 */ COUNTER("Alpha Test Pixels Failed"),
   /* 26 */ COUNTER("Post PS Stencil Pixels Failed"),
   /* 27 */ COUNTER("Post PS Z buffer Pixels Failed"),
   /* 28 */ COUNTER("3D/GPGPU Render Target Writes"),
   /* 29 */ COUNTER("Render Engine Busy"),
   /* 30 */ COUNTER("VS bottleneck"),
   /* 31 */ COUNTER("GS bottleneck"),
};
 390
 391 /**
 392  * Ivybridge/Baytrail/Haswell: Counter Select = 101
 393  * A4   A3   A2   A1   A0   TIMESTAMP  ReportID
 394  * A12  A11  A10  A9   A8   A7   A6    A5
 395  * A20  A19  A18  A17  A16  A15  A14   A13
 396  * A28  A27  A26  A25  A24  A23  A22   A21
 397  * A36  A35  A34  A33  A32  A31  A30   A29
 398  * A44  A43  A42  A41  A40  A39  A38   A37
 399  * B7   B6   B5   B4   B3   B2   B1    B0
 400  * Rsv  Rsv  Rsv  Rsv  Rsv  Rsv  Rsv   Rsv
 401  */
 402 static const int gen7_oa_snapshot_layout[] =
 403 {
 404    -1, /* Report ID */
 405    -1, /* TIMESTAMP (64-bit) */
 406    -1, /* ...second half... */
 407     0, /* A0:  Aggregated Core Array Active */
 408     1, /* A1:  Aggregated Core Array Stalled */
 409     2, /* A2:  Vertex Shader Active Time */
 410    -1, /* A3:  Reserved */
 411     3, /* A4:  Vertex Shader Stall Time - Core Stall */
 412     4, /* A5:  # VS threads loaded */
 413    -1, /* A6:  Reserved */
 414     5, /* A7:  Hull Shader Active Time */
 415    -1, /* A8:  Reserved */
 416     6, /* A9:  Hull Shader Stall Time - Core Stall */
 417     7, /* A10: # HS threads loaded */
 418    -1, /* A11: Reserved */
 419     8, /* A12: Domain Shader Active Time */
 420    -1, /* A13: Reserved */
 421     9, /* A14: Domain Shader Stall Time - Core Stall */
 422    10, /* A15: # DS threads loaded */
 423    -1, /* A16: Reserved */
 424    11, /* A17: Compute Shader Active Time */
 425    -1, /* A18: Reserved */
 426    12, /* A19: Compute Shader Stall Time - Core Stall */
 427    13, /* A20: # CS threads loaded */
 428    -1, /* A21: Reserved */
 429    14, /* A22: Geometry Shader Active Time */
 430    -1, /* A23: Reserved */
 431    15, /* A24: Geometry Shader Stall Time - Core Stall */
 432    16, /* A25: # GS threads loaded */
 433    -1, /* A26: Reserved */
 434    17, /* A27: Pixel Shader Active Time */
 435    -1, /* A28: Reserved */
 436    18, /* A29: Pixel Shader Stall Time - Core Stall */
 437    19, /* A30: # PS threads loaded */
 438    -1, /* A31: Reserved */
 439    20, /* A32: HiZ Fast Z Test Pixels Passing */
 440    21, /* A33: HiZ Fast Z Test Pixels Failing */
 441    22, /* A34: Slow Z Test Pixels Passing */
 442    23, /* A35: Slow Z Test Pixels Failing */
 443    24, /* A36: Pixel Kill Count */
 444    25, /* A37: Alpha Test Pixels Failed */
 445    26, /* A38: Post PS Stencil Pixels Failed */
 446    27, /* A39: Post PS Z buffer Pixels Failed */
 447    28, /* A40: 3D/GPGPU Render Target Writes */
 448    29, /* A41: Render Engine Busy */
 449    30, /* A42: VS bottleneck */
 450    31, /* A43: GS bottleneck */
 451    -1, /* A44: Reserved */
 452    -1, /* B0 */
 453    -1, /* B1 */
 454    -1, /* B2 */
 455    -1, /* B3 */
 456    -1, /* B4 */
 457    -1, /* B5 */
 458    -1, /* B6 */
 459    -1, /* B7 */
 460    -1, /* Reserved */
 461    -1, /* Reserved */
 462    -1, /* Reserved */
 463    -1, /* Reserved */
 464    -1, /* Reserved */
 465    -1, /* Reserved */
 466    -1, /* Reserved */
 467    -1, /* Reserved */
 468 };
 469
/*
 * Names of the Ivybridge/Baytrail/Haswell pipeline statistics counters, all
 * 64-bit.
 *
 * Entries are listed in the same order as
 * gen7_statistics_register_addresses; keep the two tables in step.
 */
static const struct gl_perf_monitor_counter gen7_statistics_counters[] = {
   COUNTER64("IA_VERTICES_COUNT"),
   COUNTER64("IA_PRIMITIVES_COUNT"),
   COUNTER64("VS_INVOCATION_COUNT"),
   COUNTER64("HS_INVOCATION_COUNT"),
   COUNTER64("DS_INVOCATION_COUNT"),
   COUNTER64("GS_INVOCATION_COUNT"),
   COUNTER64("GS_PRIMITIVES_COUNT"),
   COUNTER64("CL_INVOCATION_COUNT"),
   COUNTER64("CL_PRIMITIVES_COUNT"),
   COUNTER64("PS_INVOCATION_COUNT"),
   COUNTER64("PS_DEPTH_COUNT"),
   COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 0)"),
   COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 1)"),
   COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 2)"),
   COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 3)"),
   COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 0)"),
   COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 1)"),
   COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 2)"),
   COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 3)"),
};
 491
/**
 * MMIO register addresses for each pipeline statistics counter.
 *
 * Listed in the same order as gen7_statistics_counters, one address per
 * counter name; the stream-output registers take the stream number (0-3)
 * as a macro argument.
 */
static const int gen7_statistics_register_addresses[] = {
   IA_VERTICES_COUNT,
   IA_PRIMITIVES_COUNT,
   VS_INVOCATION_COUNT,
   HS_INVOCATION_COUNT,
   DS_INVOCATION_COUNT,
   GS_INVOCATION_COUNT,
   GS_PRIMITIVES_COUNT,
   CL_INVOCATION_COUNT,
   CL_PRIMITIVES_COUNT,
   PS_INVOCATION_COUNT,
   PS_DEPTH_COUNT,
   GEN7_SO_NUM_PRIMS_WRITTEN(0),
   GEN7_SO_NUM_PRIMS_WRITTEN(1),
   GEN7_SO_NUM_PRIMS_WRITTEN(2),
   GEN7_SO_NUM_PRIMS_WRITTEN(3),
   GEN7_SO_PRIM_STORAGE_NEEDED(0),
   GEN7_SO_PRIM_STORAGE_NEEDED(1),
   GEN7_SO_PRIM_STORAGE_NEEDED(2),
   GEN7_SO_PRIM_STORAGE_NEEDED(3),
};
 514
 515 static const struct gl_perf_monitor_group gen7_groups[] = {
 516    GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters),
 517    GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters),
 518 };
 519 /** @} */
 520
 521 /******************************************************************************/
 522
 523 static GLboolean brw_is_perf_monitor_result_available(struct gl_context *, struct gl_perf_monitor_object *);
 524
 525 static void
 526 dump_perf_monitor_callback(GLuint name, void *monitor_void, void *brw_void)
 527 {
 528    struct brw_context *brw = brw_void;
 529    struct gl_context *ctx = brw_void;
 530    struct gl_perf_monitor_object *m = monitor_void;
 531    struct brw_perf_monitor_object *monitor = monitor_void;
 532
 533    const char *resolved = "";
 534    for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
 535       if (brw->perfmon.unresolved[i] == monitor) {
 536          resolved = "Unresolved";
 537          break;
 538       }
 539    }
 540
 541    DBG("%4d  %-7s %-6s %-10s %-11s <%3d, %3d, %3d>  %-6s %-9s\n",
 542        name,
 543        m->Active ? "Active" : "",
 544        m->Ended ? "Ended" : "",
 545        resolved,
 546        brw_is_perf_monitor_result_available(ctx, m) ? "Available" : "",
 547        monitor->oa_head_end,
 548        monitor->oa_middle_start,
 549        monitor->oa_tail_start,
 550        monitor->oa_bo ? "OA BO" : "",
 551        monitor->pipeline_stats_bo ? "Stats BO" : "");
 552 }
 553
 554 void
 555 brw_dump_perf_monitors(struct brw_context *brw)
 556 {
 557    struct gl_context *ctx = &brw->ctx;
 558    DBG("Monitors: (OA users = %d)\n", brw->perfmon.oa_users);
 559    _mesa_HashWalk(ctx->PerfMonitor.Monitors, dump_perf_monitor_callback, brw);
 560 }
 561
 562 /******************************************************************************/
 563
 564 static bool
 565 monitor_needs_statistics_registers(struct brw_context *brw,
 566                                    struct gl_perf_monitor_object *m)
 567 {
 568    return brw->gen >= 6 && m->ActiveGroups[PIPELINE_STATS_COUNTERS];
 569 }
 570
 571 /**
 572  * Take a snapshot of any monitored pipeline statistics counters.
 573  */
 574 static void
 575 snapshot_statistics_registers(struct brw_context *brw,
 576                               struct brw_perf_monitor_object *monitor,
 577                               uint32_t offset)
 578 {
 579    struct gl_context *ctx = &brw->ctx;
 580    const int group = PIPELINE_STATS_COUNTERS;
 581    const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters;
 582
 583    brw_emit_mi_flush(brw);
 584
 585    for (int i = 0; i < num_counters; i++) {
 586       if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) {
 587          assert(ctx->PerfMonitor.Groups[group].Counters[i].Type ==
 588                 GL_UNSIGNED_INT64_AMD);
 589
 590          brw_store_register_mem64(brw, monitor->pipeline_stats_bo,
 591                                   brw->perfmon.statistics_registers[i],
 592                                   offset + i * sizeof(uint64_t));
 593       }
 594    }
 595 }
 596
 597 /**
 598  * Gather results from pipeline_stats_bo, storing the final values.
 599  *
 600  * This allows us to free pipeline_stats_bo (which is 4K) in favor of a much
 601  * smaller array of final results.
 602  */
 603 static void
 604 gather_statistics_results(struct brw_context *brw,
 605                           struct brw_perf_monitor_object *monitor)
 606 {
 607    struct gl_context *ctx = &brw->ctx;
 608    const int num_counters =
 609       ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;
 610
 611    monitor->pipeline_stats_results = calloc(num_counters, sizeof(uint64_t));
 612    if (monitor->pipeline_stats_results == NULL) {
 613       _mesa_error_no_memory(__func__);
 614       return;
 615    }
 616
 617    drm_intel_bo_map(monitor->pipeline_stats_bo, false);
 618    uint64_t *start = monitor->pipeline_stats_bo->virtual;
 619    uint64_t *end = start + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint64_t));
 620
 621    for (int i = 0; i < num_counters; i++) {
 622       monitor->pipeline_stats_results[i] = end[i] - start[i];
 623    }
 624    drm_intel_bo_unmap(monitor->pipeline_stats_bo);
 625    drm_intel_bo_unreference(monitor->pipeline_stats_bo);
 626    monitor->pipeline_stats_bo = NULL;
 627 }
 628
 629 /******************************************************************************/
 630
 631 static bool
 632 monitor_needs_oa(struct brw_context *brw,
 633                  struct gl_perf_monitor_object *m)
 634 {
 635    return m->ActiveGroups[OA_COUNTERS];
 636 }
 637
 638 /**
 639  * Enable the Observability Architecture counters by whacking OACONTROL.
 640  */
 641 static void
 642 start_oa_counters(struct brw_context *brw)
 643 {
 644    unsigned counter_format;
 645
 646    /* Pick the counter format which gives us all the counters. */
 647    switch (brw->gen) {
 648    case 5:
 649       return; /* Ironlake counters are always running. */
 650    case 6:
 651       counter_format = 0b001;
 652       break;
 653    case 7:
 654       counter_format = 0b101;
 655       break;
 656    default:
 657       unreachable("Tried to enable OA counters on an unsupported generation.");
 658    }
 659
 660    BEGIN_BATCH(3);
 661    OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
 662    OUT_BATCH(OACONTROL);
 663    OUT_BATCH(counter_format << OACONTROL_COUNTER_SELECT_SHIFT |
 664              OACONTROL_ENABLE_COUNTERS);
 665    ADVANCE_BATCH();
 666 }
 667
 668 /**
 669  * Disable OA counters.
 670  */
 671 static void
 672 stop_oa_counters(struct brw_context *brw)
 673 {
 674    /* Ironlake counters never stop. */
 675    if (brw->gen == 5)
 676       return;
 677
 678    BEGIN_BATCH(3);
 679    OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
 680    OUT_BATCH(OACONTROL);
 681    OUT_BATCH(0);
 682    ADVANCE_BATCH();
 683 }
 684
 685 /**
 686  * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot,
 687  * including the required PIPE_CONTROL flushes.
 688  *
 689  * Sandybridge is the worst case scenario: brw_emit_mi_flush
 690  * expands to three PIPE_CONTROLs which are 4 DWords each.  We have to flush
 691  * before and after MI_REPORT_PERF_COUNT, so multiply by two.  Finally, add
 692  * the 3 DWords for MI_REPORT_PERF_COUNT itself.
 693  */
 694 #define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (3 * 4) + 3)
 695
 696 /**
 697  * Emit an MI_REPORT_PERF_COUNT command packet.
 698  *
 699  * This writes the current OA counter values to buffer.
 700  */
 701 static void
 702 emit_mi_report_perf_count(struct brw_context *brw,
 703                           drm_intel_bo *bo,
 704                           uint32_t offset_in_bytes,
 705                           uint32_t report_id)
 706 {
 707    assert(offset_in_bytes % 64 == 0);
 708
 709    /* Make sure the commands to take a snapshot fits in a single batch. */
 710    intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
 711                                    RENDER_RING);
 712    int batch_used = USED_BATCH(brw->batch);
 713
 714    /* Reports apparently don't always get written unless we flush first. */
 715    brw_emit_mi_flush(brw);
 716
 717    if (brw->gen == 5) {
 718       /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all
 719        * the counters.  The report ID is ignored in the second set.
 720        */
 721       BEGIN_BATCH(6);
 722       OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0);
 723       OUT_RELOC(bo,
 724                 I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 725                 offset_in_bytes);
 726       OUT_BATCH(report_id);
 727
 728       OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1);
 729       OUT_RELOC(bo,
 730                 I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 731                 offset_in_bytes + 64);
 732       OUT_BATCH(report_id);
 733       ADVANCE_BATCH();
 734    } else if (brw->gen == 6) {
 735       BEGIN_BATCH(3);
 736       OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
 737       OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 738                 offset_in_bytes | MI_COUNTER_ADDRESS_GTT);
 739       OUT_BATCH(report_id);
 740       ADVANCE_BATCH();
 741    } else if (brw->gen == 7) {
 742       BEGIN_BATCH(3);
 743       OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
 744       OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 745                 offset_in_bytes);
 746       OUT_BATCH(report_id);
 747       ADVANCE_BATCH();
 748    } else {
 749       unreachable("Unsupported generation for performance counters.");
 750    }
 751
 752    /* Reports apparently don't always get written unless we flush after. */
 753    brw_emit_mi_flush(brw);
 754
 755    (void) batch_used;
 756    assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
 757 }
 758
 759 /**
 760  * Add a monitor to the global list of "unresolved monitors."
 761  *
 762  * Monitors are "unresolved" if they refer to OA counter snapshots in
 763  * bookend_bo.  Results (even partial ones) must be gathered for all
 764  * unresolved monitors before it's safe to discard bookend_bo.
 765  */
 766 static void
 767 add_to_unresolved_monitor_list(struct brw_context *brw,
 768                                struct brw_perf_monitor_object *monitor)
 769 {
 770    if (brw->perfmon.unresolved_elements >=
 771        brw->perfmon.unresolved_array_size) {
 772       brw->perfmon.unresolved_array_size *= 2;
 773       brw->perfmon.unresolved = reralloc(brw, brw->perfmon.unresolved,
 774                                          struct brw_perf_monitor_object *,
 775                                          brw->perfmon.unresolved_array_size);
 776    }
 777
 778    brw->perfmon.unresolved[brw->perfmon.unresolved_elements++] = monitor;
 779 }
 780
 781 /**
 782  * If possible, throw away the contents of bookend BO.
 783  *
 784  * When all monitoring stops, and no monitors need data from bookend_bo to
 785  * compute results, we can discard it and start writing snapshots at the
 786  * beginning again.  This helps reduce the amount of buffer wraparound.
 787  */
 788 static void
 789 clean_bookend_bo(struct brw_context *brw)
 790 {
 791    if (brw->perfmon.unresolved_elements == 0) {
 792       DBG("***Resetting bookend snapshots to 0\n");
 793       brw->perfmon.bookend_snapshots = 0;
 794    }
 795 }
 796
 797 /**
 798  * Remove a monitor from the global list of "unresolved monitors."
 799  *
 800  * This can happen when:
 801  * - We finish computing a completed monitor's results.
 802  * - We discard unwanted monitor results.
 803  * - A monitor's results can be computed without relying on bookend_bo.
 804  */
 805 static void
 806 drop_from_unresolved_monitor_list(struct brw_context *brw,
 807                                   struct brw_perf_monitor_object *monitor)
 808 {
 809    for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
 810       if (brw->perfmon.unresolved[i] == monitor) {
 811          int last_elt = --brw->perfmon.unresolved_elements;
 812
 813          if (i == last_elt) {
 814             brw->perfmon.unresolved[i] = NULL;
 815          } else {
 816             brw->perfmon.unresolved[i] = brw->perfmon.unresolved[last_elt];
 817          }
 818
 819          clean_bookend_bo(brw);
 820          return;
 821       }
 822    }
 823 }
 824
 825 /**
 826  * Given pointers to starting and ending OA snapshots, add the deltas for each
 827  * counter to the results.
 828  */
 829 static void
 830 add_deltas(struct brw_context *brw,
 831            struct brw_perf_monitor_object *monitor,
 832            uint32_t *start, uint32_t *end)
 833 {
 834    /* Look for expected report ID values to ensure data is present. */
 835    assert(start[0] == REPORT_ID);
 836    assert(end[0] == REPORT_ID);
 837
 838    /* Subtract each counter's ending and starting values, then add the
 839     * difference to the counter's value so far.
 840     */
 841    for (int i = 3; i < brw->perfmon.entries_per_oa_snapshot; i++) {
 842       /* When debugging, it's useful to note when the ending value is less than
 843        * the starting value; aggregating counters should always increase in
 844        * value (or remain unchanged).  This happens periodically due to
 845        * wraparound, but can also indicate serious problems.
 846        */
 847 #ifdef DEBUG
 848       if (end[i] < start[i]) {
 849          int counter = brw->perfmon.oa_snapshot_layout[i];
 850          if (counter >= 0) {
 851             DBG("WARNING: \"%s\" ending value was less than the starting "
 852                 "value: %u < %u (end - start = %u)\n",
 853                 brw->ctx.PerfMonitor.Groups[0].Counters[counter].Name,
 854                 end[i], start[i], end[i] - start[i]);
 855          }
 856       }
 857 #endif
 858       monitor->oa_results[i] += end[i] - start[i];
 859    }
 860 }
 861
 862 /**
 863  * Gather OA counter results (partial or full) from a series of snapshots.
 864  *
 865  * Monitoring can start or stop at any time, likely at some point mid-batch.
 866  * We write snapshots for both events, storing them in monitor->oa_bo.
 867  *
 868  * Ideally, we would simply subtract those two snapshots to obtain the final
 869  * counter results.  Unfortunately, our hardware doesn't preserve their values
 870  * across context switches or GPU sleep states.  In order to support multiple
 871  * concurrent OA clients, as well as reliable data across power management,
 872  * we have to take snapshots at the start and end of batches as well.
 873  *
 874  * This results in a three-part sequence of (start, end) intervals:
 875  * - The "head" is from the BeginPerfMonitor snapshot to the end of the first
 876  *   batchbuffer.
 877  * - The "middle" is a series of (batch start, batch end) snapshots which
 878  *   bookend any batchbuffers between the ones which start/end monitoring.
 879  * - The "tail" is from the start of the last batch where monitoring was
 880  *   active to the EndPerfMonitor snapshot.
 881  *
 882  * Due to wrapping in the bookend BO, we may have to accumulate partial results.
 883  * If so, we handle the "head" and any "middle" results so far.  When monitoring
 884  * eventually ends, we handle additional "middle" batches and the "tail."
 885  */
 886 static void
 887 gather_oa_results(struct brw_context *brw,
 888                   struct brw_perf_monitor_object *monitor,
 889                   uint32_t *bookend_buffer)
 890 {
 891    struct gl_perf_monitor_object *m = &monitor->base;
 892    assert(monitor->oa_bo != NULL);
 893
 894    drm_intel_bo_map(monitor->oa_bo, false);
 895    uint32_t *monitor_buffer = monitor->oa_bo->virtual;
 896
 897    /* If monitoring was entirely contained within a single batch, then the
 898     * bookend BO is irrelevant.  Just subtract monitor->bo's two snapshots.
 899     */
 900    if (monitor->oa_middle_start == -1) {
 901       add_deltas(brw, monitor,
 902                  monitor_buffer,
 903                  monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
 904                                    sizeof(uint32_t)));
 905       drm_intel_bo_unmap(monitor->oa_bo);
 906       return;
 907    }
 908
 909    const ptrdiff_t snapshot_size = brw->perfmon.entries_per_oa_snapshot;
 910
 911    /* First, add the contributions from the "head" interval:
 912     * (snapshot taken at BeginPerfMonitor time,
 913     *  snapshot taken at the end of the first batch after monitoring began)
 914     */
 915    if (monitor->oa_head_end != -1) {
 916       assert(monitor->oa_head_end < brw->perfmon.bookend_snapshots);
 917       add_deltas(brw, monitor,
 918                  monitor_buffer,
 919                  bookend_buffer + snapshot_size * monitor->oa_head_end);
 920
 921       /* Make sure we don't count the "head" again in the future. */
 922       monitor->oa_head_end = -1;
 923    }
 924
 925    /* Next, count the contributions from the "middle" batches.  These are
 926     * (batch begin, batch end) deltas while monitoring was active.
 927     */
 928    int last_snapshot;
 929    if (m->Ended)
 930       last_snapshot = monitor->oa_tail_start;
 931    else
 932       last_snapshot = brw->perfmon.bookend_snapshots;
 933
 934    for (int s = monitor->oa_middle_start; s < last_snapshot; s += 2) {
 935       add_deltas(brw, monitor,
 936                  bookend_buffer + snapshot_size * s,
 937                  bookend_buffer + snapshot_size * (s + 1));
 938    }
 939
 940    /* Finally, if the monitor has ended, we need to count the contributions of
 941     * the "tail" interval:
 942     * (start of the batch where monitoring ended, EndPerfMonitor snapshot)
 943     */
 944    if (m->Ended) {
 945       assert(monitor->oa_tail_start != -1);
 946       add_deltas(brw, monitor,
 947                  bookend_buffer + snapshot_size * monitor->oa_tail_start,
 948                  monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
 949                                    sizeof(uint32_t)));
 950    }
 951
 952    drm_intel_bo_unmap(monitor->oa_bo);
 953
 954    /* If the monitor has ended, then we've gathered all the results, and
 955     * can free the monitor's OA BO.
 956     */
 957    if (m->Ended) {
 958       drm_intel_bo_unreference(monitor->oa_bo);
 959       monitor->oa_bo = NULL;
 960
 961       /* The monitor's OA result is now resolved. */
 962       DBG("Marking %d resolved - results gathered\n", m->Name);
 963       drop_from_unresolved_monitor_list(brw, monitor);
 964    }
 965 }
 966
 967 /**
 968  * Handle running out of space in the bookend BO.
 969  *
 970  * When we run out of space in the bookend BO, we need to gather up partial
 971  * results for every unresolved monitor.  This allows us to free the snapshot
 972  * data in bookend_bo, freeing up the space for reuse.  We call this "wrapping."
 973  *
 974  * This will completely compute the result for any unresolved monitors that
 975  * have ended.
 976  */
 977 static void
 978 wrap_bookend_bo(struct brw_context *brw)
 979 {
 980    DBG("****Wrap bookend BO****\n");
 981    /* Note that wrapping will only occur at the start of a batch, since that's
 982     * where we reserve space.  So the current batch won't reference bookend_bo
 983     * or any monitor BOs.  This means we don't need to worry about
 984     * synchronization.
 985     *
 986     * Also, EndPerfMonitor guarantees that only monitors which span multiple
 987     * batches exist in the unresolved monitor list.
 988     */
 989    assert(brw->perfmon.oa_users > 0);
 990
 991    drm_intel_bo_map(brw->perfmon.bookend_bo, false);
 992    uint32_t *bookend_buffer = brw->perfmon.bookend_bo->virtual;
 993    for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
 994       struct brw_perf_monitor_object *monitor = brw->perfmon.unresolved[i];
 995       struct gl_perf_monitor_object *m = &monitor->base;
 996
 997       gather_oa_results(brw, monitor, bookend_buffer);
 998
 999       if (m->Ended) {
1000          /* gather_oa_results() dropped the monitor from the unresolved list,
1001           * throwing our indices off by one.
1002           */
1003          --i;
1004       } else {
1005          /* When we create the new bookend_bo, snapshot #0 will be the
1006           * beginning of another "middle" BO.
1007           */
1008          monitor->oa_middle_start = 0;
1009          assert(monitor->oa_head_end == -1);
1010          assert(monitor->oa_tail_start == -1);
1011       }
1012    }
1013    drm_intel_bo_unmap(brw->perfmon.bookend_bo);
1014
1015    brw->perfmon.bookend_snapshots = 0;
1016 }
1017
/* This size is fairly arbitrary; the trade-off is memory usage vs. the extra
 * overhead from wrapping.  On Gen7, 32768 bytes should be enough for 128
 * snapshots before wrapping (since each is 256 bytes).
 */
1022 #define BOOKEND_BO_SIZE_BYTES 32768
1023
1024 /**
1025  * Check whether bookend_bo has space for a given number of snapshots.
1026  */
1027 static bool
1028 has_space_for_bookend_snapshots(struct brw_context *brw, int snapshots)
1029 {
1030    int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t);
1031
1032    /* There are brw->perfmon.bookend_snapshots - 1 existing snapshots. */
1033    int total_snapshots = (brw->perfmon.bookend_snapshots - 1) + snapshots;
1034
1035    return total_snapshots * snapshot_bytes < BOOKEND_BO_SIZE_BYTES;
1036 }
1037
1038 /**
1039  * Write an OA counter snapshot to bookend_bo.
1040  */
1041 static void
1042 emit_bookend_snapshot(struct brw_context *brw)
1043 {
1044    int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t);
1045    int offset_in_bytes = brw->perfmon.bookend_snapshots * snapshot_bytes;
1046
1047    emit_mi_report_perf_count(brw, brw->perfmon.bookend_bo, offset_in_bytes,
1048                              REPORT_ID);
1049    ++brw->perfmon.bookend_snapshots;
1050 }
1051
1052 /******************************************************************************/
1053
1054 /**
1055  * Initialize a monitor to sane starting state; throw away old buffers.
1056  */
1057 static void
1058 reinitialize_perf_monitor(struct brw_context *brw,
1059                           struct brw_perf_monitor_object *monitor)
1060 {
1061    if (monitor->oa_bo) {
1062       drm_intel_bo_unreference(monitor->oa_bo);
1063       monitor->oa_bo = NULL;
1064    }
1065
1066    /* Since the results are now invalid, we don't need to hold on to any
1067     * snapshots in bookend_bo.  The monitor is effectively "resolved."
1068     */
1069    drop_from_unresolved_monitor_list(brw, monitor);
1070
1071    monitor->oa_head_end = -1;
1072    monitor->oa_middle_start = -1;
1073    monitor->oa_tail_start = -1;
1074
1075    free(monitor->oa_results);
1076    monitor->oa_results = NULL;
1077
1078    if (monitor->pipeline_stats_bo) {
1079       drm_intel_bo_unreference(monitor->pipeline_stats_bo);
1080       monitor->pipeline_stats_bo = NULL;
1081    }
1082
1083    free(monitor->pipeline_stats_results);
1084    monitor->pipeline_stats_results = NULL;
1085 }
1086
1087 /**
1088  * Driver hook for glBeginPerformanceMonitorAMD().
1089  */
1090 static GLboolean
1091 brw_begin_perf_monitor(struct gl_context *ctx,
1092                        struct gl_perf_monitor_object *m)
1093 {
1094    struct brw_context *brw = brw_context(ctx);
1095    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1096
1097    DBG("Begin(%d)\n", m->Name);
1098
1099    reinitialize_perf_monitor(brw, monitor);
1100
1101    if (monitor_needs_oa(brw, m)) {
1102       /* If the global OA bookend BO doesn't exist, allocate it.  This should
1103        * only happen once, but we delay until BeginPerfMonitor time to avoid
1104        * wasting memory for contexts that don't use performance monitors.
1105        */
1106       if (!brw->perfmon.bookend_bo) {
1107          brw->perfmon.bookend_bo = drm_intel_bo_alloc(brw->bufmgr,
1108                                                       "OA bookend BO",
1109                                                       BOOKEND_BO_SIZE_BYTES, 64);
1110       }
1111
1112       monitor->oa_bo =
1113          drm_intel_bo_alloc(brw->bufmgr, "perf. monitor OA bo", 4096, 64);
1114 #ifdef DEBUG
1115       /* Pre-filling the BO helps debug whether writes landed. */
1116       drm_intel_bo_map(monitor->oa_bo, true);
1117       memset((char *) monitor->oa_bo->virtual, 0xff, 4096);
1118       drm_intel_bo_unmap(monitor->oa_bo);
1119 #endif
1120
1121       /* Allocate storage for accumulated OA counter values. */
1122       monitor->oa_results =
1123          calloc(brw->perfmon.entries_per_oa_snapshot, sizeof(uint32_t));
1124
1125       /* If the OA counters aren't already on, enable them. */
1126       if (brw->perfmon.oa_users == 0) {
1127          /* Ensure the OACONTROL enable and snapshot land in the same batch. */
1128          int space = (MI_REPORT_PERF_COUNT_BATCH_DWORDS + 3) * 4;
1129          intel_batchbuffer_require_space(brw, space, RENDER_RING);
1130          start_oa_counters(brw);
1131       }
1132
1133       /* Take a starting OA counter snapshot. */
1134       emit_mi_report_perf_count(brw, monitor->oa_bo, 0, REPORT_ID);
1135
1136       monitor->oa_head_end = brw->perfmon.bookend_snapshots;
1137       monitor->oa_middle_start = brw->perfmon.bookend_snapshots + 1;
1138       monitor->oa_tail_start = -1;
1139
1140       /* Add the monitor to the unresolved list. */
1141       add_to_unresolved_monitor_list(brw, monitor);
1142
1143       ++brw->perfmon.oa_users;
1144    }
1145
1146    if (monitor_needs_statistics_registers(brw, m)) {
1147       monitor->pipeline_stats_bo =
1148          drm_intel_bo_alloc(brw->bufmgr, "perf. monitor stats bo", 4096, 64);
1149
1150       /* Take starting snapshots. */
1151       snapshot_statistics_registers(brw, monitor, 0);
1152    }
1153
1154    return true;
1155 }
1156
1157 /**
1158  * Driver hook for glEndPerformanceMonitorAMD().
1159  */
1160 static void
1161 brw_end_perf_monitor(struct gl_context *ctx,
1162                      struct gl_perf_monitor_object *m)
1163 {
1164    struct brw_context *brw = brw_context(ctx);
1165    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1166
1167    DBG("End(%d)\n", m->Name);
1168
1169    if (monitor_needs_oa(brw, m)) {
1170       /* Take an ending OA counter snapshot. */
1171       emit_mi_report_perf_count(brw, monitor->oa_bo,
1172                                 SECOND_SNAPSHOT_OFFSET_IN_BYTES, REPORT_ID);
1173
1174       --brw->perfmon.oa_users;
1175
1176       if (brw->perfmon.oa_users == 0)
1177          stop_oa_counters(brw);
1178
1179       if (monitor->oa_head_end == brw->perfmon.bookend_snapshots) {
1180          assert(monitor->oa_head_end != -1);
1181          /* We never actually wrote the snapshot for the end of the first batch
1182           * after BeginPerfMonitor.  This means that monitoring was contained
1183           * entirely within a single batch, so we can ignore bookend_bo and
1184           * just compare the monitor's begin/end snapshots directly.
1185           */
1186          monitor->oa_head_end = -1;
1187          monitor->oa_middle_start = -1;
1188          monitor->oa_tail_start = -1;
1189
1190          /* We can also mark it resolved since it won't depend on bookend_bo. */
1191          DBG("Marking %d resolved - entirely in one batch\n", m->Name);
1192          drop_from_unresolved_monitor_list(brw, monitor);
1193       } else {
1194          /* We've written at least one batch end snapshot, so the monitoring
1195           * spanned multiple batches.  Mark which snapshot corresponds to the
1196           * start of the current batch.
1197           */
1198          monitor->oa_tail_start = brw->perfmon.bookend_snapshots - 1;
1199       }
1200    }
1201
1202    if (monitor_needs_statistics_registers(brw, m)) {
1203       /* Take ending snapshots. */
1204       snapshot_statistics_registers(brw, monitor,
1205                                     SECOND_SNAPSHOT_OFFSET_IN_BYTES);
1206    }
1207 }
1208
1209 /**
1210  * Reset a performance monitor, throwing away any results.
1211  */
1212 static void
1213 brw_reset_perf_monitor(struct gl_context *ctx,
1214                        struct gl_perf_monitor_object *m)
1215 {
1216    struct brw_context *brw = brw_context(ctx);
1217    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1218
1219    reinitialize_perf_monitor(brw, monitor);
1220
1221    if (m->Active) {
1222       brw_begin_perf_monitor(ctx, m);
1223    }
1224 }
1225
1226 /**
1227  * Is a performance monitor result available?
1228  */
1229 static GLboolean
1230 brw_is_perf_monitor_result_available(struct gl_context *ctx,
1231                                      struct gl_perf_monitor_object *m)
1232 {
1233    struct brw_context *brw = brw_context(ctx);
1234    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1235
1236    bool oa_available = true;
1237    bool stats_available = true;
1238
1239    if (monitor_needs_oa(brw, m)) {
1240       oa_available = !monitor->oa_bo ||
1241          (!drm_intel_bo_references(brw->batch.bo, monitor->oa_bo) &&
1242           !drm_intel_bo_busy(monitor->oa_bo));
1243    }
1244
1245    if (monitor_needs_statistics_registers(brw, m)) {
1246       stats_available = !monitor->pipeline_stats_bo ||
1247          (!drm_intel_bo_references(brw->batch.bo, monitor->pipeline_stats_bo) &&
1248           !drm_intel_bo_busy(monitor->pipeline_stats_bo));
1249    }
1250
1251    return oa_available && stats_available;
1252 }
1253
1254 /**
1255  * Get the performance monitor result.
1256  */
1257 static void
1258 brw_get_perf_monitor_result(struct gl_context *ctx,
1259                             struct gl_perf_monitor_object *m,
1260                             GLsizei data_size,
1261                             GLuint *data,
1262                             GLint *bytes_written)
1263 {
1264    struct brw_context *brw = brw_context(ctx);
1265    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1266    const GLuint *const data_end = (GLuint *)((uint8_t *) data + data_size);
1267
1268    DBG("GetResult(%d)\n", m->Name);
1269    brw_dump_perf_monitors(brw);
1270
1271    /* This hook should only be called when results are available. */
1272    assert(m->Ended);
1273
1274    /* Copy data to the supplied array (data).
1275     *
1276     * The output data format is: <group ID, counter ID, value> for each
1277     * active counter.  The API allows counters to appear in any order.
1278     */
1279    GLsizei offset = 0;
1280
1281    if (monitor_needs_oa(brw, m)) {
1282       /* Gather up the results from the BO, unless we already did due to the
1283        * bookend BO wrapping.
1284        */
1285       if (monitor->oa_bo) {
1286          /* Since the result is available, all the necessary snapshots will
1287           * have been written to the bookend BO.  If other monitors are
1288           * active, the bookend BO may be busy or referenced by the current
1289           * batch, but only for writing snapshots beyond oa_tail_start,
1290           * which we don't care about.
1291           *
1292           * Using an unsynchronized mapping avoids stalling for an
1293           * indeterminate amount of time.
1294           */
1295          drm_intel_gem_bo_map_unsynchronized(brw->perfmon.bookend_bo);
1296
1297          gather_oa_results(brw, monitor, brw->perfmon.bookend_bo->virtual);
1298
1299          drm_intel_bo_unmap(brw->perfmon.bookend_bo);
1300       }
1301
1302       for (int i = 0; i < brw->perfmon.entries_per_oa_snapshot; i++) {
1303          int group = OA_COUNTERS;
1304          int counter = brw->perfmon.oa_snapshot_layout[i];
1305
1306          /* We always capture all the OA counters, but the application may
1307           * have only asked for a subset.  Skip unwanted counters.
1308           */
1309          if (counter < 0 || !BITSET_TEST(m->ActiveCounters[group], counter))
1310             continue;
1311
1312          if (data + offset + 3 <= data_end) {
1313             data[offset++] = group;
1314             data[offset++] = counter;
1315             data[offset++] = monitor->oa_results[i];
1316          }
1317       }
1318
1319       clean_bookend_bo(brw);
1320    }
1321
1322    if (monitor_needs_statistics_registers(brw, m)) {
1323       const int num_counters =
1324          ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;
1325
1326       if (!monitor->pipeline_stats_results) {
1327          gather_statistics_results(brw, monitor);
1328
1329          /* Check if we did really get the results */
1330          if (!monitor->pipeline_stats_results) {
1331             if (bytes_written) {
1332                *bytes_written = 0;
1333             }
1334             return;
1335          }
1336       }
1337
1338       for (int i = 0; i < num_counters; i++) {
1339          if (BITSET_TEST(m->ActiveCounters[PIPELINE_STATS_COUNTERS], i)) {
1340             if (data + offset + 4 <= data_end) {
1341                data[offset++] = PIPELINE_STATS_COUNTERS;
1342                data[offset++] = i;
1343                *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i];
1344                offset += 2;
1345             }
1346          }
1347       }
1348    }
1349
1350    if (bytes_written)
1351       *bytes_written = offset * sizeof(uint32_t);
1352 }
1353
1354 /**
1355  * Create a new performance monitor object.
1356  */
1357 static struct gl_perf_monitor_object *
1358 brw_new_perf_monitor(struct gl_context *ctx)
1359 {
1360    (void) ctx;
1361    return calloc(1, sizeof(struct brw_perf_monitor_object));
1362 }
1363
1364 /**
1365  * Delete a performance monitor object.
1366  */
1367 static void
1368 brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
1369 {
1370    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1371    DBG("Delete(%d)\n", m->Name);
1372    reinitialize_perf_monitor(brw_context(ctx), monitor);
1373    free(monitor);
1374 }
1375
1376 /******************************************************************************/
1377
1378 /**
1379  * Called at the start of every render ring batch.
1380  *
1381  * Enable OA counters and emit the "start of batchbuffer" bookend OA snapshot.
1382  * Since it's a new batch, there will be plenty of space for the commands.
1383  */
1384 void
1385 brw_perf_monitor_new_batch(struct brw_context *brw)
1386 {
1387    assert(brw->batch.ring == RENDER_RING);
1388    assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0);
1389
1390    if (brw->perfmon.oa_users == 0)
1391       return;
1392
1393    start_oa_counters(brw);
1394
1395    /* Make sure bookend_bo has enough space for a pair of snapshots.
1396     * If not, "wrap" the BO: gather up any results so far, and start from
1397     * the beginning of the buffer.  Reserving a pair guarantees that wrapping
1398     * will only happen at the beginning of a batch, where it's safe to map BOs
1399     * (as the batch is empty and can't refer to any of them yet).
1400     */
1401    if (!has_space_for_bookend_snapshots(brw, 2))
1402       wrap_bookend_bo(brw);
1403
1404    DBG("Bookend Begin Snapshot (%d)\n", brw->perfmon.bookend_snapshots);
1405    emit_bookend_snapshot(brw);
1406 }
1407
1408 /**
1409  * Called at the end of every render ring batch.
1410  *
1411  * Emit the "end of batchbuffer" bookend OA snapshot and disable the counters.
1412  *
1413  * This relies on there being enough space in BATCH_RESERVED.
1414  */
1415 void
1416 brw_perf_monitor_finish_batch(struct brw_context *brw)
1417 {
1418    assert(brw->batch.ring == RENDER_RING);
1419
1420    if (brw->perfmon.oa_users == 0)
1421       return;
1422
1423    DBG("Bookend End Snapshot (%d)\n", brw->perfmon.bookend_snapshots);
1424
1425    /* Not safe to wrap; should've reserved space already. */
1426    assert(has_space_for_bookend_snapshots(brw, 1));
1427
1428    emit_bookend_snapshot(brw);
1429
1430    stop_oa_counters(brw);
1431 }
1432
1433 /******************************************************************************/
1434
1435 void
1436 brw_init_performance_monitors(struct brw_context *brw)
1437 {
1438    struct gl_context *ctx = &brw->ctx;
1439
1440    ctx->Driver.NewPerfMonitor = brw_new_perf_monitor;
1441    ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor;
1442    ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor;
1443    ctx->Driver.EndPerfMonitor = brw_end_perf_monitor;
1444    ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor;
1445    ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available;
1446    ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result;
1447
1448    if (brw->gen == 5) {
1449       ctx->PerfMonitor.Groups = gen5_groups;
1450       ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups);
1451       brw->perfmon.oa_snapshot_layout = gen5_oa_snapshot_layout;
1452       brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen5_oa_snapshot_layout);
1453    } else if (brw->gen == 6) {
1454       ctx->PerfMonitor.Groups = gen6_groups;
1455       ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen6_groups);
1456       brw->perfmon.oa_snapshot_layout = gen6_oa_snapshot_layout;
1457       brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen6_oa_snapshot_layout);
1458       brw->perfmon.statistics_registers = gen6_statistics_register_addresses;
1459    } else if (brw->gen == 7) {
1460       ctx->PerfMonitor.Groups = gen7_groups;
1461       ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen7_groups);
1462       brw->perfmon.oa_snapshot_layout = gen7_oa_snapshot_layout;
1463       brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout);
1464       brw->perfmon.statistics_registers = gen7_statistics_register_addresses;
1465    }
1466
1467    brw->perfmon.unresolved =
1468       ralloc_array(brw, struct brw_perf_monitor_object *, 1);
1469    brw->perfmon.unresolved_elements = 0;
1470    brw->perfmon.unresolved_array_size = 1;
1471 }