src/mesa/drivers/dri/i965/brw_performance_monitor.c

   1 /*
   2  * Copyright © 2013 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 /**
  25  * \file brw_performance_monitor.c
  26  *
  27  * Implementation of the GL_AMD_performance_monitor extension.
  28  *
  29  * On Gen5+ hardware, we have two sources of performance counter data:
  30  * the Observability Architecture counters (MI_REPORT_PERF_COUNT), and
  31  * the Pipeline Statistics Registers.  We expose both sets of raw data,
  32  * as well as some useful processed values.
  33  *
  34  * The Observability Architecture (OA) counters for Gen6+ are documented
  35  * in a separate document from the rest of the PRMs.  It is available at:
  36  * https://01.org/linuxgraphics/documentation/driver-documentation-prms
  37  * => 2013 Intel Core Processor Family => Observability Performance Counters
  38  * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell.)
  39  *
  40  * On Ironlake, the OA counters were called "CHAPS" counters.  Sadly, no public
  41  * documentation exists; our implementation is based on the source code for the
  42  * intel_perf_counters utility (which is available as part of intel-gpu-tools).
  43  */
  44
  45 #include <limits.h>
  46
  47 #include "main/bitset.h"
  48 #include "main/hash.h"
  49 #include "main/macros.h"
  50 #include "main/mtypes.h"
  51 #include "main/performance_monitor.h"
  52
  53 #include "glsl/ralloc.h"
  54
  55 #include "brw_context.h"
  56 #include "brw_defines.h"
  57 #include "intel_batchbuffer.h"
  58
  59 #define FILE_DEBUG_FLAG DEBUG_PERFMON
  60
/**
 * i965 representation of a performance monitor object.
 *
 * Extends the core Mesa gl_perf_monitor_object with the driver-side buffers
 * and result storage for the two counter groups (OA counters and pipeline
 * statistics registers).
 */
struct brw_perf_monitor_object
{
   /**
    * The base class.
    *
    * Must remain the first member: brw_perf_monitor() downcasts by a plain
    * pointer cast from the base object.
    */
   struct gl_perf_monitor_object base;

   /**
    * BO containing OA counter snapshots at monitor Begin/End time.
    */
   drm_intel_bo *oa_bo;

   /**
    * Indexes into bookend_bo (snapshot numbers) for various segments.
    *
    * NOTE(review): bookend_bo is not a member of this struct; presumably it
    * is shared state kept in brw->perfmon -- confirm against brw_context.h.
    */
   int oa_head_end;
   int oa_middle_start;
   int oa_tail_start;

   /**
    * Storage for OA results accumulated so far.
    *
    * An array indexed by the counter ID in the OA_COUNTERS group.
    *
    * When we run out of space in bookend_bo, we compute the results so far
    * and add them to the value stored here.  Then, we can discard bookend_bo.
    */
   uint32_t *oa_results;

   /**
    * BO containing starting and ending snapshots for any active pipeline
    * statistics counters.
    *
    * The starting snapshot lives at offset 0 and the ending snapshot at
    * SECOND_SNAPSHOT_OFFSET_IN_BYTES.  gather_statistics_results() releases
    * this BO and resets the pointer to NULL once the deltas are computed.
    */
   drm_intel_bo *pipeline_stats_bo;

   /**
    * Storage for final pipeline statistics counter results.
    *
    * Allocated by gather_statistics_results(): one 64-bit (end - start)
    * delta per counter in the PIPELINE_STATS_COUNTERS group.
    */
   uint64_t *pipeline_stats_results;
};
 100
 101 /** Downcasting convenience macro. */
 102 static inline struct brw_perf_monitor_object *
 103 brw_perf_monitor(struct gl_perf_monitor_object *m)
 104 {
 105    return (struct brw_perf_monitor_object *) m;
 106 }
 107
/**
 * Byte offset of the ending pipeline-statistics snapshot within
 * pipeline_stats_bo; the starting snapshot is read from offset 0.
 */
#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048
 109
/* A random value used to ensure we're getting valid snapshots: it is passed
 * as the report ID when snapshots are requested, and add_deltas() expects to
 * find it in the first DWord of every snapshot it reads back.
 */
#define REPORT_ID 0xd2e9c607
 112
 113 /******************************************************************************/
 114
 115 #define COUNTER(name)           \
 116    {                            \
 117       .Name = name,             \
 118       .Type = GL_UNSIGNED_INT,  \
 119       .Minimum = { .u32 =  0 }, \
 120       .Maximum = { .u32 = ~0 }, \
 121    }
 122
 123 #define COUNTER64(name)              \
 124    {                                 \
 125       .Name = name,                  \
 126       .Type = GL_UNSIGNED_INT64_AMD, \
 127       .Minimum = { .u64 =  0 },      \
 128       .Maximum = { .u64 = ~0 },      \
 129    }
 130
 131 #define GROUP(name, max_active, counter_list)  \
 132    {                                           \
 133       .Name = name,                            \
 134       .MaxActiveCounters = max_active,         \
 135       .Counters = counter_list,                \
 136       .NumCounters = ARRAY_SIZE(counter_list), \
 137    }
 138
 139 /** Performance Monitor Group IDs */
 140 enum brw_counter_groups {
 141    OA_COUNTERS, /* Observability Architecture (MI_REPORT_PERF_COUNT) Counters */
 142    PIPELINE_STATS_COUNTERS, /* Pipeline Statistics Register Counters */
 143 };
 144
 145 /**
 146  * Ironlake:
 147  *  @{
 148  *
 149  * The list of CHAPS counters unfortunately does not appear in any public
 150  * documentation, but is available by reading the source code for the
 151  * intel_perf_counters utility (shipped as part of intel-gpu-tools).
 152  */
 153 const static struct gl_perf_monitor_counter gen5_raw_chaps_counters[] = {
 154    COUNTER("cycles the CS unit is starved"),
 155    COUNTER("cycles the CS unit is stalled"),
 156    COUNTER("cycles the VF unit is starved"),
 157    COUNTER("cycles the VF unit is stalled"),
 158    COUNTER("cycles the VS unit is starved"),
 159    COUNTER("cycles the VS unit is stalled"),
 160    COUNTER("cycles the GS unit is starved"),
 161    COUNTER("cycles the GS unit is stalled"),
 162    COUNTER("cycles the CL unit is starved"),
 163    COUNTER("cycles the CL unit is stalled"),
 164    COUNTER("cycles the SF unit is starved"),
 165    COUNTER("cycles the SF unit is stalled"),
 166    COUNTER("cycles the WZ unit is starved"),
 167    COUNTER("cycles the WZ unit is stalled"),
 168    COUNTER("Z buffer read/write"),
 169    COUNTER("cycles each EU was active"),
 170    COUNTER("cycles each EU was suspended"),
 171    COUNTER("cycles threads loaded all EUs"),
 172    COUNTER("cycles filtering active"),
 173    COUNTER("cycles PS threads executed"),
 174    COUNTER("subspans written to RC"),
 175    COUNTER("bytes read for texture reads"),
 176    COUNTER("texels returned from sampler"),
 177    COUNTER("polygons not culled"),
 178    COUNTER("clocks MASF has valid message"),
 179    COUNTER("64b writes/reads from RC"),
 180    COUNTER("reads on dataport"),
 181    COUNTER("clocks MASF has valid msg not consumed by sampler"),
 182    COUNTER("cycles any EU is stalled for math"),
 183 };
 184
 185 const static int gen5_oa_snapshot_layout[] =
 186 {
 187    -1, /* Report ID */
 188    -1, /* TIMESTAMP (64-bit) */
 189    -1, /* ...second half... */
 190     0, /* cycles the CS unit is starved */
 191     1, /* cycles the CS unit is stalled */
 192     2, /* cycles the VF unit is starved */
 193     3, /* cycles the VF unit is stalled */
 194     4, /* cycles the VS unit is starved */
 195     5, /* cycles the VS unit is stalled */
 196     6, /* cycles the GS unit is starved */
 197     7, /* cycles the GS unit is stalled */
 198     8, /* cycles the CL unit is starved */
 199     9, /* cycles the CL unit is stalled */
 200    10, /* cycles the SF unit is starved */
 201    11, /* cycles the SF unit is stalled */
 202    12, /* cycles the WZ unit is starved */
 203    13, /* cycles the WZ unit is stalled */
 204    14, /* Z buffer read/write */
 205    15, /* cycles each EU was active */
 206    16, /* cycles each EU was suspended */
 207    17, /* cycles threads loaded all EUs */
 208    18, /* cycles filtering active */
 209    19, /* cycles PS threads executed */
 210    20, /* subspans written to RC */
 211    21, /* bytes read for texture reads */
 212    22, /* texels returned from sampler */
 213    23, /* polygons not culled */
 214    24, /* clocks MASF has valid message */
 215    25, /* 64b writes/reads from RC */
 216    26, /* reads on dataport */
 217    27, /* clocks MASF has valid msg not consumed by sampler */
 218    28, /* cycles any EU is stalled for math */
 219 };
 220
 221 const static struct gl_perf_monitor_group gen5_groups[] = {
 222    [OA_COUNTERS] = GROUP("CHAPS Counters", INT_MAX, gen5_raw_chaps_counters),
 223    /* Our pipeline statistics counter handling requires hardware contexts. */
 224 };
 225 /** @} */
 226
 227 /**
 228  * Sandybridge:
 229  *  @{
 230  *
 231  * A few of the counters here (A17-A20) are not included in the latest
 232  * documentation, but are described in the Ironlake PRM (which strangely
 233  * documents Sandybridge's performance counter system, not Ironlake's).
 234  * It's unclear whether they work or not; empirically, they appear to.
 235  */
 236
 237 /**
 238  * Aggregating counters A0-A28:
 239  */
 240 const static struct gl_perf_monitor_counter gen6_raw_oa_counters[] = {
 241    /* A0:   0 */ COUNTER("Aggregated Core Array Active"),
 242    /* A1:   1 */ COUNTER("Aggregated Core Array Stalled"),
 243    /* A2:   2 */ COUNTER("Vertex Shader Active Time"),
 244    /* A3: Not actually hooked up on Sandybridge. */
 245    /* A4:   3 */ COUNTER("Vertex Shader Stall Time - Core Stall"),
 246    /* A5:   4 */ COUNTER("# VS threads loaded"),
 247    /* A6:   5 */ COUNTER("Vertex Shader Ready but not running Time"),
 248    /* A7:   6 */ COUNTER("Geometry Shader Active Time"),
 249    /* A8: Not actually hooked up on Sandybridge. */
 250    /* A9:   7 */ COUNTER("Geometry Shader Stall Time - Core Stall"),
 251    /* A10:  8 */ COUNTER("# GS threads loaded"),
 252    /* A11:  9 */ COUNTER("Geometry Shader Ready but not running Time"),
 253    /* A12: 10 */ COUNTER("Pixel Shader Active Time"),
 254    /* A13: Not actually hooked up on Sandybridge. */
 255    /* A14: 11 */ COUNTER("Pixel Shader Stall Time - Core Stall"),
 256    /* A15: 12 */ COUNTER("# PS threads loaded"),
 257    /* A16: 13 */ COUNTER("Pixel Shader Ready but not running Time"),
 258    /* A17: 14 */ COUNTER("Early Z Test Pixels Passing"),
 259    /* A18: 15 */ COUNTER("Early Z Test Pixels Failing"),
 260    /* A19: 16 */ COUNTER("Early Stencil Test Pixels Passing"),
 261    /* A20: 17 */ COUNTER("Early Stencil Test Pixels Failing"),
 262    /* A21: 18 */ COUNTER("Pixel Kill Count"),
 263    /* A22: 19 */ COUNTER("Alpha Test Pixels Failed"),
 264    /* A23: 20 */ COUNTER("Post PS Stencil Pixels Failed"),
 265    /* A24: 21 */ COUNTER("Post PS Z buffer Pixels Failed"),
 266    /* A25: 22 */ COUNTER("Pixels/samples Written in the frame buffer"),
 267    /* A26: 23 */ COUNTER("GPU Busy"),
 268    /* A27: 24 */ COUNTER("CL active and not stalled"),
 269    /* A28: 25 */ COUNTER("SF active and stalled"),
 270 };
 271
 272 /**
 273  * Sandybridge: Counter Select = 001
 274  * A0   A1   A2   A3   A4   TIMESTAMP RPT_ID
 275  * A5   A6   A7   A8   A9   A10  A11  A12
 276  * A13  A14  A15  A16  A17  A18  A19  A20
 277  * A21  A22  A23  A24  A25  A26  A27  A28
 278  *
 279  * (Yes, this is a strange order.)  We also have to remap for missing counters.
 280  */
 281 const static int gen6_oa_snapshot_layout[] =
 282 {
 283    -1, /* Report ID */
 284    -1, /* TIMESTAMP (64-bit) */
 285    -1, /* ...second half... */
 286     3, /* A4:  Vertex Shader Stall Time - Core Stall */
 287    -1, /* A3:  (not available) */
 288     2, /* A2:  Vertex Shader Active Time */
 289     1, /* A1:  Aggregated Core Array Stalled */
 290     0, /* A0:  Aggregated Core Array Active */
 291    10, /* A12: Pixel Shader Active Time */
 292     9, /* A11: Geometry Shader ready but not running Time */
 293     8, /* A10: # GS threads loaded */
 294     7, /* A9:  Geometry Shader Stall Time - Core Stall */
 295    -1, /* A8:  (not available) */
 296     6, /* A7:  Geometry Shader Active Time */
 297     5, /* A6:  Vertex Shader ready but not running Time */
 298     4, /* A5:  # VS Threads Loaded */
 299    17, /* A20: Early Stencil Test Pixels Failing */
 300    16, /* A19: Early Stencil Test Pixels Passing */
 301    15, /* A18: Early Z Test Pixels Failing */
 302    14, /* A17: Early Z Test Pixels Passing */
 303    13, /* A16: Pixel Shader ready but not running Time */
 304    12, /* A15: # PS threads loaded */
 305    11, /* A14: Pixel Shader Stall Time - Core Stall */
 306    -1, /* A13: (not available) */
 307    25, /* A28: SF active and stalled */
 308    24, /* A27: CL active and not stalled */
 309    23, /* A26: GPU Busy */
 310    22, /* A25: Pixels/samples Written in the frame buffer */
 311    21, /* A24: Post PS Z buffer Pixels Failed */
 312    20, /* A23: Post PS Stencil Pixels Failed */
 313    19, /* A22: Alpha Test Pixels Failed */
 314    18, /* A21: Pixel Kill Count */
 315 };
 316
 317 const static struct gl_perf_monitor_counter gen6_statistics_counters[] = {
 318    COUNTER64("IA_VERTICES_COUNT"),
 319    COUNTER64("IA_PRIMITIVES_COUNT"),
 320    COUNTER64("VS_INVOCATION_COUNT"),
 321    COUNTER64("GS_INVOCATION_COUNT"),
 322    COUNTER64("GS_PRIMITIVES_COUNT"),
 323    COUNTER64("CL_INVOCATION_COUNT"),
 324    COUNTER64("CL_PRIMITIVES_COUNT"),
 325    COUNTER64("PS_INVOCATION_COUNT"),
 326    COUNTER64("PS_DEPTH_COUNT"),
 327    COUNTER64("SO_NUM_PRIMS_WRITTEN"),
 328    COUNTER64("SO_PRIM_STORAGE_NEEDED"),
 329 };
 330
 331 /** MMIO register addresses for each pipeline statistics counter. */
 332 const static int gen6_statistics_register_addresses[] = {
 333    IA_VERTICES_COUNT,
 334    IA_PRIMITIVES_COUNT,
 335    VS_INVOCATION_COUNT,
 336    GS_INVOCATION_COUNT,
 337    GS_PRIMITIVES_COUNT,
 338    CL_INVOCATION_COUNT,
 339    CL_PRIMITIVES_COUNT,
 340    PS_INVOCATION_COUNT,
 341    PS_DEPTH_COUNT,
 342    GEN6_SO_NUM_PRIMS_WRITTEN,
 343    GEN6_SO_PRIM_STORAGE_NEEDED,
 344 };
 345
 346 const static struct gl_perf_monitor_group gen6_groups[] = {
 347    GROUP("Observability Architecture Counters", INT_MAX, gen6_raw_oa_counters),
 348    GROUP("Pipeline Statistics Registers", INT_MAX, gen6_statistics_counters),
 349 };
 350 /** @} */
 351
 352 /**
 353  * Ivybridge/Baytrail/Haswell:
 354  *  @{
 355  */
 356 const static struct gl_perf_monitor_counter gen7_raw_oa_counters[] = {
 357    COUNTER("Aggregated Core Array Active"),
 358    COUNTER("Aggregated Core Array Stalled"),
 359    COUNTER("Vertex Shader Active Time"),
 360    COUNTER("Vertex Shader Stall Time - Core Stall"),
 361    COUNTER("# VS threads loaded"),
 362    COUNTER("Hull Shader Active Time"),
 363    COUNTER("Hull Shader Stall Time - Core Stall"),
 364    COUNTER("# HS threads loaded"),
 365    COUNTER("Domain Shader Active Time"),
 366    COUNTER("Domain Shader Stall Time - Core Stall"),
 367    COUNTER("# DS threads loaded"),
 368    COUNTER("Compute Shader Active Time"),
 369    COUNTER("Compute Shader Stall Time - Core Stall"),
 370    COUNTER("# CS threads loaded"),
 371    COUNTER("Geometry Shader Active Time"),
 372    COUNTER("Geometry Shader Stall Time - Core Stall"),
 373    COUNTER("# GS threads loaded"),
 374    COUNTER("Pixel Shader Active Time"),
 375    COUNTER("Pixel Shader Stall Time - Core Stall"),
 376    COUNTER("# PS threads loaded"),
 377    COUNTER("HiZ Fast Z Test Pixels Passing"),
 378    COUNTER("HiZ Fast Z Test Pixels Failing"),
 379    COUNTER("Slow Z Test Pixels Passing"),
 380    COUNTER("Slow Z Test Pixels Failing"),
 381    COUNTER("Pixel Kill Count"),
 382    COUNTER("Alpha Test Pixels Failed"),
 383    COUNTER("Post PS Stencil Pixels Failed"),
 384    COUNTER("Post PS Z buffer Pixels Failed"),
 385    COUNTER("3D/GPGPU Render Target Writes"),
 386    COUNTER("Render Engine Busy"),
 387    COUNTER("VS bottleneck"),
 388    COUNTER("GS bottleneck"),
 389 };
 390
 391 /**
 392  * Ivybridge/Baytrail/Haswell: Counter Select = 101
 393  * A4   A3   A2   A1   A0   TIMESTAMP  ReportID
 394  * A12  A11  A10  A9   A8   A7   A6    A5
 395  * A20  A19  A18  A17  A16  A15  A14   A13
 396  * A28  A27  A26  A25  A24  A23  A22   A21
 397  * A36  A35  A34  A33  A32  A31  A30   A29
 398  * A44  A43  A42  A41  A40  A39  A38   A37
 399  * B7   B6   B5   B4   B3   B2   B1    B0
 400  * Rsv  Rsv  Rsv  Rsv  Rsv  Rsv  Rsv   Rsv
 401  */
 402 const static int gen7_oa_snapshot_layout[] =
 403 {
 404    -1, /* Report ID */
 405    -1, /* TIMESTAMP (64-bit) */
 406    -1, /* ...second half... */
 407     0, /* A0:  Aggregated Core Array Active */
 408     1, /* A1:  Aggregated Core Array Stalled */
 409     2, /* A2:  Vertex Shader Active Time */
 410    -1, /* A3:  Reserved */
 411     3, /* A4:  Vertex Shader Stall Time - Core Stall */
 412     4, /* A5:  # VS threads loaded */
 413    -1, /* A6:  Reserved */
 414     5, /* A7:  Hull Shader Active Time */
 415    -1, /* A8:  Reserved */
 416     6, /* A9:  Hull Shader Stall Time - Core Stall */
 417     7, /* A10: # HS threads loaded */
 418    -1, /* A11: Reserved */
 419     8, /* A12: Domain Shader Active Time */
 420    -1, /* A13: Reserved */
 421     9, /* A14: Domain Shader Stall Time - Core Stall */
 422    10, /* A15: # DS threads loaded */
 423    -1, /* A16: Reserved */
 424    11, /* A17: Compute Shader Active Time */
 425    -1, /* A18: Reserved */
 426    12, /* A19: Compute Shader Stall Time - Core Stall */
 427    13, /* A20: # CS threads loaded */
 428    -1, /* A21: Reserved */
 429    14, /* A22: Geometry Shader Active Time */
 430    -1, /* A23: Reserved */
 431    15, /* A24: Geometry Shader Stall Time - Core Stall */
 432    16, /* A25: # GS threads loaded */
 433    -1, /* A26: Reserved */
 434    17, /* A27: Pixel Shader Active Time */
 435    -1, /* A28: Reserved */
 436    18, /* A29: Pixel Shader Stall Time - Core Stall */
 437    19, /* A30: # PS threads loaded */
 438    -1, /* A31: Reserved */
 439    20, /* A32: HiZ Fast Z Test Pixels Passing */
 440    21, /* A33: HiZ Fast Z Test Pixels Failing */
 441    22, /* A34: Slow Z Test Pixels Passing */
 442    23, /* A35: Slow Z Test Pixels Failing */
 443    24, /* A36: Pixel Kill Count */
 444    25, /* A37: Alpha Test Pixels Failed */
 445    26, /* A38: Post PS Stencil Pixels Failed */
 446    27, /* A39: Post PS Z buffer Pixels Failed */
 447    28, /* A40: 3D/GPGPU Render Target Writes */
 448    29, /* A41: Render Engine Busy */
 449    30, /* A42: VS bottleneck */
 450    31, /* A43: GS bottleneck */
 451    -1, /* A44: Reserved */
 452    -1, /* B0 */
 453    -1, /* B1 */
 454    -1, /* B2 */
 455    -1, /* B3 */
 456    -1, /* B4 */
 457    -1, /* B5 */
 458    -1, /* B6 */
 459    -1, /* B7 */
 460    -1, /* Reserved */
 461    -1, /* Reserved */
 462    -1, /* Reserved */
 463    -1, /* Reserved */
 464    -1, /* Reserved */
 465    -1, /* Reserved */
 466    -1, /* Reserved */
 467    -1, /* Reserved */
 468 };
 469
 470 const static struct gl_perf_monitor_counter gen7_statistics_counters[] = {
 471    COUNTER64("IA_VERTICES_COUNT"),
 472    COUNTER64("IA_PRIMITIVES_COUNT"),
 473    COUNTER64("VS_INVOCATION_COUNT"),
 474    COUNTER64("HS_INVOCATION_COUNT"),
 475    COUNTER64("DS_INVOCATION_COUNT"),
 476    COUNTER64("GS_INVOCATION_COUNT"),
 477    COUNTER64("GS_PRIMITIVES_COUNT"),
 478    COUNTER64("CL_INVOCATION_COUNT"),
 479    COUNTER64("CL_PRIMITIVES_COUNT"),
 480    COUNTER64("PS_INVOCATION_COUNT"),
 481    COUNTER64("PS_DEPTH_COUNT"),
 482    COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 0)"),
 483    COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 1)"),
 484    COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 2)"),
 485    COUNTER64("SO_NUM_PRIMS_WRITTEN (Stream 3)"),
 486    COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 0)"),
 487    COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 1)"),
 488    COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 2)"),
 489    COUNTER64("SO_PRIM_STORAGE_NEEDED (Stream 3)"),
 490 };
 491
 492 /** MMIO register addresses for each pipeline statistics counter. */
 493 const static int gen7_statistics_register_addresses[] = {
 494    IA_VERTICES_COUNT,
 495    IA_PRIMITIVES_COUNT,
 496    VS_INVOCATION_COUNT,
 497    HS_INVOCATION_COUNT,
 498    DS_INVOCATION_COUNT,
 499    GS_INVOCATION_COUNT,
 500    GS_PRIMITIVES_COUNT,
 501    CL_INVOCATION_COUNT,
 502    CL_PRIMITIVES_COUNT,
 503    PS_INVOCATION_COUNT,
 504    PS_DEPTH_COUNT,
 505    GEN7_SO_NUM_PRIMS_WRITTEN(0),
 506    GEN7_SO_NUM_PRIMS_WRITTEN(1),
 507    GEN7_SO_NUM_PRIMS_WRITTEN(2),
 508    GEN7_SO_NUM_PRIMS_WRITTEN(3),
 509    GEN7_SO_PRIM_STORAGE_NEEDED(0),
 510    GEN7_SO_PRIM_STORAGE_NEEDED(1),
 511    GEN7_SO_PRIM_STORAGE_NEEDED(2),
 512    GEN7_SO_PRIM_STORAGE_NEEDED(3),
 513 };
 514
 515 const static struct gl_perf_monitor_group gen7_groups[] = {
 516    GROUP("Observability Architecture Counters", INT_MAX, gen7_raw_oa_counters),
 517    GROUP("Pipeline Statistics Registers", INT_MAX, gen7_statistics_counters),
 518 };
 519 /** @} */
 520
 521 /******************************************************************************/
 522
 523 static GLboolean brw_is_perf_monitor_result_available(struct gl_context *, struct gl_perf_monitor_object *);
 524
 525 static void
 526 dump_perf_monitor_callback(GLuint name, void *monitor_void, void *brw_void)
 527 {
 528    struct brw_context *brw = brw_void;
 529    struct gl_context *ctx = brw_void;
 530    struct gl_perf_monitor_object *m = monitor_void;
 531    struct brw_perf_monitor_object *monitor = monitor_void;
 532
 533    const char *resolved = "";
 534    for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
 535       if (brw->perfmon.unresolved[i] == monitor) {
 536          resolved = "Unresolved";
 537          break;
 538       }
 539    }
 540
 541    DBG("%4d  %-7s %-6s %-10s %-11s <%3d, %3d, %3d>  %-6s %-9s\n",
 542        name,
 543        m->Active ? "Active" : "",
 544        m->Ended ? "Ended" : "",
 545        resolved,
 546        brw_is_perf_monitor_result_available(ctx, m) ? "Available" : "",
 547        monitor->oa_head_end,
 548        monitor->oa_middle_start,
 549        monitor->oa_tail_start,
 550        monitor->oa_bo ? "OA BO" : "",
 551        monitor->pipeline_stats_bo ? "Stats BO" : "");
 552 }
 553
 554 void
 555 brw_dump_perf_monitors(struct brw_context *brw)
 556 {
 557    struct gl_context *ctx = &brw->ctx;
 558    DBG("Monitors: (OA users = %d)\n", brw->perfmon.oa_users);
 559    _mesa_HashWalk(ctx->PerfMonitor.Monitors, dump_perf_monitor_callback, brw);
 560 }
 561
 562 /******************************************************************************/
 563
 564 static bool
 565 monitor_needs_statistics_registers(struct brw_context *brw,
 566                                    struct gl_perf_monitor_object *m)
 567 {
 568    return brw->gen >= 6 && m->ActiveGroups[PIPELINE_STATS_COUNTERS];
 569 }
 570
 571 /**
 572  * Take a snapshot of any monitored pipeline statistics counters.
 573  */
 574 static void
 575 snapshot_statistics_registers(struct brw_context *brw,
 576                               struct brw_perf_monitor_object *monitor,
 577                               uint32_t offset_in_bytes)
 578 {
 579    struct gl_context *ctx = &brw->ctx;
 580    const int offset = offset_in_bytes / sizeof(uint64_t);
 581    const int group = PIPELINE_STATS_COUNTERS;
 582    const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters;
 583
 584    intel_batchbuffer_emit_mi_flush(brw);
 585
 586    for (int i = 0; i < num_counters; i++) {
 587       if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) {
 588          assert(ctx->PerfMonitor.Groups[group].Counters[i].Type ==
 589                 GL_UNSIGNED_INT64_AMD);
 590
 591          brw_store_register_mem64(brw, monitor->pipeline_stats_bo,
 592                                   brw->perfmon.statistics_registers[i],
 593                                   offset + i);
 594       }
 595    }
 596 }
 597
 598 /**
 599  * Gather results from pipeline_stats_bo, storing the final values.
 600  *
 601  * This allows us to free pipeline_stats_bo (which is 4K) in favor of a much
 602  * smaller array of final results.
 603  */
 604 static void
 605 gather_statistics_results(struct brw_context *brw,
 606                           struct brw_perf_monitor_object *monitor)
 607 {
 608    struct gl_context *ctx = &brw->ctx;
 609    const int num_counters =
 610       ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;
 611
 612    monitor->pipeline_stats_results = calloc(num_counters, sizeof(uint64_t));
 613
 614    drm_intel_bo_map(monitor->pipeline_stats_bo, false);
 615    uint64_t *start = monitor->pipeline_stats_bo->virtual;
 616    uint64_t *end = start + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint64_t));
 617
 618    for (int i = 0; i < num_counters; i++) {
 619       monitor->pipeline_stats_results[i] = end[i] - start[i];
 620    }
 621    drm_intel_bo_unmap(monitor->pipeline_stats_bo);
 622    drm_intel_bo_unreference(monitor->pipeline_stats_bo);
 623    monitor->pipeline_stats_bo = NULL;
 624 }
 625
 626 /******************************************************************************/
 627
 628 static bool
 629 monitor_needs_oa(struct brw_context *brw,
 630                  struct gl_perf_monitor_object *m)
 631 {
 632    return m->ActiveGroups[OA_COUNTERS];
 633 }
 634
 635 /**
 636  * Enable the Observability Architecture counters by whacking OACONTROL.
 637  */
 638 static void
 639 start_oa_counters(struct brw_context *brw)
 640 {
 641    unsigned counter_format;
 642
 643    /* Pick the counter format which gives us all the counters. */
 644    switch (brw->gen) {
 645    case 5:
 646       return; /* Ironlake counters are always running. */
 647    case 6:
 648       counter_format = 1; /* 0b001 */
 649       break;
 650    case 7:
 651       counter_format = 5; /* 0b101 */
 652       break;
 653    default:
 654       assert(!"Tried to enable OA counters on an unsupported generation.");
 655       return;
 656    }
 657
 658    BEGIN_BATCH(3);
 659    OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
 660    OUT_BATCH(OACONTROL);
 661    OUT_BATCH(counter_format << OACONTROL_COUNTER_SELECT_SHIFT |
 662              OACONTROL_ENABLE_COUNTERS);
 663    ADVANCE_BATCH();
 664 }
 665
 666 /**
 667  * Disable OA counters.
 668  */
 669 static void
 670 stop_oa_counters(struct brw_context *brw)
 671 {
 672    /* Ironlake counters never stop. */
 673    if (brw->gen == 5)
 674       return;
 675
 676    BEGIN_BATCH(3);
 677    OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
 678    OUT_BATCH(OACONTROL);
 679    OUT_BATCH(0);
 680    ADVANCE_BATCH();
 681 }
 682
 683 /**
 684  * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot,
 685  * including the required PIPE_CONTROL flushes.
 686  *
 687  * Sandybridge is the worst case scenario: intel_batchbuffer_emit_mi_flush
 688  * expands to three PIPE_CONTROLs which are 4 DWords each.  We have to flush
 689  * before and after MI_REPORT_PERF_COUNT, so multiply by two.  Finally, add
 690  * the 3 DWords for MI_REPORT_PERF_COUNT itself.
 691  */
 692 #define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (3 * 4) + 3)
 693
 694 /**
 695  * Emit an MI_REPORT_PERF_COUNT command packet.
 696  *
 697  * This writes the current OA counter values to buffer.
 698  */
 699 static void
 700 emit_mi_report_perf_count(struct brw_context *brw,
 701                           drm_intel_bo *bo,
 702                           uint32_t offset_in_bytes,
 703                           uint32_t report_id)
 704 {
 705    assert(offset_in_bytes % 64 == 0);
 706
 707    /* Make sure the commands to take a snapshot fits in a single batch. */
 708    intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
 709                                    RENDER_RING);
 710    int batch_used = brw->batch.used;
 711
 712    /* Reports apparently don't always get written unless we flush first. */
 713    intel_batchbuffer_emit_mi_flush(brw);
 714
 715    if (brw->gen == 5) {
 716       /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all
 717        * the counters.  The report ID is ignored in the second set.
 718        */
 719       BEGIN_BATCH(6);
 720       OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0);
 721       OUT_RELOC(bo,
 722                 I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 723                 offset_in_bytes);
 724       OUT_BATCH(report_id);
 725
 726       OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1);
 727       OUT_RELOC(bo,
 728                 I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 729                 offset_in_bytes + 64);
 730       OUT_BATCH(report_id);
 731       ADVANCE_BATCH();
 732    } else if (brw->gen == 6) {
 733       BEGIN_BATCH(3);
 734       OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
 735       OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 736                 offset_in_bytes | MI_COUNTER_ADDRESS_GTT);
 737       OUT_BATCH(report_id);
 738       ADVANCE_BATCH();
 739    } else if (brw->gen == 7) {
 740       BEGIN_BATCH(3);
 741       OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
 742       OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 743                 offset_in_bytes);
 744       OUT_BATCH(report_id);
 745       ADVANCE_BATCH();
 746    } else {
 747       assert(!"Unsupported generation for performance counters.");
 748    }
 749
 750    /* Reports apparently don't always get written unless we flush after. */
 751    intel_batchbuffer_emit_mi_flush(brw);
 752
 753    (void) batch_used;
 754    assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
 755 }
 756
 757 /**
 758  * Add a monitor to the global list of "unresolved monitors."
 759  *
 760  * Monitors are "unresolved" if they refer to OA counter snapshots in
 761  * bookend_bo.  Results (even partial ones) must be gathered for all
 762  * unresolved monitors before it's safe to discard bookend_bo.
 763  */
 764 static void
 765 add_to_unresolved_monitor_list(struct brw_context *brw,
 766                                struct brw_perf_monitor_object *monitor)
 767 {
 768    if (brw->perfmon.unresolved_elements >=
 769        brw->perfmon.unresolved_array_size) {
 770       brw->perfmon.unresolved_array_size *= 2;
 771       brw->perfmon.unresolved = reralloc(brw, brw->perfmon.unresolved,
 772                                          struct brw_perf_monitor_object *,
 773                                          brw->perfmon.unresolved_array_size);
 774    }
 775
 776    brw->perfmon.unresolved[brw->perfmon.unresolved_elements++] = monitor;
 777 }
 778
 779 /**
 780  * If possible, throw away the contents of bookend BO.
 781  *
 782  * When all monitoring stops, and no monitors need data from bookend_bo to
 783  * compute results, we can discard it and start writing snapshots at the
 784  * beginning again.  This helps reduce the amount of buffer wraparound.
 785  */
 786 static void
 787 clean_bookend_bo(struct brw_context *brw)
 788 {
 789    if (brw->perfmon.unresolved_elements == 0) {
 790       DBG("***Resetting bookend snapshots to 0\n");
 791       brw->perfmon.bookend_snapshots = 0;
 792    }
 793 }
 794
 795 /**
 796  * Remove a monitor from the global list of "unresolved monitors."
 797  *
 798  * This can happen when:
 799  * - We finish computing a completed monitor's results.
 800  * - We discard unwanted monitor results.
 801  * - A monitor's results can be computed without relying on bookend_bo.
 802  */
 803 static void
 804 drop_from_unresolved_monitor_list(struct brw_context *brw,
 805                                   struct brw_perf_monitor_object *monitor)
 806 {
 807    for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
 808       if (brw->perfmon.unresolved[i] == monitor) {
 809          int last_elt = --brw->perfmon.unresolved_elements;
 810
 811          if (i == last_elt) {
 812             brw->perfmon.unresolved[i] = NULL;
 813          } else {
 814             brw->perfmon.unresolved[i] = brw->perfmon.unresolved[last_elt];
 815          }
 816
 817          clean_bookend_bo(brw);
 818          return;
 819       }
 820    }
 821 }
 822
 823 /**
 824  * Given pointers to starting and ending OA snapshots, add the deltas for each
 825  * counter to the results.
 826  */
 827 static void
 828 add_deltas(struct brw_context *brw,
 829            struct brw_perf_monitor_object *monitor,
 830            uint32_t *start, uint32_t *end)
 831 {
 832    /* Look for expected report ID values to ensure data is present. */
 833    assert(start[0] == REPORT_ID);
 834    assert(end[0] == REPORT_ID);
 835
 836    /* Subtract each counter's ending and starting values, then add the
 837     * difference to the counter's value so far.
 838     */
 839    for (int i = 3; i < brw->perfmon.entries_per_oa_snapshot; i++) {
 840       /* When debugging, it's useful to note when the ending value is less than
 841        * the starting value; aggregating counters should always increase in
 842        * value (or remain unchanged).  This happens periodically due to
 843        * wraparound, but can also indicate serious problems.
 844        */
 845 #ifdef DEBUG
 846       if (end[i] < start[i]) {
 847          int counter = brw->perfmon.oa_snapshot_layout[i];
 848          if (counter >= 0) {
 849             DBG("WARNING: \"%s\" ending value was less than the starting "
 850                 "value: %u < %u (end - start = %u)\n",
 851                 brw->ctx.PerfMonitor.Groups[0].Counters[counter].Name,
 852                 end[i], start[i], end[i] - start[i]);
 853          }
 854       }
 855 #endif
 856       monitor->oa_results[i] += end[i] - start[i];
 857    }
 858 }
 859
 860 /**
 861  * Gather OA counter results (partial or full) from a series of snapshots.
 862  *
 863  * Monitoring can start or stop at any time, likely at some point mid-batch.
 864  * We write snapshots for both events, storing them in monitor->oa_bo.
 865  *
 866  * Ideally, we would simply subtract those two snapshots to obtain the final
 867  * counter results.  Unfortunately, our hardware doesn't preserve their values
 868  * across context switches or GPU sleep states.  In order to support multiple
 869  * concurrent OA clients, as well as reliable data across power management,
 870  * we have to take snapshots at the start and end of batches as well.
 871  *
 872  * This results in a three-part sequence of (start, end) intervals:
 873  * - The "head" is from the BeginPerfMonitor snapshot to the end of the first
 874  *   batchbuffer.
 875  * - The "middle" is a series of (batch start, batch end) snapshots which
 876  *   bookend any batchbuffers between the ones which start/end monitoring.
 877  * - The "tail" is from the start of the last batch where monitoring was
 878  *   active to the EndPerfMonitor snapshot.
 879  *
 880  * Due to wrapping in the bookend BO, we may have to accumulate partial results.
 881  * If so, we handle the "head" and any "middle" results so far.  When monitoring
 882  * eventually ends, we handle additional "middle" batches and the "tail."
 883  */
 884 static void
 885 gather_oa_results(struct brw_context *brw,
 886                   struct brw_perf_monitor_object *monitor,
 887                   uint32_t *bookend_buffer)
 888 {
 889    struct gl_perf_monitor_object *m = &monitor->base;
 890    assert(monitor->oa_bo != NULL);
 891
 892    drm_intel_bo_map(monitor->oa_bo, false);
 893    uint32_t *monitor_buffer = monitor->oa_bo->virtual;
 894
 895    /* If monitoring was entirely contained within a single batch, then the
 896     * bookend BO is irrelevant.  Just subtract monitor->bo's two snapshots.
 897     */
 898    if (monitor->oa_middle_start == -1) {
 899       add_deltas(brw, monitor,
 900                  monitor_buffer,
 901                  monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
 902                                    sizeof(uint32_t)));
 903       drm_intel_bo_unmap(monitor->oa_bo);
 904       return;
 905    }
 906
 907    const int snapshot_size = brw->perfmon.entries_per_oa_snapshot;
 908
 909    /* First, add the contributions from the "head" interval:
 910     * (snapshot taken at BeginPerfMonitor time,
 911     *  snapshot taken at the end of the first batch after monitoring began)
 912     */
 913    if (monitor->oa_head_end != -1) {
 914       assert(monitor->oa_head_end < brw->perfmon.bookend_snapshots);
 915       add_deltas(brw, monitor,
 916                  monitor_buffer,
 917                  bookend_buffer + snapshot_size * monitor->oa_head_end);
 918
 919       /* Make sure we don't count the "head" again in the future. */
 920       monitor->oa_head_end = -1;
 921    }
 922
 923    /* Next, count the contributions from the "middle" batches.  These are
 924     * (batch begin, batch end) deltas while monitoring was active.
 925     */
 926    int last_snapshot;
 927    if (m->Ended)
 928       last_snapshot = monitor->oa_tail_start;
 929    else
 930       last_snapshot = brw->perfmon.bookend_snapshots;
 931
 932    for (int s = monitor->oa_middle_start; s < last_snapshot; s += 2) {
 933       add_deltas(brw, monitor,
 934                  bookend_buffer + snapshot_size * s,
 935                  bookend_buffer + snapshot_size * (s + 1));
 936    }
 937
 938    /* Finally, if the monitor has ended, we need to count the contributions of
 939     * the "tail" interval:
 940     * (start of the batch where monitoring ended, EndPerfMonitor snapshot)
 941     */
 942    if (m->Ended) {
 943       assert(monitor->oa_tail_start != -1);
 944       add_deltas(brw, monitor,
 945                  bookend_buffer + snapshot_size * monitor->oa_tail_start,
 946                  monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
 947                                    sizeof(uint32_t)));
 948    }
 949
 950    drm_intel_bo_unmap(monitor->oa_bo);
 951
 952    /* If the monitor has ended, then we've gathered all the results, and
 953     * can free the monitor's OA BO.
 954     */
 955    if (m->Ended) {
 956       drm_intel_bo_unreference(monitor->oa_bo);
 957       monitor->oa_bo = NULL;
 958
 959       /* The monitor's OA result is now resolved. */
 960       DBG("Marking %d resolved - results gathered\n", m->Name);
 961       drop_from_unresolved_monitor_list(brw, monitor);
 962    }
 963 }
 964
 965 /**
 966  * Handle running out of space in the bookend BO.
 967  *
 968  * When we run out of space in the bookend BO, we need to gather up partial
 969  * results for every unresolved monitor.  This allows us to free the snapshot
 970  * data in bookend_bo, freeing up the space for reuse.  We call this "wrapping."
 971  *
 972  * This will completely compute the result for any unresolved monitors that
 973  * have ended.
 974  */
 975 static void
 976 wrap_bookend_bo(struct brw_context *brw)
 977 {
 978    DBG("****Wrap bookend BO****\n");
 979    /* Note that wrapping will only occur at the start of a batch, since that's
 980     * where we reserve space.  So the current batch won't reference bookend_bo
 981     * or any monitor BOs.  This means we don't need to worry about
 982     * synchronization.
 983     *
 984     * Also, EndPerfMonitor guarantees that only monitors which span multiple
 985     * batches exist in the unresolved monitor list.
 986     */
 987    assert(brw->perfmon.oa_users > 0);
 988
 989    drm_intel_bo_map(brw->perfmon.bookend_bo, false);
 990    uint32_t *bookend_buffer = brw->perfmon.bookend_bo->virtual;
 991    for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
 992       struct brw_perf_monitor_object *monitor = brw->perfmon.unresolved[i];
 993       struct gl_perf_monitor_object *m = &monitor->base;
 994
 995       gather_oa_results(brw, monitor, bookend_buffer);
 996
 997       if (m->Ended) {
 998          /* gather_oa_results() dropped the monitor from the unresolved list,
 999           * throwing our indices off by one.
1000           */
1001          --i;
1002       } else {
1003          /* When we create the new bookend_bo, snapshot #0 will be the
1004           * beginning of another "middle" BO.
1005           */
1006          monitor->oa_middle_start = 0;
1007          assert(monitor->oa_head_end == -1);
1008          assert(monitor->oa_tail_start == -1);
1009       }
1010    }
1011    drm_intel_bo_unmap(brw->perfmon.bookend_bo);
1012
1013    brw->perfmon.bookend_snapshots = 0;
1014 }
1015
1016 /* This is fairly arbitrary; the trade off is memory usage vs. extra overhead
1017  * from wrapping.  On Gen7, 32768 should be enough for for 128 snapshots before
1018  * wrapping (since each is 256 bytes).
1019  */
1020 #define BOOKEND_BO_SIZE_BYTES 32768
1021
1022 /**
1023  * Check whether bookend_bo has space for a given number of snapshots.
1024  */
1025 static bool
1026 has_space_for_bookend_snapshots(struct brw_context *brw, int snapshots)
1027 {
1028    int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t);
1029
1030    /* There are brw->perfmon.bookend_snapshots - 1 existing snapshots. */
1031    int total_snapshots = (brw->perfmon.bookend_snapshots - 1) + snapshots;
1032
1033    return total_snapshots * snapshot_bytes < BOOKEND_BO_SIZE_BYTES;
1034 }
1035
1036 /**
1037  * Write an OA counter snapshot to bookend_bo.
1038  */
1039 static void
1040 emit_bookend_snapshot(struct brw_context *brw)
1041 {
1042    int snapshot_bytes = brw->perfmon.entries_per_oa_snapshot * sizeof(uint32_t);
1043    int offset_in_bytes = brw->perfmon.bookend_snapshots * snapshot_bytes;
1044
1045    emit_mi_report_perf_count(brw, brw->perfmon.bookend_bo, offset_in_bytes,
1046                              REPORT_ID);
1047    ++brw->perfmon.bookend_snapshots;
1048 }
1049
1050 /******************************************************************************/
1051
1052 /**
1053  * Initialize a monitor to sane starting state; throw away old buffers.
1054  */
1055 static void
1056 reinitialize_perf_monitor(struct brw_context *brw,
1057                           struct brw_perf_monitor_object *monitor)
1058 {
1059    if (monitor->oa_bo) {
1060       drm_intel_bo_unreference(monitor->oa_bo);
1061       monitor->oa_bo = NULL;
1062    }
1063
1064    /* Since the results are now invalid, we don't need to hold on to any
1065     * snapshots in bookend_bo.  The monitor is effectively "resolved."
1066     */
1067    drop_from_unresolved_monitor_list(brw, monitor);
1068
1069    monitor->oa_head_end = -1;
1070    monitor->oa_middle_start = -1;
1071    monitor->oa_tail_start = -1;
1072
1073    free(monitor->oa_results);
1074    monitor->oa_results = NULL;
1075
1076    if (monitor->pipeline_stats_bo) {
1077       drm_intel_bo_unreference(monitor->pipeline_stats_bo);
1078       monitor->pipeline_stats_bo = NULL;
1079    }
1080
1081    free(monitor->pipeline_stats_results);
1082    monitor->pipeline_stats_results = NULL;
1083 }
1084
1085 /**
1086  * Driver hook for glBeginPerformanceMonitorAMD().
1087  */
1088 static GLboolean
1089 brw_begin_perf_monitor(struct gl_context *ctx,
1090                        struct gl_perf_monitor_object *m)
1091 {
1092    struct brw_context *brw = brw_context(ctx);
1093    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1094
1095    DBG("Begin(%d)\n", m->Name);
1096
1097    reinitialize_perf_monitor(brw, monitor);
1098
1099    if (monitor_needs_oa(brw, m)) {
1100       /* If the global OA bookend BO doesn't exist, allocate it.  This should
1101        * only happen once, but we delay until BeginPerfMonitor time to avoid
1102        * wasting memory for contexts that don't use performance monitors.
1103        */
1104       if (!brw->perfmon.bookend_bo) {
1105          brw->perfmon.bookend_bo = drm_intel_bo_alloc(brw->bufmgr,
1106                                                       "OA bookend BO",
1107                                                       BOOKEND_BO_SIZE_BYTES, 64);
1108       }
1109
1110       monitor->oa_bo =
1111          drm_intel_bo_alloc(brw->bufmgr, "perf. monitor OA bo", 4096, 64);
1112 #ifdef DEBUG
1113       /* Pre-filling the BO helps debug whether writes landed. */
1114       drm_intel_bo_map(monitor->oa_bo, true);
1115       memset((char *) monitor->oa_bo->virtual, 0xff, 4096);
1116       drm_intel_bo_unmap(monitor->oa_bo);
1117 #endif
1118
1119       /* Allocate storage for accumulated OA counter values. */
1120       monitor->oa_results =
1121          calloc(brw->perfmon.entries_per_oa_snapshot, sizeof(uint32_t));
1122
1123       /* If the OA counters aren't already on, enable them. */
1124       if (brw->perfmon.oa_users == 0) {
1125          /* Ensure the OACONTROL enable and snapshot land in the same batch. */
1126          int space = (MI_REPORT_PERF_COUNT_BATCH_DWORDS + 3) * 4;
1127          intel_batchbuffer_require_space(brw, space, RENDER_RING);
1128          start_oa_counters(brw);
1129       }
1130
1131       /* Take a starting OA counter snapshot. */
1132       emit_mi_report_perf_count(brw, monitor->oa_bo, 0, REPORT_ID);
1133
1134       monitor->oa_head_end = brw->perfmon.bookend_snapshots;
1135       monitor->oa_middle_start = brw->perfmon.bookend_snapshots + 1;
1136       monitor->oa_tail_start = -1;
1137
1138       /* Add the monitor to the unresolved list. */
1139       add_to_unresolved_monitor_list(brw, monitor);
1140
1141       ++brw->perfmon.oa_users;
1142    }
1143
1144    if (monitor_needs_statistics_registers(brw, m)) {
1145       monitor->pipeline_stats_bo =
1146          drm_intel_bo_alloc(brw->bufmgr, "perf. monitor stats bo", 4096, 64);
1147
1148       /* Take starting snapshots. */
1149       snapshot_statistics_registers(brw, monitor, 0);
1150    }
1151
1152    return true;
1153 }
1154
1155 /**
1156  * Driver hook for glEndPerformanceMonitorAMD().
1157  */
1158 static void
1159 brw_end_perf_monitor(struct gl_context *ctx,
1160                      struct gl_perf_monitor_object *m)
1161 {
1162    struct brw_context *brw = brw_context(ctx);
1163    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1164
1165    DBG("End(%d)\n", m->Name);
1166
1167    if (monitor_needs_oa(brw, m)) {
1168       /* Take an ending OA counter snapshot. */
1169       emit_mi_report_perf_count(brw, monitor->oa_bo,
1170                                 SECOND_SNAPSHOT_OFFSET_IN_BYTES, REPORT_ID);
1171
1172       --brw->perfmon.oa_users;
1173
1174       if (brw->perfmon.oa_users == 0)
1175          stop_oa_counters(brw);
1176
1177       if (monitor->oa_head_end == brw->perfmon.bookend_snapshots) {
1178          assert(monitor->oa_head_end != -1);
1179          /* We never actually wrote the snapshot for the end of the first batch
1180           * after BeginPerfMonitor.  This means that monitoring was contained
1181           * entirely within a single batch, so we can ignore bookend_bo and
1182           * just compare the monitor's begin/end snapshots directly.
1183           */
1184          monitor->oa_head_end = -1;
1185          monitor->oa_middle_start = -1;
1186          monitor->oa_tail_start = -1;
1187
1188          /* We can also mark it resolved since it won't depend on bookend_bo. */
1189          DBG("Marking %d resolved - entirely in one batch\n", m->Name);
1190          drop_from_unresolved_monitor_list(brw, monitor);
1191       } else {
1192          /* We've written at least one batch end snapshot, so the monitoring
1193           * spanned multiple batches.  Mark which snapshot corresponds to the
1194           * start of the current batch.
1195           */
1196          monitor->oa_tail_start = brw->perfmon.bookend_snapshots - 1;
1197       }
1198    }
1199
1200    if (monitor_needs_statistics_registers(brw, m)) {
1201       /* Take ending snapshots. */
1202       snapshot_statistics_registers(brw, monitor,
1203                                     SECOND_SNAPSHOT_OFFSET_IN_BYTES);
1204    }
1205 }
1206
1207 /**
1208  * Reset a performance monitor, throwing away any results.
1209  */
1210 static void
1211 brw_reset_perf_monitor(struct gl_context *ctx,
1212                        struct gl_perf_monitor_object *m)
1213 {
1214    struct brw_context *brw = brw_context(ctx);
1215    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1216
1217    reinitialize_perf_monitor(brw, monitor);
1218
1219    if (m->Active) {
1220       brw_begin_perf_monitor(ctx, m);
1221    }
1222 }
1223
1224 /**
1225  * Is a performance monitor result available?
1226  */
1227 static GLboolean
1228 brw_is_perf_monitor_result_available(struct gl_context *ctx,
1229                                      struct gl_perf_monitor_object *m)
1230 {
1231    struct brw_context *brw = brw_context(ctx);
1232    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1233
1234    bool oa_available = true;
1235    bool stats_available = true;
1236
1237    if (monitor_needs_oa(brw, m)) {
1238       oa_available = !monitor->oa_bo ||
1239          (!drm_intel_bo_references(brw->batch.bo, monitor->oa_bo) &&
1240           !drm_intel_bo_busy(monitor->oa_bo));
1241    }
1242
1243    if (monitor_needs_statistics_registers(brw, m)) {
1244       stats_available = !monitor->pipeline_stats_bo ||
1245          (!drm_intel_bo_references(brw->batch.bo, monitor->pipeline_stats_bo) &&
1246           !drm_intel_bo_busy(monitor->pipeline_stats_bo));
1247    }
1248
1249    return oa_available && stats_available;
1250 }
1251
1252 /**
1253  * Get the performance monitor result.
1254  */
1255 static void
1256 brw_get_perf_monitor_result(struct gl_context *ctx,
1257                             struct gl_perf_monitor_object *m,
1258                             GLsizei data_size,
1259                             GLuint *data,
1260                             GLint *bytes_written)
1261 {
1262    struct brw_context *brw = brw_context(ctx);
1263    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1264
1265    DBG("GetResult(%d)\n", m->Name);
1266    brw_dump_perf_monitors(brw);
1267
1268    /* This hook should only be called when results are available. */
1269    assert(m->Ended);
1270
1271    /* Copy data to the supplied array (data).
1272     *
1273     * The output data format is: <group ID, counter ID, value> for each
1274     * active counter.  The API allows counters to appear in any order.
1275     */
1276    GLsizei offset = 0;
1277
1278    if (monitor_needs_oa(brw, m)) {
1279       /* Gather up the results from the BO, unless we already did due to the
1280        * bookend BO wrapping.
1281        */
1282       if (monitor->oa_bo) {
1283          /* Since the result is available, all the necessary snapshots will
1284           * have been written to the bookend BO.  If other monitors are
1285           * active, the bookend BO may be busy or referenced by the current
1286           * batch, but only for writing snapshots beyond oa_tail_start,
1287           * which we don't care about.
1288           *
1289           * Using an unsynchronized mapping avoids stalling for an
1290           * indeterminate amount of time.
1291           */
1292          drm_intel_gem_bo_map_unsynchronized(brw->perfmon.bookend_bo);
1293
1294          gather_oa_results(brw, monitor, brw->perfmon.bookend_bo->virtual);
1295
1296          drm_intel_bo_unmap(brw->perfmon.bookend_bo);
1297       }
1298
1299       for (int i = 0; i < brw->perfmon.entries_per_oa_snapshot; i++) {
1300          int group = OA_COUNTERS;
1301          int counter = brw->perfmon.oa_snapshot_layout[i];
1302
1303          /* We always capture all the OA counters, but the application may
1304           * have only asked for a subset.  Skip unwanted counters.
1305           */
1306          if (counter < 0 || !BITSET_TEST(m->ActiveCounters[group], counter))
1307             continue;
1308
1309          data[offset++] = group;
1310          data[offset++] = counter;
1311          data[offset++] = monitor->oa_results[i];
1312       }
1313
1314       clean_bookend_bo(brw);
1315    }
1316
1317    if (monitor_needs_statistics_registers(brw, m)) {
1318       const int num_counters =
1319          ctx->PerfMonitor.Groups[PIPELINE_STATS_COUNTERS].NumCounters;
1320
1321       if (!monitor->pipeline_stats_results)
1322          gather_statistics_results(brw, monitor);
1323
1324       for (int i = 0; i < num_counters; i++) {
1325          if (BITSET_TEST(m->ActiveCounters[PIPELINE_STATS_COUNTERS], i)) {
1326             data[offset++] = PIPELINE_STATS_COUNTERS;
1327             data[offset++] = i;
1328             *((uint64_t *) (&data[offset])) = monitor->pipeline_stats_results[i];
1329             offset += 2;
1330          }
1331       }
1332    }
1333
1334    if (bytes_written)
1335       *bytes_written = offset * sizeof(uint32_t);
1336 }
1337
1338 /**
1339  * Create a new performance monitor object.
1340  */
1341 static struct gl_perf_monitor_object *
1342 brw_new_perf_monitor(struct gl_context *ctx)
1343 {
1344    return calloc(1, sizeof(struct brw_perf_monitor_object));
1345 }
1346
1347 /**
1348  * Delete a performance monitor object.
1349  */
1350 static void
1351 brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
1352 {
1353    struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
1354    DBG("Delete(%d)\n", m->Name);
1355    reinitialize_perf_monitor(brw_context(ctx), monitor);
1356    free(monitor);
1357 }
1358
1359 /******************************************************************************/
1360
1361 /**
1362  * Called at the start of every render ring batch.
1363  *
1364  * Enable OA counters and emit the "start of batchbuffer" bookend OA snapshot.
1365  * Since it's a new batch, there will be plenty of space for the commands.
1366  */
1367 void
1368 brw_perf_monitor_new_batch(struct brw_context *brw)
1369 {
1370    assert(brw->batch.ring == RENDER_RING);
1371    assert(brw->gen < 6 || brw->batch.used == 0);
1372
1373    if (brw->perfmon.oa_users == 0)
1374       return;
1375
1376    start_oa_counters(brw);
1377
1378    /* Make sure bookend_bo has enough space for a pair of snapshots.
1379     * If not, "wrap" the BO: gather up any results so far, and start from
1380     * the beginning of the buffer.  Reserving a pair guarantees that wrapping
1381     * will only happen at the beginning of a batch, where it's safe to map BOs
1382     * (as the batch is empty and can't refer to any of them yet).
1383     */
1384    if (!has_space_for_bookend_snapshots(brw, 2))
1385       wrap_bookend_bo(brw);
1386
1387    DBG("Bookend Begin Snapshot (%d)\n", brw->perfmon.bookend_snapshots);
1388    emit_bookend_snapshot(brw);
1389 }
1390
1391 /**
1392  * Called at the end of every render ring batch.
1393  *
1394  * Emit the "end of batchbuffer" bookend OA snapshot and disable the counters.
1395  *
1396  * This relies on there being enough space in BATCH_RESERVED.
1397  */
1398 void
1399 brw_perf_monitor_finish_batch(struct brw_context *brw)
1400 {
1401    assert(brw->batch.ring == RENDER_RING);
1402
1403    if (brw->perfmon.oa_users == 0)
1404       return;
1405
1406    DBG("Bookend End Snapshot (%d)\n", brw->perfmon.bookend_snapshots);
1407
1408    /* Not safe to wrap; should've reserved space already. */
1409    assert(has_space_for_bookend_snapshots(brw, 1));
1410
1411    emit_bookend_snapshot(brw);
1412
1413    stop_oa_counters(brw);
1414 }
1415
1416 /******************************************************************************/
1417
1418 void
1419 brw_init_performance_monitors(struct brw_context *brw)
1420 {
1421    struct gl_context *ctx = &brw->ctx;
1422
1423    ctx->Driver.NewPerfMonitor = brw_new_perf_monitor;
1424    ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor;
1425    ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor;
1426    ctx->Driver.EndPerfMonitor = brw_end_perf_monitor;
1427    ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor;
1428    ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available;
1429    ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result;
1430
1431    if (brw->gen == 5) {
1432       ctx->PerfMonitor.Groups = gen5_groups;
1433       ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups);
1434       brw->perfmon.oa_snapshot_layout = gen5_oa_snapshot_layout;
1435       brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen5_oa_snapshot_layout);
1436    } else if (brw->gen == 6) {
1437       ctx->PerfMonitor.Groups = gen6_groups;
1438       ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen6_groups);
1439       brw->perfmon.oa_snapshot_layout = gen6_oa_snapshot_layout;
1440       brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen6_oa_snapshot_layout);
1441       brw->perfmon.statistics_registers = gen6_statistics_register_addresses;
1442    } else if (brw->gen == 7) {
1443       ctx->PerfMonitor.Groups = gen7_groups;
1444       ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen7_groups);
1445       brw->perfmon.oa_snapshot_layout = gen7_oa_snapshot_layout;
1446       brw->perfmon.entries_per_oa_snapshot = ARRAY_SIZE(gen7_oa_snapshot_layout);
1447       brw->perfmon.statistics_registers = gen7_statistics_register_addresses;
1448    }
1449
1450    brw->perfmon.unresolved =
1451       ralloc_array(brw, struct brw_perf_monitor_object *, 1);
1452    brw->perfmon.unresolved_elements = 0;
1453    brw->perfmon.unresolved_array_size = 1;
1454 }