/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * @file iris_query.c
 *
 * Query object support.  This allows measuring various simple statistics
 * via counters on the GPU.
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "util/fast_idiv_by_const.h"
#include "util/u_inlines.h"
#include "iris_context.h"
#include "iris_defines.h"
#include "iris_resource.h"
#include "iris_screen.h"
#include "vulkan/util/vk_util.h"

#define IA_VERTICES_COUNT          0x2310
#define IA_PRIMITIVES_COUNT        0x2318
#define VS_INVOCATION_COUNT        0x2320
#define HS_INVOCATION_COUNT        0x2300
#define DS_INVOCATION_COUNT        0x2308
#define GS_INVOCATION_COUNT        0x2328
#define GS_PRIMITIVES_COUNT        0x2330
#define CL_INVOCATION_COUNT        0x2338
#define CL_PRIMITIVES_COUNT        0x2340
#define PS_INVOCATION_COUNT        0x2348
#define CS_INVOCATION_COUNT        0x2290
#define PS_DEPTH_COUNT             0x2350

#define SO_PRIM_STORAGE_NEEDED(n)  (0x5240 + (n) * 8)

#define SO_NUM_PRIMS_WRITTEN(n)    (0x5200 + (n) * 8)

#define MI_MATH (0x1a << 23)

#define MI_ALU_LOAD      0x080
#define MI_ALU_LOADINV   0x480
#define MI_ALU_LOAD0     0x081
#define MI_ALU_LOAD1     0x481
#define MI_ALU_ADD       0x100
#define MI_ALU_SUB       0x101
#define MI_ALU_AND       0x102
#define MI_ALU_OR        0x103
#define MI_ALU_XOR       0x104
#define MI_ALU_STORE     0x180
#define MI_ALU_STOREINV  0x580

#define MI_ALU_R0        0x00
#define MI_ALU_R1        0x01
#define MI_ALU_R2        0x02
#define MI_ALU_R3        0x03
#define MI_ALU_R4        0x04
#define MI_ALU_SRCA      0x20
#define MI_ALU_SRCB     0x21
#define MI_ALU_ACCU      0x31
#define MI_ALU_ZF        0x32
#define MI_ALU_CF        0x33

#define _MI_ALU(op, x, y)  (((op) << 20) | ((x) << 10) | (y))

#define _MI_ALU0(op)       _MI_ALU(MI_ALU_##op, 0, 0)
#define _MI_ALU1(op, x)    _MI_ALU(MI_ALU_##op, x, 0)
#define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)

#define MI_ALU0(op)        _MI_ALU0(op)
#define MI_ALU1(op, x)     _MI_ALU1(op, MI_ALU_##x)
#define MI_ALU2(op, x, y)  _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)

#define emit_lri32 ice->vtbl.load_register_imm32
#define emit_lri64 ice->vtbl.load_register_imm64
#define emit_lrr32 ice->vtbl.load_register_reg32

struct iris_query {
   enum pipe_query_type type;
   int index;

   bool ready;

   bool stalled;

   uint64_t result;

   struct iris_bo *bo;
   struct iris_query_snapshots *map;

   int batch_idx;
};

struct iris_query_snapshots {
   /** iris_render_condition's saved MI_PREDICATE_DATA value. */
   uint64_t predicate_data;

   /** Have the start/end snapshots landed? */
   uint64_t snapshots_landed;

   /** Starting and ending counter snapshots */
   uint64_t start;
   uint64_t end;
};

struct iris_query_so_overflow {
   uint64_t predicate_data;
   uint64_t snapshots_landed;

   struct {
      uint64_t prim_storage_needed[2];
      uint64_t num_prims[2];
   } stream[4];
};

/**
 * Is this type of query written by PIPE_CONTROL?
 */
static bool
iris_is_query_pipelined(struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_TIME_ELAPSED:
      return true;

   default:
      return false;
   }
}

static void
mark_available(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
   unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);

   if (!iris_is_query_pipelined(q)) {
      ice->vtbl.store_data_imm64(batch, q->bo, offset, true);
   } else {
      /* Order available *after* the query results. */
      flags |= PIPE_CONTROL_FLUSH_ENABLE;
      iris_emit_pipe_control_write(batch, flags, q->bo, offset, true);
   }
}

/**
 * Write a counter snapshot (e.g. PS_DEPTH_COUNT or a timestamp) to q->bo
 * at the given offset via a PIPE_CONTROL.
 */
static void
iris_pipelined_write(struct iris_batch *batch,
                     struct iris_query *q,
                     enum pipe_control_flags flags,
                     unsigned offset)
{
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   const unsigned optional_cs_stall =
      devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;

   iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
                                q->bo, offset, 0ull);
}

static void
write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;

   if (!iris_is_query_pipelined(q)) {
      iris_emit_pipe_control_flush(batch,
                                   PIPE_CONTROL_CS_STALL |
                                   PIPE_CONTROL_STALL_AT_SCOREBOARD);
      q->stalled = true;
   }

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      if (devinfo->gen >= 10) {
         /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
          *  bit set prior to programming a PIPE_CONTROL with Write PS Depth
          *  Count sync operation."
          */
         iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
      }
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_DEPTH_COUNT |
                           PIPE_CONTROL_DEPTH_STALL,
                           offset);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_TIMESTAMP,
                           offset);
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      ice->vtbl.store_register_mem64(batch,
                                     q->index == 0 ? CL_INVOCATION_COUNT :
                                     SO_PRIM_STORAGE_NEEDED(q->index),
                                     q->bo, offset, false);
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      ice->vtbl.store_register_mem64(batch,
                                     SO_NUM_PRIMS_WRITTEN(q->index),
                                     q->bo, offset, false);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
      static const uint32_t index_to_reg[] = {
         IA_VERTICES_COUNT,
         IA_PRIMITIVES_COUNT,
         VS_INVOCATION_COUNT,
         GS_INVOCATION_COUNT,
         GS_PRIMITIVES_COUNT,
         CL_INVOCATION_COUNT,
         CL_PRIMITIVES_COUNT,
         PS_INVOCATION_COUNT,
         HS_INVOCATION_COUNT,
         DS_INVOCATION_COUNT,
         CS_INVOCATION_COUNT,
      };
      const uint32_t reg = index_to_reg[q->index];

      ice->vtbl.store_register_mem64(batch, reg, q->bo, offset, false);
      break;
   }
   default:
      assert(false);
   }
}

static void
write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;

   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
   for (uint32_t i = 0; i < count; i++) {
      int s = q->index + i;
      int g_idx = offsetof(struct iris_query_so_overflow,
                           stream[s].num_prims[end]);
      int w_idx = offsetof(struct iris_query_so_overflow,
                           stream[s].prim_storage_needed[end]);
      ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
                                     q->bo, g_idx, false);
      ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
                                     q->bo, w_idx, false);
   }
}

uint64_t
iris_timebase_scale(const struct gen_device_info *devinfo,
                    uint64_t gpu_timestamp)
{
   return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
}

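/* Worked example (illustrative values, not taken from any particular part):
 * with a command streamer timestamp frequency of 12 MHz
 * (devinfo->timestamp_frequency == 12000000), a raw delta of 24,000,000
 * ticks scales to (1000000000ull * 24000000) / 12000000 == 2,000,000,000 ns,
 * i.e. two seconds of GPU time.
 */
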
static uint64_t
iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
{
   if (time0 > time1) {
      return (1ULL << TIMESTAMP_BITS) + time1 - time0;
   } else {
      return time1 - time0;
   }
}

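/* Illustration of the wraparound handling above: the raw timestamp counter
 * is only TIMESTAMP_BITS wide, so it can wrap between the two snapshots.
 * If time0 == (1ull << TIMESTAMP_BITS) - 100 and time1 == 50, the true
 * elapsed tick count is 150, which is exactly what the time0 > time1 branch
 * computes: (1ull << TIMESTAMP_BITS) + 50 - ((1ull << TIMESTAMP_BITS) - 100).
 */
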
static bool
stream_overflowed(struct iris_query_so_overflow *so, int s)
{
   return (so->stream[s].prim_storage_needed[1] -
           so->stream[s].prim_storage_needed[0]) !=
          (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
}

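/* Rationale for the comparison above: while a stream-output buffer has room,
 * SO_NUM_PRIMS_WRITTEN and SO_PRIM_STORAGE_NEEDED advance in lockstep; once
 * the buffer overflows, only SO_PRIM_STORAGE_NEEDED keeps counting.  So the
 * two deltas over the query interval differ exactly when at least one
 * primitive could not be written.
 */
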
static void
calculate_result_on_cpu(const struct gen_device_info *devinfo,
                        struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      q->result = q->map->end != q->map->start;
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* The timestamp is the single starting snapshot. */
      q->result = iris_timebase_scale(devinfo, q->map->start);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
      q->result = iris_timebase_scale(devinfo, q->result);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      q->result = stream_overflowed((void *) q->map, q->index);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      q->result = false;
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
         q->result |= stream_overflowed((void *) q->map, i);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
      q->result = q->map->end - q->map->start;

      /* WaDividePSInvocationCountBy4:HSW,BDW */
      if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
         q->result /= 4;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   default:
      q->result = q->map->end - q->map->start;
      break;
   }

   q->ready = true;
}

static void
emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
             unsigned reg_a, unsigned reg_b)
{
   uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));

   math[0] = MI_MATH | (5 - 2);
   math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
   math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
   math[3] = _MI_ALU0(ADD);
   math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
}

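/* For reference, each MI_MATH ALU dword above packs an opcode and two
 * operands via the _MI_ALU() macro at the top of this file, so the emitted
 * sequence is effectively:
 *
 *    LOAD  SRCA, reg_a       ((0x080 << 20) | (0x20 << 10) | reg_a)
 *    LOAD  SRCB, reg_b       ((0x080 << 20) | (0x21 << 10) | reg_b)
 *    ADD                     ( 0x100 << 20 )
 *    STORE dst_reg, ACCU     ((0x180 << 20) | (dst_reg << 10) | 0x31)
 *
 * i.e. load both sources, add them into the accumulator, and store the
 * accumulator back into the destination GPR.
 */
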
static void
emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
             unsigned src_reg, unsigned shift)
{
   assert(shift > 0);

   int dwords = 1 + 4 * shift;

   uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);

   math[0] = MI_MATH | ((1 + 4 * shift) - 2);

   for (unsigned i = 0; i < shift; i++) {
      unsigned add_src = (i == 0) ? src_reg : dst_reg;
      math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
      math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
      math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
      math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
   }
}

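/* The MI ALU has no shift instruction, so emit_alu_shl() implements a left
 * shift by N as N successive doublings (x + x == x << 1).  For example,
 * shift == 3 emits three LOAD/LOAD/ADD/STORE groups and computes
 * ((x * 2) * 2) * 2 == x << 3.
 */
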
/* Emit dwords to multiply GPR0 by N */
static void
build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
{
   VK_OUTARRAY_MAKE(out, dw, dw_count);

#define APPEND_ALU(op, x, y) \
   vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)

   assert(N > 0);
   unsigned top_bit = 31 - __builtin_clz(N);
   for (int i = top_bit - 1; i >= 0; i--) {
      /* We get our initial data in GPR0 and we write the final data out to
       * GPR0 but we use GPR1 as our scratch register.
       */
      unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
      unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;

      /* Shift the current value left by 1 */
      APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
      APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
      APPEND_ALU(ADD, 0, 0);

      if (N & (1 << i)) {
         /* Store ACCU to R1 and add R0 to R1 */
         APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
         APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
         APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
         APPEND_ALU(ADD, 0, 0);
      }

      APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
   }

#undef APPEND_ALU
}

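/* Worked example (illustrative): for N == 5 (binary 101, top_bit == 2), the
 * loop above runs for i == 1 and i == 0 and emits, in effect:
 *
 *    i == 1:  ACCU = R0 + R0;  R1 = ACCU        (2 * GPR0; bit 1 is clear)
 *    i == 0:  ACCU = R1 + R1                    (4 * GPR0)
 *             R1 = ACCU;  ACCU = R0 + R1        (bit 0 is set: add original)
 *             R0 = ACCU                         (5 * GPR0)
 *
 * which is ordinary shift-and-add binary multiplication, using R1 as the
 * scratch register.
 */
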
static void
emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
{
   uint32_t num_dwords;
   build_alu_multiply_gpr0(NULL, &num_dwords, N);

   uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
   math[0] = MI_MATH | (num_dwords - 2);
   build_alu_multiply_gpr0(&math[1], &num_dwords, N);
}

void
iris_math_div32_gpr0(struct iris_context *ice,
                     struct iris_batch *batch,
                     uint32_t D)
{
   /* Zero out the top of GPR0 */
   emit_lri32(batch, CS_GPR(0) + 4, 0);

   if (D == 0) {
      /* Dividing by zero is invalid, but we should do something, so we
       * set GPR0 to 0.
       */
      emit_lri32(batch, CS_GPR(0), 0);
   } else if (util_is_power_of_two_or_zero(D)) {
      unsigned log2_D = util_logbase2(D);
      assert(log2_D < 32);
      /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and
       * taking the top 32 bits of the result.
       */
      emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);
   } else {
      struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
      assert(m.multiplier <= UINT32_MAX);

      if (m.pre_shift) {
         /* We right-shift by pre_shift by left-shifting by 32 - pre_shift
          * and taking the top 32 bits of the result.
          */
         if (m.pre_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }

      /* Do the 32x32 multiply into gpr0 */
      emit_mul_gpr0(batch, m.multiplier);

      if (m.increment) {
         /* Add one extra multiplier into GPR0, which is equivalent to
          * having multiplied by (n + 1).
          */
         emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
         emit_lri32(batch, CS_GPR(1) + 4, 0);
         emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
      }

      /* Shift by 32 */
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);

      if (m.post_shift) {
         /* We right-shift by post_shift by left-shifting by 32 - post_shift
          * and taking the top 32 bits of the result.
          */
         if (m.post_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }
   }
}

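/* The non-power-of-two path above is the usual fixed-point-reciprocal trick
 * for dividing by a constant, using the parameters chosen by
 * util_compute_fast_udiv_info().  Conceptually:
 *
 *    n / D == (((n >> pre_shift) + increment) * multiplier) >> (32 + post_shift)
 *
 * (adding one extra multiplier to the product, as done above, is the same as
 * multiplying n + 1).  Because the MI ALU cannot shift right, every right
 * shift is performed by shifting left into the upper half of the 64-bit GPR
 * and then copying the upper 32 bits back down into the lower half.
 */
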
/*
 * GPR0 = (GPR0 == 0) ? 0 : 1;
 */
static void
gpr0_to_bool(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);

   static const uint32_t math[] = {
      MI_MATH | (9 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU1(LOAD0, SRCB),
      MI_ALU0(ADD),
      MI_ALU2(STOREINV, R0, ZF),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}

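/* A note on the MI_MATH sequence above: adding GPR0 to zero updates the ALU
 * zero flag, and reading the ZF register yields all ones when that flag is
 * set (GPR0 == 0).  STOREINV writes the inverted value, so R0 becomes all
 * ones for a nonzero input and zero otherwise; the final AND with GPR1
 * (preloaded with 1) masks that down to exactly 0 or 1.
 */
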
static void
load_overflow_data_to_cs_gprs(struct iris_context *ice,
                              struct iris_query *q,
                              int idx)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[1]));

   ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[1]));
}

/*
 * R3 = R4 - R3;
 * R1 = R2 - R1;
 * R1 = R3 - R1;
 * R0 = R0 | R1;
 */
static void
calc_overflow_for_stream(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   static const uint32_t maths[] = {
      MI_MATH | (17 - 2),
      MI_ALU2(LOAD, SRCA, R4),
      MI_ALU2(LOAD, SRCB, R3),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R3, ACCU),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R3),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R1),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(OR),
      MI_ALU2(STORE, R0, ACCU),
   };

   iris_batch_emit(batch, maths, sizeof(maths));
}

static void
overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
      load_overflow_data_to_cs_gprs(ice, q, q->index);
      calc_overflow_for_stream(ice);
   } else {
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
         load_overflow_data_to_cs_gprs(ice, q, i);
         calc_overflow_for_stream(ice);
      }
   }

   gpr0_to_bool(ice);
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(ice, 34);

   static const uint32_t shl_math[] = {
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(ADD),
      MI_ALU2(STORE, R0, ACCU),
   };

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
   const uint32_t batch_len = cmd_len * outer_count;
   uint32_t *map = iris_get_command_space(batch, batch_len * 4);
   uint32_t offset = 0;
   for (int o = 0; o < outer_count; o++) {
      map[offset++] = MI_MATH | (cmd_len - 2);
      for (int i = 0; i < inner_count; i++) {
         memcpy(&map[offset], shl_math, sizeof(shl_math));
         offset += 4;
      }
   }
}

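/* The loop above emits outer_count * inner_count == 5 * 6 == 30 doublings
 * in total, i.e. GPR0 <<= 30, split across five MI_MATH packets.  Masking to
 * the low 34 bits beforehand keeps the shifted value within the 64-bit GPR
 * (34 + 30 == 64).
 */
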
/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   shl_gpr0_by_30_bits(ice);
   ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
}

/**
 * Calculate the result and store it to CS_GPR0.
 */
static void
calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &batch->screen->devinfo;

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
      overflow_result_to_gpr0(ice, q);
      return;
   }

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      ice->vtbl.load_register_mem64(batch, CS_GPR(0), q->bo,
                                    offsetof(struct iris_query_snapshots, start));
      /* TODO: This discards any fractional bits of the timebase scale.
       * We would need to do a bit of fixed point math on the CS ALU, or
       * launch an actual shader to calculate this with full precision.
       */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
      keep_gpr0_lower_n_bits(ice, 36);
      return;
   }

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
                                 offsetof(struct iris_query_snapshots, start));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
                                 offsetof(struct iris_query_snapshots, end));

   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));

   /* WaDividePSInvocationCountBy4:HSW,BDW */
   if (devinfo->gen == 8 &&
       q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
       q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
      shr_gpr0_by_2_bits(ice);

   if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
      gpr0_to_bool(ice);

   if (q->type == PIPE_QUERY_TIME_ELAPSED) {
      /* TODO: This discards fractional bits (see above). */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
   }
}

static struct pipe_query *
iris_create_query(struct pipe_context *ctx,
                  unsigned query_type,
                  unsigned index)
{
   struct iris_query *q = calloc(1, sizeof(struct iris_query));

   q->type = query_type;
   q->index = index;

   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
       q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
      q->batch_idx = IRIS_BATCH_COMPUTE;
   else
      q->batch_idx = IRIS_BATCH_RENDER;
   return (struct pipe_query *) q;
}

static void
iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
{
   struct iris_query *query = (void *) p_query;
   iris_bo_unreference(query->bo);
   free(query);
}


static boolean
iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_screen *screen = (void *) ctx->screen;
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   iris_bo_unreference(q->bo);
   q->bo = iris_bo_alloc(screen->bufmgr, "query object", 4096,
                         IRIS_MEMZONE_OTHER);
   if (!q->bo)
      return false;

   q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_WRITE | MAP_ASYNC);
   if (!q->map)
      return false;

   q->result = 0ull;
   q->ready = false;
   q->map->snapshots_landed = false;

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = true;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, false);
   else
      write_value(ice, q, offsetof(struct iris_query_snapshots, start));

   return true;
}

static bool
iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      iris_begin_query(ctx, query);
      mark_available(ice, q);
      return true;
   }

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = false;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, true);
   else
      write_value(ice, q, offsetof(struct iris_query_snapshots, end));
   mark_available(ice, q);

   return true;
}

/**
 * See if the snapshots have landed for a query, and if so, compute the
 * result and mark it ready.  Does not flush (unlike iris_get_query_result).
 */
static void
iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
{
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &screen->devinfo;

   if (!q->ready && q->map->snapshots_landed) {
      calculate_result_on_cpu(devinfo, q);
   }
}

static boolean
iris_get_query_result(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean wait,
                      union pipe_query_result *result)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_screen *screen = (void *) ctx->screen;
   const struct gen_device_info *devinfo = &screen->devinfo;

   if (!q->ready) {
      if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
         iris_batch_flush(&ice->batches[q->batch_idx]);

      if (!q->map->snapshots_landed) {
         if (wait)
            iris_bo_wait_rendering(q->bo);
         else
            return false;
      }

      assert(q->map->snapshots_landed);
      calculate_result_on_cpu(devinfo, q);
   }

   assert(q->ready);

   result->u64 = q->result;

   return true;
}

static void
iris_get_query_result_resource(struct pipe_context *ctx,
                               struct pipe_query *query,
                               boolean wait,
                               enum pipe_query_value_type result_type,
                               int index,
                               struct pipe_resource *p_res,
                               unsigned offset)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_resource *res = (void *) p_res;
   unsigned snapshots_landed_offset =
      offsetof(struct iris_query_snapshots, snapshots_landed);

   res->bind_history |= PIPE_BIND_QUERY_BUFFER;

   if (index == -1) {
      /* They're asking for the availability of the result.  If we still
       * have commands queued up which produce the result, submit them
       * now so that progress happens.  Either way, copy the snapshots
       * landed field to the destination resource.
       */
      if (iris_batch_references(batch, q->bo))
         iris_batch_flush(batch);

      ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
                             q->bo, snapshots_landed_offset,
                             result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
      return;
   }

   if (!q->ready && q->map->snapshots_landed) {
      /* The final snapshots happen to have landed, so let's just compute
       * the result on the CPU now...
       */
      calculate_result_on_cpu(devinfo, q);
   }

   if (q->ready) {
      /* We happen to have the result on the CPU, so just copy it. */
      if (result_type <= PIPE_QUERY_TYPE_U32) {
         ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      } else {
         ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      }

      /* Make sure the result lands before they bind the QBO elsewhere
       * and use the result.
       */
      // XXX: Why?  i965 doesn't do this.
      iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
      return;
   }

   /* Calculate the result to CS_GPR0 */
   calculate_result_on_gpu(ice, q);

   bool predicated = !wait && !q->stalled;

   if (predicated) {
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
                                    snapshots_landed_offset);
      uint32_t predicate = MI_PREDICATE |
                           MI_PREDICATE_LOADOP_LOADINV |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
      iris_batch_emit(batch, &predicate, sizeof(uint32_t));
   }

   if (result_type <= PIPE_QUERY_TYPE_U32) {
      ice->vtbl.store_register_mem32(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   } else {
      ice->vtbl.store_register_mem64(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   }
}

static void
iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
{
   struct iris_context *ice = (void *) ctx;

   if (ice->state.statistics_counters_enabled == enable)
      return;

   // XXX: most packets aren't paying attention to this yet, because it'd
   // have to be done dynamically at draw time, which is a pain
   ice->state.statistics_counters_enabled = enable;
   ice->state.dirty |= IRIS_DIRTY_CLIP |
                       IRIS_DIRTY_GS |
                       IRIS_DIRTY_RASTER |
                       IRIS_DIRTY_STREAMOUT |
                       IRIS_DIRTY_TCS |
                       IRIS_DIRTY_TES |
                       IRIS_DIRTY_VS |
                       IRIS_DIRTY_WM;
}

static void
set_predicate_enable(struct iris_context *ice, bool value)
{
   if (value)
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
   else
      ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
}

static void
set_predicate_for_result(struct iris_context *ice,
                         struct iris_query *q,
                         bool inverted)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   /* The CPU doesn't have the query result yet; use hardware predication */
   ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;

   /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
   iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
   q->stalled = true;

   switch (q->type) {
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      overflow_result_to_gpr0(ice, q);

      ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      break;
   default:
      /* PIPE_QUERY_OCCLUSION_* */
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
                                    offsetof(struct iris_query_snapshots, start));
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
                                    offsetof(struct iris_query_snapshots, end));
      break;
   }

   uint32_t mi_predicate = MI_PREDICATE |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
                           (inverted ? MI_PREDICATE_LOADOP_LOAD
                                     : MI_PREDICATE_LOADOP_LOADINV);
   iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));

   /* We immediately set the predicate on the render batch, as all the
    * counters come from 3D operations.  However, we may need to predicate
    * a compute dispatch, which executes in a different GEM context and has
    * a different MI_PREDICATE_DATA register.  So, we save the result to
    * memory and reload it in iris_launch_grid.
    */
   unsigned offset = offsetof(struct iris_query_snapshots, predicate_data);
   ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
                                  q->bo, offset, false);
   ice->state.compute_predicate = q->bo;
}

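/* A note on the predicate sources chosen above: for the overflow predicates,
 * GPR0 ends up holding 0 or 1 and is compared against an immediate 0 in
 * MI_PREDICATE_SRC1, while for the occlusion queries the raw start/end
 * snapshots are compared directly, since start == end means no samples
 * passed between them.  The LOADOP_LOAD vs. LOADOP_LOADINV choice folds the
 * `inverted` flag into the predicate instead of requiring extra ALU math.
 */
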
static void
iris_render_condition(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean condition,
                      enum pipe_render_cond_flag mode)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   if (!q) {
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
      return;
   }

   iris_check_query_no_flush(ice, q);

   if (q->result || q->ready) {
      set_predicate_enable(ice, (q->result != 0) ^ condition);
   } else {
      if (mode == PIPE_RENDER_COND_NO_WAIT ||
          mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
         perf_debug(&ice->dbg, "Conditional rendering demoted from "
                    "\"no wait\" to \"wait\".");
      }
      set_predicate_for_result(ice, q, condition);
   }
}

void
iris_init_query_functions(struct pipe_context *ctx)
{
   ctx->create_query = iris_create_query;
   ctx->destroy_query = iris_destroy_query;
   ctx->begin_query = iris_begin_query;
   ctx->end_query = iris_end_query;
   ctx->get_query_result = iris_get_query_result;
   ctx->get_query_result_resource = iris_get_query_result_resource;
   ctx->set_active_query_state = iris_set_active_query_state;
   ctx->render_condition = iris_render_condition;
}