iris: Fix Broadwell WaDividePSInvocationCountBy4
[mesa.git] src/gallium/drivers/iris/iris_query.c
1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_query.c
25 *
26 * Query object support. This allows measuring various simple statistics
27 * via counters on the GPU.
28 */
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "iris_context.h"
39 #include "iris_defines.h"
40 #include "iris_resource.h"
41 #include "iris_screen.h"
42 #include "vulkan/util/vk_util.h"
43
44 #define IA_VERTICES_COUNT 0x2310
45 #define IA_PRIMITIVES_COUNT 0x2318
46 #define VS_INVOCATION_COUNT 0x2320
47 #define HS_INVOCATION_COUNT 0x2300
48 #define DS_INVOCATION_COUNT 0x2308
49 #define GS_INVOCATION_COUNT 0x2328
50 #define GS_PRIMITIVES_COUNT 0x2330
51 #define CL_INVOCATION_COUNT 0x2338
52 #define CL_PRIMITIVES_COUNT 0x2340
53 #define PS_INVOCATION_COUNT 0x2348
54 #define CS_INVOCATION_COUNT 0x2290
55 #define PS_DEPTH_COUNT 0x2350
56
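/* Per-stream transform feedback statistics registers.  Each stream <n> has
 * a 64-bit counter of primitives that needed storage and a 64-bit counter
 * of primitives actually written, with consecutive streams 8 bytes apart.
 */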
57 #define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
58
59 #define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
60
61 #define MI_MATH (0x1a << 23)
62
63 #define MI_ALU_LOAD 0x080
64 #define MI_ALU_LOADINV 0x480
65 #define MI_ALU_LOAD0 0x081
66 #define MI_ALU_LOAD1 0x481
67 #define MI_ALU_ADD 0x100
68 #define MI_ALU_SUB 0x101
69 #define MI_ALU_AND 0x102
70 #define MI_ALU_OR 0x103
71 #define MI_ALU_XOR 0x104
72 #define MI_ALU_STORE 0x180
73 #define MI_ALU_STOREINV 0x580
74
75 #define MI_ALU_R0 0x00
76 #define MI_ALU_R1 0x01
77 #define MI_ALU_R2 0x02
78 #define MI_ALU_R3 0x03
79 #define MI_ALU_R4 0x04
80 #define MI_ALU_SRCA 0x20
81 #define MI_ALU_SRCB 0x21
82 #define MI_ALU_ACCU 0x31
83 #define MI_ALU_ZF 0x32
84 #define MI_ALU_CF 0x33
85
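/* Each MI_MATH ALU dword packs an opcode in bits 31:20 and two operand
 * encodings in bits 19:10 and 9:0 (only LOAD/STORE-style opcodes use the
 * operand fields; arithmetic opcodes such as ADD leave them zero).
 */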
86 #define _MI_ALU(op, x, y) (((op) << 20) | ((x) << 10) | (y))
87
88 #define _MI_ALU0(op) _MI_ALU(MI_ALU_##op, 0, 0)
89 #define _MI_ALU1(op, x) _MI_ALU(MI_ALU_##op, x, 0)
90 #define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)
91
92 #define MI_ALU0(op) _MI_ALU0(op)
93 #define MI_ALU1(op, x) _MI_ALU1(op, MI_ALU_##x)
94 #define MI_ALU2(op, x, y) _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
95
96 #define emit_lri32 ice->vtbl.load_register_imm32
97 #define emit_lri64 ice->vtbl.load_register_imm64
98 #define emit_lrr32 ice->vtbl.load_register_reg32
99
100 struct iris_query {
101 enum pipe_query_type type;
102 int index;
103
104 bool ready;
105
106 bool stalled;
107
108 uint64_t result;
109
110 struct iris_bo *bo;
111 struct iris_query_snapshots *map;
112
113 int batch_idx;
114 };
115
116 struct iris_query_snapshots {
117 /** iris_render_condition's saved MI_PREDICATE_DATA value. */
118 uint64_t predicate_data;
119
120 /** Have the start/end snapshots landed? */
121 uint64_t snapshots_landed;
122
123 /** Starting and ending counter snapshots */
124 uint64_t start;
125 uint64_t end;
126 };
127
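/* Snapshot layout for SO overflow queries.  The leading predicate_data and
 * snapshots_landed fields must stay at the same offsets as in struct
 * iris_query_snapshots, since both layouts share the same query BO mapping
 * (mark_available(), for example, uses the iris_query_snapshots offsets for
 * all query types).
 */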
128 struct iris_query_so_overflow {
129 uint64_t predicate_data;
130 uint64_t snapshots_landed;
131
132 struct {
133 uint64_t prim_storage_needed[2];
134 uint64_t num_prims[2];
135 } stream[4];
136 };
137
138 /**
139 * Is this type of query written by PIPE_CONTROL?
140 */
141 static bool
142 iris_is_query_pipelined(struct iris_query *q)
143 {
144 switch (q->type) {
145 case PIPE_QUERY_OCCLUSION_COUNTER:
146 case PIPE_QUERY_OCCLUSION_PREDICATE:
147 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
148 case PIPE_QUERY_TIMESTAMP:
149 case PIPE_QUERY_TIMESTAMP_DISJOINT:
150 case PIPE_QUERY_TIME_ELAPSED:
151 return true;
152
153 default:
154 return false;
155 }
156 }
157
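/**
 * Record that a query's snapshots have landed by writing the
 * snapshots_landed flag from the GPU, so the CPU can poll for availability
 * without stalling.
 */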
158 static void
159 mark_available(struct iris_context *ice, struct iris_query *q)
160 {
161 struct iris_batch *batch = &ice->batches[q->batch_idx];
162 unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
163 unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
164
165 if (!iris_is_query_pipelined(q)) {
166 ice->vtbl.store_data_imm64(batch, q->bo, offset, true);
167 } else {
168 /* Order available *after* the query results. */
169 flags |= PIPE_CONTROL_FLUSH_ENABLE;
170 iris_emit_pipe_control_write(batch, flags, q->bo, offset, true);
171 }
172 }
173
174 /**
175 * Write a pipelined snapshot (PS_DEPTH_COUNT or TIMESTAMP) to q->bo at the given offset via a PIPE_CONTROL.
176 */
177 static void
178 iris_pipelined_write(struct iris_batch *batch,
179 struct iris_query *q,
180 enum pipe_control_flags flags,
181 unsigned offset)
182 {
183 const struct gen_device_info *devinfo = &batch->screen->devinfo;
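/* Gen9 GT4 appears to want an extra CS stall around these pipelined
 * post-sync writes; treat this as a hardware workaround.
 */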
184 const unsigned optional_cs_stall =
185 devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
186
187 iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
188 q->bo, offset, 0ull);
189 }
190
191 static void
192 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
193 {
194 struct iris_batch *batch = &ice->batches[q->batch_idx];
195 const struct gen_device_info *devinfo = &batch->screen->devinfo;
196
197 if (!iris_is_query_pipelined(q)) {
198 iris_emit_pipe_control_flush(batch,
199 PIPE_CONTROL_CS_STALL |
200 PIPE_CONTROL_STALL_AT_SCOREBOARD);
201 q->stalled = true;
202 }
203
204 switch (q->type) {
205 case PIPE_QUERY_OCCLUSION_COUNTER:
206 case PIPE_QUERY_OCCLUSION_PREDICATE:
207 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
208 if (devinfo->gen >= 10) {
209 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
210 * bit set prior to programming a PIPE_CONTROL with Write PS Depth
211 * Count sync operation."
212 */
213 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
214 }
215 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
216 PIPE_CONTROL_WRITE_DEPTH_COUNT |
217 PIPE_CONTROL_DEPTH_STALL,
218 offset);
219 break;
220 case PIPE_QUERY_TIME_ELAPSED:
221 case PIPE_QUERY_TIMESTAMP:
222 case PIPE_QUERY_TIMESTAMP_DISJOINT:
223 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
224 PIPE_CONTROL_WRITE_TIMESTAMP,
225 offset);
226 break;
227 case PIPE_QUERY_PRIMITIVES_GENERATED:
228 ice->vtbl.store_register_mem64(batch,
229 q->index == 0 ? CL_INVOCATION_COUNT :
230 SO_PRIM_STORAGE_NEEDED(q->index),
231 q->bo, offset, false);
232 break;
233 case PIPE_QUERY_PRIMITIVES_EMITTED:
234 ice->vtbl.store_register_mem64(batch,
235 SO_NUM_PRIMS_WRITTEN(q->index),
236 q->bo, offset, false);
237 break;
238 case PIPE_QUERY_PIPELINE_STATISTICS: {
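/* q->index is a PIPE_STAT_QUERY_* value; map it to the matching MMIO
 * statistics counter register.
 */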
239 static const uint32_t index_to_reg[] = {
240 IA_VERTICES_COUNT,
241 IA_PRIMITIVES_COUNT,
242 VS_INVOCATION_COUNT,
243 GS_INVOCATION_COUNT,
244 GS_PRIMITIVES_COUNT,
245 CL_INVOCATION_COUNT,
246 CL_PRIMITIVES_COUNT,
247 PS_INVOCATION_COUNT,
248 HS_INVOCATION_COUNT,
249 DS_INVOCATION_COUNT,
250 CS_INVOCATION_COUNT,
251 };
252 const uint32_t reg = index_to_reg[q->index];
253
254 ice->vtbl.store_register_mem64(batch, reg, q->bo, offset, false);
255 break;
256 }
257 default:
258 assert(false);
259 }
260 }
261
262 static void
263 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
264 {
265 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
266 uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
267
268 iris_emit_pipe_control_flush(batch,
269 PIPE_CONTROL_CS_STALL |
270 PIPE_CONTROL_STALL_AT_SCOREBOARD);
271 for (uint32_t i = 0; i < count; i++) {
272 int s = q->index + i;
273 int g_idx = offsetof(struct iris_query_so_overflow,
274 stream[s].num_prims[end]);
275 int w_idx = offsetof(struct iris_query_so_overflow,
276 stream[s].prim_storage_needed[end]);
277 ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
278 q->bo, g_idx, false);
279 ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
280 q->bo, w_idx, false);
281 }
282 }
283
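/**
 * Convert a raw GPU timestamp (in clock ticks) to nanoseconds.
 *
 * For example, with a hypothetical timestamp_frequency of 12,500,000 Hz
 * (80 ns per tick), a delta of 1,250,000 ticks scales to 100,000,000 ns.
 */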
284 uint64_t
285 iris_timebase_scale(const struct gen_device_info *devinfo,
286 uint64_t gpu_timestamp)
287 {
288 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
289 }
290
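/**
 * Return the number of ticks between two raw timestamps, accounting for the
 * counter being only TIMESTAMP_BITS wide: if the end value is smaller than
 * the start value, the counter wrapped, so add back 2^TIMESTAMP_BITS.
 */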
291 static uint64_t
292 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
293 {
294 if (time0 > time1) {
295 return (1ULL << TIMESTAMP_BITS) + time1 - time0;
296 } else {
297 return time1 - time0;
298 }
299 }
300
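/**
 * Did transform feedback overflow on stream s?  It did if the number of
 * primitives that needed storage and the number of primitives actually
 * written diverged between the start and end snapshots, i.e. some
 * primitives were dropped because the SO buffer ran out of space.
 */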
301 static bool
302 stream_overflowed(struct iris_query_so_overflow *so, int s)
303 {
304 return (so->stream[s].prim_storage_needed[1] -
305 so->stream[s].prim_storage_needed[0]) !=
306 (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
307 }
308
309 static void
310 calculate_result_on_cpu(const struct gen_device_info *devinfo,
311 struct iris_query *q)
312 {
313 switch (q->type) {
314 case PIPE_QUERY_OCCLUSION_PREDICATE:
315 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
316 q->result = q->map->end != q->map->start;
317 break;
318 case PIPE_QUERY_TIMESTAMP:
319 case PIPE_QUERY_TIMESTAMP_DISJOINT:
320 /* The timestamp is the single starting snapshot. */
321 q->result = iris_timebase_scale(devinfo, q->map->start);
322 q->result &= (1ull << TIMESTAMP_BITS) - 1;
323 break;
324 case PIPE_QUERY_TIME_ELAPSED:
325 q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
326 q->result = iris_timebase_scale(devinfo, q->result);
327 q->result &= (1ull << TIMESTAMP_BITS) - 1;
328 break;
329 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
330 q->result = stream_overflowed((void *) q->map, q->index);
331 break;
332 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
333 q->result = false;
334 for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
335 q->result |= stream_overflowed((void *) q->map, i);
336 break;
337 case PIPE_QUERY_PIPELINE_STATISTICS:
338 q->result = q->map->end - q->map->start;
339
340 /* WaDividePSInvocationCountBy4:HSW,BDW */
341 if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
342 q->result /= 4;
343 break;
344 case PIPE_QUERY_OCCLUSION_COUNTER:
345 case PIPE_QUERY_PRIMITIVES_GENERATED:
346 case PIPE_QUERY_PRIMITIVES_EMITTED:
347 default:
348 q->result = q->map->end - q->map->start;
349 break;
350 }
351
352 q->ready = true;
353 }
354
355 static void
356 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
357 unsigned reg_a, unsigned reg_b)
358 {
359 uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
360
361 math[0] = MI_MATH | (5 - 2);
362 math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
363 math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
364 math[3] = _MI_ALU0(ADD);
365 math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
366 }
367
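/**
 * Shift src_reg left by <shift> bits into dst_reg using repeated doubling:
 * each iteration is a 4-dword LOAD/LOAD/ADD/STORE group that adds the value
 * to itself, so the MI_MATH packet is 1 + 4 * shift dwords long.
 */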
368 static void
369 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
370 unsigned src_reg, unsigned shift)
371 {
372 assert(shift > 0);
373
374 int dwords = 1 + 4 * shift;
375
376 uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
377
378 math[0] = MI_MATH | ((1 + 4 * shift) - 2);
379
380 for (unsigned i = 0; i < shift; i++) {
381 unsigned add_src = (i == 0) ? src_reg : dst_reg;
382 math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
383 math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
384 math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
385 math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
386 }
387 }
388
389 /* Emit dwords to multiply GPR0 by N */
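/* The multiply walks the binary expansion of N below its top set bit,
 * doubling the running value each step and adding the original GPR0 value
 * whenever the bit is set.  For example, N = 6 (0b110): i = 1 doubles R0,
 * adds R0 (bit set), and stores 3*R0 to R1; i = 0 doubles R1 and stores
 * 6*R0 back to R0.
 */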
390 static void
391 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
392 {
393 VK_OUTARRAY_MAKE(out, dw, dw_count);
394
395 #define APPEND_ALU(op, x, y) \
396 vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
397
398 assert(N > 0);
399 unsigned top_bit = 31 - __builtin_clz(N);
400 for (int i = top_bit - 1; i >= 0; i--) {
401 /* We get our initial data in GPR0, write the final data out to
402 * GPR0, and use GPR1 as a scratch register.
403 */
404 unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
405 unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
406
407 /* Shift the current value left by 1 */
408 APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
409 APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
410 APPEND_ALU(ADD, 0, 0);
411
412 if (N & (1 << i)) {
413 /* Store ACCU to R1 and add R0 to R1 */
414 APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
415 APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
416 APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
417 APPEND_ALU(ADD, 0, 0);
418 }
419
420 APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
421 }
422
423 #undef APPEND_ALU
424 }
425
426 static void
427 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
428 {
429 uint32_t num_dwords;
430 build_alu_multiply_gpr0(NULL, &num_dwords, N);
431
432 uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
433 math[0] = MI_MATH | (num_dwords - 2);
434 build_alu_multiply_gpr0(&math[1], &num_dwords, N);
435 }
436
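/**
 * Emit commands so that GPR0 ends up holding GPR0 / D (unsigned).
 *
 * Powers of two reduce to a shift.  Arbitrary divisors use the
 * multiply-by-reciprocal scheme from util_compute_fast_udiv_info(): an
 * optional pre-shift, a 32x32 multiply plus an optional increment, then the
 * top 32 bits of the product and an optional post-shift.
 */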
437 void
438 iris_math_div32_gpr0(struct iris_context *ice,
439 struct iris_batch *batch,
440 uint32_t D)
441 {
442 /* Zero out the top of GPR0 */
443 emit_lri32(batch, CS_GPR(0) + 4, 0);
444
445 if (D == 0) {
446 /* Division by zero is invalid, but do something deterministic: set GPR0 to 0. */
447 emit_lri32(batch, CS_GPR(0), 0);
448 } else if (util_is_power_of_two_or_zero(D)) {
449 unsigned log2_D = util_logbase2(D);
450 assert(log2_D < 32);
451 /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
452 * the top 32 bits of the result.
453 */
454 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
455 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
456 emit_lri32(batch, CS_GPR(0) + 4, 0);
457 } else {
458 struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
459 assert(m.multiplier <= UINT32_MAX);
460
461 if (m.pre_shift) {
462 /* We right-shift by pre_shift by left-shifting by 32 - pre_shift and taking the top
463 * 32 bits of the result.
464 */
465 if (m.pre_shift < 32)
466 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
467 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
468 emit_lri32(batch, CS_GPR(0) + 4, 0);
469 }
470
471 /* Do the 32x32 multiply into gpr0 */
472 emit_mul_gpr0(batch, m.multiplier);
473
474 if (m.increment) {
475 /* Account for the increment: (x + 1) * m == x * m + m, so add the multiplier to the product already in GPR0. */
476 emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
477 emit_lri32(batch, CS_GPR(1) + 4, 0);
478 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
479 }
480
481 /* Shift by 32 */
482 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
483 emit_lri32(batch, CS_GPR(0) + 4, 0);
484
485 if (m.post_shift) {
486 /* We right-shift by post_shift by left-shifting by 32 - post_shift and taking the top
487 * 32 bits of the result.
488 */
489 if (m.post_shift < 32)
490 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
491 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
492 emit_lri32(batch, CS_GPR(0) + 4, 0);
493 }
494 }
495 }
496
497 /*
498 * GPR0 = (GPR0 == 0) ? 0 : 1;
499 */
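/* Adding zero to GPR0 sets the ALU zero flag; storing the inverted ZF into
 * GPR0 and ANDing with GPR1 (preloaded with 1) masks that down to a clean
 * 0 or 1 value.
 */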
500 static void
501 gpr0_to_bool(struct iris_context *ice)
502 {
503 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
504
505 ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
506
507 static const uint32_t math[] = {
508 MI_MATH | (9 - 2),
509 MI_ALU2(LOAD, SRCA, R0),
510 MI_ALU1(LOAD0, SRCB),
511 MI_ALU0(ADD),
512 MI_ALU2(STOREINV, R0, ZF),
513 MI_ALU2(LOAD, SRCA, R0),
514 MI_ALU2(LOAD, SRCB, R1),
515 MI_ALU0(AND),
516 MI_ALU2(STORE, R0, ACCU),
517 };
518 iris_batch_emit(batch, math, sizeof(math));
519 }
520
521 static void
522 load_overflow_data_to_cs_gprs(struct iris_context *ice,
523 struct iris_query *q,
524 int idx)
525 {
526 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
527
528 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
529 offsetof(struct iris_query_so_overflow,
530 stream[idx].prim_storage_needed[0]));
531 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
532 offsetof(struct iris_query_so_overflow,
533 stream[idx].prim_storage_needed[1]));
534
535 ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
536 offsetof(struct iris_query_so_overflow,
537 stream[idx].num_prims[0]));
538 ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
539 offsetof(struct iris_query_so_overflow,
540 stream[idx].num_prims[1]));
541 }
542
543 /*
544 * R3 = R4 - R3;
545 * R1 = R2 - R1;
546 * R1 = R3 - R1;
547 * R0 = R0 | R1;
548 */
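/* In other words: compute (num_prims delta) - (prim_storage_needed delta)
 * for this stream and OR it into the running result in R0; any non-zero
 * value means the stream overflowed.
 */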
549 static void
550 calc_overflow_for_stream(struct iris_context *ice)
551 {
552 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
553 static const uint32_t maths[] = {
554 MI_MATH | (17 - 2),
555 MI_ALU2(LOAD, SRCA, R4),
556 MI_ALU2(LOAD, SRCB, R3),
557 MI_ALU0(SUB),
558 MI_ALU2(STORE, R3, ACCU),
559 MI_ALU2(LOAD, SRCA, R2),
560 MI_ALU2(LOAD, SRCB, R1),
561 MI_ALU0(SUB),
562 MI_ALU2(STORE, R1, ACCU),
563 MI_ALU2(LOAD, SRCA, R3),
564 MI_ALU2(LOAD, SRCB, R1),
565 MI_ALU0(SUB),
566 MI_ALU2(STORE, R1, ACCU),
567 MI_ALU2(LOAD, SRCA, R1),
568 MI_ALU2(LOAD, SRCB, R0),
569 MI_ALU0(OR),
570 MI_ALU2(STORE, R0, ACCU),
571 };
572
573 iris_batch_emit(batch, maths, sizeof(maths));
574 }
575
576 static void
577 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
578 {
579 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
580
581 ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
582
583 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
584 load_overflow_data_to_cs_gprs(ice, q, q->index);
585 calc_overflow_for_stream(ice);
586 } else {
587 for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
588 load_overflow_data_to_cs_gprs(ice, q, i);
589 calc_overflow_for_stream(ice);
590 }
591 }
592
593 gpr0_to_bool(ice);
594 }
595
596 /*
597 * GPR0 = GPR0 & ((1ull << n) -1);
598 */
599 static void
600 keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
601 {
602 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
603
604 ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
605 static const uint32_t math[] = {
606 MI_MATH | (5 - 2),
607 MI_ALU2(LOAD, SRCA, R0),
608 MI_ALU2(LOAD, SRCB, R1),
609 MI_ALU0(AND),
610 MI_ALU2(STORE, R0, ACCU),
611 };
612 iris_batch_emit(batch, math, sizeof(math));
613 }
614
615 /*
616 * GPR0 = GPR0 << 30;
617 */
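/* The shift is implemented as 30 successive doublings, emitted as 5 MI_MATH
 * packets of 6 add sequences each.  GPR0 is first masked to its low 34 bits
 * so the shifted result still fits in 64 bits.
 */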
618 static void
619 shl_gpr0_by_30_bits(struct iris_context *ice)
620 {
621 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
622 /* First, keep only the low 34 bits of GPR0 so the 30-bit shift cannot overflow 64 bits. */
623 keep_gpr0_lower_n_bits(ice, 34);
624
625 static const uint32_t shl_math[] = {
626 MI_ALU2(LOAD, SRCA, R0),
627 MI_ALU2(LOAD, SRCB, R0),
628 MI_ALU0(ADD),
629 MI_ALU2(STORE, R0, ACCU),
630 };
631
632 const uint32_t outer_count = 5;
633 const uint32_t inner_count = 6;
634 const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
635 const uint32_t batch_len = cmd_len * outer_count;
636 uint32_t *map = iris_get_command_space(batch, batch_len * 4);
637 uint32_t offset = 0;
638 for (int o = 0; o < outer_count; o++) {
639 map[offset++] = MI_MATH | (cmd_len - 2);
640 for (int i = 0; i < inner_count; i++) {
641 memcpy(&map[offset], shl_math, sizeof(shl_math));
642 offset += 4;
643 }
644 }
645 }
646
647 /*
648 * GPR0 = GPR0 >> 2;
649 *
650 * Note that the upper 30 bits of GPR0 are lost!
651 */
652 static void
653 shr_gpr0_by_2_bits(struct iris_context *ice)
654 {
655 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
656 shl_gpr0_by_30_bits(ice);
657 ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
658 ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
659 }
660
661 /**
662 * Calculate the result and store it to CS_GPR0.
663 */
664 static void
665 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
666 {
667 struct iris_batch *batch = &ice->batches[q->batch_idx];
668 struct iris_screen *screen = (void *) ice->ctx.screen;
669 const struct gen_device_info *devinfo = &batch->screen->devinfo;
670
671 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
672 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
673 overflow_result_to_gpr0(ice, q);
674 return;
675 }
676
677 if (q->type == PIPE_QUERY_TIMESTAMP) {
678 ice->vtbl.load_register_mem64(batch, CS_GPR(0), q->bo,
679 offsetof(struct iris_query_snapshots, start));
680 /* TODO: This discards any fractional bits of the timebase scale.
681 * We would need to do a bit of fixed point math on the CS ALU, or
682 * launch an actual shader to calculate this with full precision.
683 */
684 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
685 keep_gpr0_lower_n_bits(ice, 36);
686 return;
687 }
688
689 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
690 offsetof(struct iris_query_snapshots, start));
691 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
692 offsetof(struct iris_query_snapshots, end));
693
694 static const uint32_t math[] = {
695 MI_MATH | (5 - 2),
696 MI_ALU2(LOAD, SRCA, R2),
697 MI_ALU2(LOAD, SRCB, R1),
698 MI_ALU0(SUB),
699 MI_ALU2(STORE, R0, ACCU),
700 };
701 iris_batch_emit(batch, math, sizeof(math));
702
703 /* WaDividePSInvocationCountBy4:HSW,BDW */
704 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS && devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
705 shr_gpr0_by_2_bits(ice);
706
707 if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
708 q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
709 gpr0_to_bool(ice);
710
711 if (q->type == PIPE_QUERY_TIME_ELAPSED) {
712 /* TODO: This discards fractional bits (see above). */
713 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
714 }
715 }
716
717 static struct pipe_query *
718 iris_create_query(struct pipe_context *ctx,
719 unsigned query_type,
720 unsigned index)
721 {
722 struct iris_query *q = calloc(1, sizeof(struct iris_query));
723
724 q->type = query_type;
725 q->index = index;
726
727 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS && q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
728 q->batch_idx = IRIS_BATCH_COMPUTE;
729 else
730 q->batch_idx = IRIS_BATCH_RENDER;
731 return (struct pipe_query *) q;
732 }
733
734 static void
735 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
736 {
737 struct iris_query *query = (void *) p_query;
738 iris_bo_unreference(query->bo);
739 free(query);
740 }
741
742
743 static boolean
744 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
745 {
746 struct iris_screen *screen = (void *) ctx->screen;
747 struct iris_context *ice = (void *) ctx;
748 struct iris_query *q = (void *) query;
749
750 iris_bo_unreference(q->bo);
751 q->bo = iris_bo_alloc(screen->bufmgr, "query object", 4096,
752 IRIS_MEMZONE_OTHER);
753 if (!q->bo)
754 return false;
755
756 q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_WRITE | MAP_ASYNC);
757 if (!q->map)
758 return false;
759
760 q->result = 0ull;
761 q->ready = false;
762 q->map->snapshots_landed = false;
763
764 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
765 ice->state.prims_generated_query_active = true;
766 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
767 }
768
769 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
770 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
771 write_overflow_values(ice, q, false);
772 else
773 write_value(ice, q, offsetof(struct iris_query_snapshots, start));
774
775 return true;
776 }
777
778 static bool
779 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
780 {
781 struct iris_context *ice = (void *) ctx;
782 struct iris_query *q = (void *) query;
783
784 if (q->type == PIPE_QUERY_TIMESTAMP) {
785 iris_begin_query(ctx, query);
786 mark_available(ice, q);
787 return true;
788 }
789
790 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
791 ice->state.prims_generated_query_active = false;
792 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
793 }
794
795 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
796 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
797 write_overflow_values(ice, q, true);
798 else
799 write_value(ice, q, offsetof(struct iris_query_snapshots, end));
800 mark_available(ice, q);
801
802 return true;
803 }
804
805 /**
806 * See if the snapshots have landed for a query, and if so, compute the
807 * result and mark it ready. Does not flush (unlike iris_get_query_result).
808 */
809 static void
810 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
811 {
812 struct iris_screen *screen = (void *) ice->ctx.screen;
813 const struct gen_device_info *devinfo = &screen->devinfo;
814
815 if (!q->ready && q->map->snapshots_landed) {
816 calculate_result_on_cpu(devinfo, q);
817 }
818 }
819
820 static boolean
821 iris_get_query_result(struct pipe_context *ctx,
822 struct pipe_query *query,
823 boolean wait,
824 union pipe_query_result *result)
825 {
826 struct iris_context *ice = (void *) ctx;
827 struct iris_query *q = (void *) query;
828 struct iris_screen *screen = (void *) ctx->screen;
829 const struct gen_device_info *devinfo = &screen->devinfo;
830
831 if (!q->ready) {
832 if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
833 iris_batch_flush(&ice->batches[q->batch_idx]);
834
835 if (!q->map->snapshots_landed) {
836 if (wait)
837 iris_bo_wait_rendering(q->bo);
838 else
839 return false;
840 }
841
842 assert(q->map->snapshots_landed);
843 calculate_result_on_cpu(devinfo, q);
844 }
845
846 assert(q->ready);
847
848 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS) {
849 switch (q->index) {
850 case 0:
851 result->pipeline_statistics.ia_vertices = q->result;
852 break;
853 case 1:
854 result->pipeline_statistics.ia_primitives = q->result;
855 break;
856 case 2:
857 result->pipeline_statistics.vs_invocations = q->result;
858 break;
859 case 3:
860 result->pipeline_statistics.gs_invocations = q->result;
861 break;
862 case 4:
863 result->pipeline_statistics.gs_primitives = q->result;
864 break;
865 case 5:
866 result->pipeline_statistics.c_invocations = q->result;
867 break;
868 case 6:
869 result->pipeline_statistics.c_primitives = q->result;
870 break;
871 case 7:
872 result->pipeline_statistics.ps_invocations = q->result;
873 break;
874 case 8:
875 result->pipeline_statistics.hs_invocations = q->result;
876 break;
877 case 9:
878 result->pipeline_statistics.ds_invocations = q->result;
879 break;
880 case 10:
881 result->pipeline_statistics.cs_invocations = q->result;
882 break;
883 }
884 } else {
885 result->u64 = q->result;
886 }
887
888 return true;
889 }
890
891 static void
892 iris_get_query_result_resource(struct pipe_context *ctx,
893 struct pipe_query *query,
894 boolean wait,
895 enum pipe_query_value_type result_type,
896 int index,
897 struct pipe_resource *p_res,
898 unsigned offset)
899 {
900 struct iris_context *ice = (void *) ctx;
901 struct iris_query *q = (void *) query;
902 struct iris_batch *batch = &ice->batches[q->batch_idx];
903 const struct gen_device_info *devinfo = &batch->screen->devinfo;
904 struct iris_resource *res = (void *) p_res;
905 unsigned snapshots_landed_offset =
906 offsetof(struct iris_query_snapshots, snapshots_landed);
907
908 res->bind_history |= PIPE_BIND_QUERY_BUFFER;
909
910 if (index == -1) {
911 /* They're asking for the availability of the result. If we still
912 * have commands queued up which produce the result, submit them
913 * now so that progress happens. Either way, copy the snapshots
914 * landed field to the destination resource.
915 */
916 if (iris_batch_references(batch, q->bo))
917 iris_batch_flush(batch);
918
919 ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
920 q->bo, snapshots_landed_offset,
921 result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
922 return;
923 }
924
925 if (!q->ready && q->map->snapshots_landed) {
926 /* The final snapshots happen to have landed, so let's just compute
927 * the result on the CPU now...
928 */
929 calculate_result_on_cpu(devinfo, q);
930 }
931
932 if (q->ready) {
933 /* We happen to have the result on the CPU, so just copy it. */
934 if (result_type <= PIPE_QUERY_TYPE_U32) {
935 ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
936 q->result);
937 } else {
938 ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
939 q->result);
940 }
941
942 /* Make sure the result lands before they bind the QBO elsewhere
943 * and use the result.
944 */
945 // XXX: Why? i965 doesn't do this.
946 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
947 return;
948 }
949
950 /* Calculate the result to CS_GPR0 */
951 calculate_result_on_gpu(ice, q);
952
953 bool predicated = !wait && !q->stalled;
954
955 if (predicated) {
956 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
957 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
958 snapshots_landed_offset);
959 uint32_t predicate = MI_PREDICATE |
960 MI_PREDICATE_LOADOP_LOADINV |
961 MI_PREDICATE_COMBINEOP_SET |
962 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
963 iris_batch_emit(batch, &predicate, sizeof(uint32_t));
964 }
965
966 if (result_type <= PIPE_QUERY_TYPE_U32) {
967 ice->vtbl.store_register_mem32(batch, CS_GPR(0),
968 iris_resource_bo(p_res),
969 offset, predicated);
970 } else {
971 ice->vtbl.store_register_mem64(batch, CS_GPR(0),
972 iris_resource_bo(p_res),
973 offset, predicated);
974 }
975 }
976
977 static void
978 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
979 {
980 struct iris_context *ice = (void *) ctx;
981
982 if (ice->state.statistics_counters_enabled == enable)
983 return;
984
985 // XXX: most packets aren't paying attention to this yet, because it'd
986 // have to be done dynamically at draw time, which is a pain
987 ice->state.statistics_counters_enabled = enable;
988 ice->state.dirty |= IRIS_DIRTY_CLIP |
989 IRIS_DIRTY_GS |
990 IRIS_DIRTY_RASTER |
991 IRIS_DIRTY_STREAMOUT |
992 IRIS_DIRTY_TCS |
993 IRIS_DIRTY_TES |
994 IRIS_DIRTY_VS |
995 IRIS_DIRTY_WM;
996 }
997
998 static void
999 set_predicate_enable(struct iris_context *ice, bool value)
1000 {
1001 if (value)
1002 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1003 else
1004 ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
1005 }
1006
1007 static void
1008 set_predicate_for_result(struct iris_context *ice,
1009 struct iris_query *q,
1010 bool inverted)
1011 {
1012 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
1013
1014 /* The CPU doesn't have the query result yet; use hardware predication */
1015 ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
1016
1017 /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
1018 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
1019 q->stalled = true;
1020
1021 switch (q->type) {
1022 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1023 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1024 overflow_result_to_gpr0(ice, q);
1025
1026 ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
1027 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
1028 break;
1029 default:
1030 /* PIPE_QUERY_OCCLUSION_* */
1031 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
1032 offsetof(struct iris_query_snapshots, start));
1033 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
1034 offsetof(struct iris_query_snapshots, end));
1035 break;
1036 }
1037
1038 uint32_t mi_predicate = MI_PREDICATE |
1039 MI_PREDICATE_COMBINEOP_SET |
1040 MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
1041 (inverted ? MI_PREDICATE_LOADOP_LOAD
1042 : MI_PREDICATE_LOADOP_LOADINV);
1043 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
1044
1045 /* We immediately set the predicate on the render batch, as all the
1046 * counters come from 3D operations. However, we may need to predicate
1047 * a compute dispatch, which executes in a different GEM context and has
1048 * a different MI_PREDICATE_DATA register. So, we save the result to
1049 * memory and reload it in iris_launch_grid.
1050 */
1051 unsigned offset = offsetof(struct iris_query_snapshots, predicate_data);
1052 ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
1053 q->bo, offset, false);
1054 ice->state.compute_predicate = q->bo;
1055 }
1056
1057 static void
1058 iris_render_condition(struct pipe_context *ctx,
1059 struct pipe_query *query,
1060 boolean condition,
1061 enum pipe_render_cond_flag mode)
1062 {
1063 struct iris_context *ice = (void *) ctx;
1064 struct iris_query *q = (void *) query;
1065
1066 if (!q) {
1067 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1068 return;
1069 }
1070
1071 iris_check_query_no_flush(ice, q);
1072
1073 if (q->result || q->ready) {
1074 set_predicate_enable(ice, (q->result != 0) ^ condition);
1075 } else {
1076 if (mode == PIPE_RENDER_COND_NO_WAIT ||
1077 mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
1078 perf_debug(&ice->dbg, "Conditional rendering demoted from "
1079 "\"no wait\" to \"wait\".");
1080 }
1081 set_predicate_for_result(ice, q, condition);
1082 }
1083 }
1084
1085 void
1086 iris_init_query_functions(struct pipe_context *ctx)
1087 {
1088 ctx->create_query = iris_create_query;
1089 ctx->destroy_query = iris_destroy_query;
1090 ctx->begin_query = iris_begin_query;
1091 ctx->end_query = iris_end_query;
1092 ctx->get_query_result = iris_get_query_result;
1093 ctx->get_query_result_resource = iris_get_query_result_resource;
1094 ctx->set_active_query_state = iris_set_active_query_state;
1095 ctx->render_condition = iris_render_condition;
1096 }
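/*
 * For reference, a gallium frontend drives these hooks roughly as follows
 * (an illustrative sketch, not code from this driver):
 *
 *    struct pipe_query *q =
 *       ctx->create_query(ctx, PIPE_QUERY_PRIMITIVES_GENERATED, 0);
 *    ctx->begin_query(ctx, q);
 *    ...draw calls...
 *    ctx->end_query(ctx, q);
 *
 *    union pipe_query_result result;
 *    if (ctx->get_query_result(ctx, q, true, &result))
 *       printf("primitives generated: %" PRIu64 "\n", result.u64);
 *    ctx->destroy_query(ctx, q);
 */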