1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_query.c
25 *
26 * Query object support.  This covers occlusion, timestamp, streamout
27 * overflow, and pipeline statistics queries, measured via GPU counters.
28 */
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "util/u_upload_mgr.h"
39 #include "iris_context.h"
40 #include "iris_defines.h"
41 #include "iris_fence.h"
42 #include "iris_resource.h"
43 #include "iris_screen.h"
44 #include "vulkan/util/vk_util.h"
45
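/* MMIO offsets of the pipeline statistics and streamout counter registers
 * snapshotted below, plus the MI_MATH opcode/operand encodings used for the
 * command streamer ALU sequences in this file.
 */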
46 #define IA_VERTICES_COUNT 0x2310
47 #define IA_PRIMITIVES_COUNT 0x2318
48 #define VS_INVOCATION_COUNT 0x2320
49 #define HS_INVOCATION_COUNT 0x2300
50 #define DS_INVOCATION_COUNT 0x2308
51 #define GS_INVOCATION_COUNT 0x2328
52 #define GS_PRIMITIVES_COUNT 0x2330
53 #define CL_INVOCATION_COUNT 0x2338
54 #define CL_PRIMITIVES_COUNT 0x2340
55 #define PS_INVOCATION_COUNT 0x2348
56 #define CS_INVOCATION_COUNT 0x2290
57 #define PS_DEPTH_COUNT 0x2350
58
59 #define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
60
61 #define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
62
63 #define MI_MATH (0x1a << 23)
64
65 #define MI_ALU_LOAD 0x080
66 #define MI_ALU_LOADINV 0x480
67 #define MI_ALU_LOAD0 0x081
68 #define MI_ALU_LOAD1 0x481
69 #define MI_ALU_ADD 0x100
70 #define MI_ALU_SUB 0x101
71 #define MI_ALU_AND 0x102
72 #define MI_ALU_OR 0x103
73 #define MI_ALU_XOR 0x104
74 #define MI_ALU_STORE 0x180
75 #define MI_ALU_STOREINV 0x580
76
77 #define MI_ALU_R0 0x00
78 #define MI_ALU_R1 0x01
79 #define MI_ALU_R2 0x02
80 #define MI_ALU_R3 0x03
81 #define MI_ALU_R4 0x04
82 #define MI_ALU_SRCA 0x20
83 #define MI_ALU_SRCB 0x21
84 #define MI_ALU_ACCU 0x31
85 #define MI_ALU_ZF 0x32
86 #define MI_ALU_CF 0x33
87
88 #define _MI_ALU(op, x, y) (((op) << 20) | ((x) << 10) | (y))
89
90 #define _MI_ALU0(op) _MI_ALU(MI_ALU_##op, 0, 0)
91 #define _MI_ALU1(op, x) _MI_ALU(MI_ALU_##op, x, 0)
92 #define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)
93
94 #define MI_ALU0(op) _MI_ALU0(op)
95 #define MI_ALU1(op, x) _MI_ALU1(op, MI_ALU_##x)
96 #define MI_ALU2(op, x, y) _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
97
98 #define emit_lri32 ice->vtbl.load_register_imm32
99 #define emit_lri64 ice->vtbl.load_register_imm64
100 #define emit_lrr32 ice->vtbl.load_register_reg32
101
102 struct iris_query {
103 enum pipe_query_type type;
104 int index;
105
106 bool ready;
107
108 bool stalled;
109
110 uint64_t result;
111
112 struct iris_state_ref query_state_ref;
113 struct iris_query_snapshots *map;
114 struct iris_syncpt *syncpt;
115
116 int batch_idx;
117 };
118
119 struct iris_query_snapshots {
120 /** iris_render_condition's saved MI_PREDICATE_DATA value. */
121 uint64_t predicate_data;
122
123 /** Have the start/end snapshots landed? */
124 uint64_t snapshots_landed;
125
126 /** Starting and ending counter snapshots */
127 uint64_t start;
128 uint64_t end;
129 };
130
131 struct iris_query_so_overflow {
132 uint64_t predicate_data;
133 uint64_t snapshots_landed;
134
135 struct {
136 uint64_t prim_storage_needed[2];
137 uint64_t num_prims[2];
138 } stream[4];
139 };
140
141 /**
142 * Is this type of query written by PIPE_CONTROL?
143 */
144 static bool
145 iris_is_query_pipelined(struct iris_query *q)
146 {
147 switch (q->type) {
148 case PIPE_QUERY_OCCLUSION_COUNTER:
149 case PIPE_QUERY_OCCLUSION_PREDICATE:
150 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
151 case PIPE_QUERY_TIMESTAMP:
152 case PIPE_QUERY_TIMESTAMP_DISJOINT:
153 case PIPE_QUERY_TIME_ELAPSED:
154 return true;
155
156 default:
157 return false;
158 }
159 }
160
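/**
 * Record that a query's snapshots have landed by writing snapshots_landed,
 * either with an immediate store (for non-pipelined queries) or with a
 * post-sync PIPE_CONTROL write ordered after the pipelined snapshot writes.
 */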
161 static void
162 mark_available(struct iris_context *ice, struct iris_query *q)
163 {
164 struct iris_batch *batch = &ice->batches[q->batch_idx];
165 unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
166 unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
167 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
168 offset += q->query_state_ref.offset;
169
170 if (!iris_is_query_pipelined(q)) {
171 ice->vtbl.store_data_imm64(batch, bo, offset, true);
172 } else {
173 /* Order available *after* the query results. */
174 flags |= PIPE_CONTROL_FLUSH_ENABLE;
175 iris_emit_pipe_control_write(batch, flags, bo, offset, true);
176 }
177 }
178
179 /**
180  * Write a pipelined counter snapshot (e.g. PS_DEPTH_COUNT or a timestamp) to the query BO via PIPE_CONTROL.
181 */
182 static void
183 iris_pipelined_write(struct iris_batch *batch,
184 struct iris_query *q,
185 enum pipe_control_flags flags,
186 unsigned offset)
187 {
188 const struct gen_device_info *devinfo = &batch->screen->devinfo;
189 const unsigned optional_cs_stall =
190 devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
191 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
192
193 iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
194 bo, offset, 0ull);
195 }
196
197 static void
198 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
199 {
200 struct iris_batch *batch = &ice->batches[q->batch_idx];
201 const struct gen_device_info *devinfo = &batch->screen->devinfo;
202 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
203
204 if (!iris_is_query_pipelined(q)) {
205 iris_emit_pipe_control_flush(batch,
206 PIPE_CONTROL_CS_STALL |
207 PIPE_CONTROL_STALL_AT_SCOREBOARD);
208 q->stalled = true;
209 }
210
211 switch (q->type) {
212 case PIPE_QUERY_OCCLUSION_COUNTER:
213 case PIPE_QUERY_OCCLUSION_PREDICATE:
214 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
215 if (devinfo->gen >= 10) {
216 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
217 * bit set prior to programming a PIPE_CONTROL with Write PS Depth
218 * Count sync operation."
219 */
220 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
221 }
222 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
223 PIPE_CONTROL_WRITE_DEPTH_COUNT |
224 PIPE_CONTROL_DEPTH_STALL,
225 offset);
226 break;
227 case PIPE_QUERY_TIME_ELAPSED:
228 case PIPE_QUERY_TIMESTAMP:
229 case PIPE_QUERY_TIMESTAMP_DISJOINT:
230 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
231 PIPE_CONTROL_WRITE_TIMESTAMP,
232 offset);
233 break;
234 case PIPE_QUERY_PRIMITIVES_GENERATED:
235 ice->vtbl.store_register_mem64(batch,
236 q->index == 0 ? CL_INVOCATION_COUNT :
237 SO_PRIM_STORAGE_NEEDED(q->index),
238 bo, offset, false);
239 break;
240 case PIPE_QUERY_PRIMITIVES_EMITTED:
241 ice->vtbl.store_register_mem64(batch,
242 SO_NUM_PRIMS_WRITTEN(q->index),
243 bo, offset, false);
244 break;
245 case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
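      /* Indexed by q->index, which for pipeline-statistics queries is a
       * PIPE_STAT_QUERY_* value.
       */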
246 static const uint32_t index_to_reg[] = {
247 IA_VERTICES_COUNT,
248 IA_PRIMITIVES_COUNT,
249 VS_INVOCATION_COUNT,
250 GS_INVOCATION_COUNT,
251 GS_PRIMITIVES_COUNT,
252 CL_INVOCATION_COUNT,
253 CL_PRIMITIVES_COUNT,
254 PS_INVOCATION_COUNT,
255 HS_INVOCATION_COUNT,
256 DS_INVOCATION_COUNT,
257 CS_INVOCATION_COUNT,
258 };
259 const uint32_t reg = index_to_reg[q->index];
260
261 ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
262 break;
263 }
264 default:
265 assert(false);
266 }
267 }
268
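/**
 * Snapshot SO_NUM_PRIMS_WRITTEN and SO_PRIM_STORAGE_NEEDED for each stream
 * covered by the query; "end" selects the begin (false) or end (true) slot
 * of struct iris_query_so_overflow.
 */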
269 static void
270 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
271 {
272 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
273 uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
274 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
275 uint32_t offset = q->query_state_ref.offset;
276
277 iris_emit_pipe_control_flush(batch,
278 PIPE_CONTROL_CS_STALL |
279 PIPE_CONTROL_STALL_AT_SCOREBOARD);
280 for (uint32_t i = 0; i < count; i++) {
281 int s = q->index + i;
282 int g_idx = offset + offsetof(struct iris_query_so_overflow,
283 stream[s].num_prims[end]);
284 int w_idx = offset + offsetof(struct iris_query_so_overflow,
285 stream[s].prim_storage_needed[end]);
286 ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
287 bo, g_idx, false);
288 ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
289 bo, w_idx, false);
290 }
291 }
292
293 uint64_t
294 iris_timebase_scale(const struct gen_device_info *devinfo,
295 uint64_t gpu_timestamp)
296 {
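   /* Convert raw timestamp ticks to nanoseconds.  As an illustrative example
    * (hypothetical frequency, not tied to any particular GPU): with
    * timestamp_frequency = 12,500,000 Hz, one tick is 80 ns, so a raw value
    * of 1,000 ticks scales to 80,000 ns.
    */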
297 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
298 }
299
300 static uint64_t
301 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
302 {
303 if (time0 > time1) {
304 return (1ULL << TIMESTAMP_BITS) + time1 - time0;
305 } else {
306 return time1 - time0;
307 }
308 }
309
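/* A stream overflowed if, between the begin and end snapshots, the number of
 * primitives that needed storage (SO_PRIM_STORAGE_NEEDED) differs from the
 * number of primitives actually written (SO_NUM_PRIMS_WRITTEN).
 */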
310 static bool
311 stream_overflowed(struct iris_query_so_overflow *so, int s)
312 {
313 return (so->stream[s].prim_storage_needed[1] -
314 so->stream[s].prim_storage_needed[0]) !=
315 (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
316 }
317
318 static void
319 calculate_result_on_cpu(const struct gen_device_info *devinfo,
320 struct iris_query *q)
321 {
322 switch (q->type) {
323 case PIPE_QUERY_OCCLUSION_PREDICATE:
324 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
325 q->result = q->map->end != q->map->start;
326 break;
327 case PIPE_QUERY_TIMESTAMP:
328 case PIPE_QUERY_TIMESTAMP_DISJOINT:
329 /* The timestamp is the single starting snapshot. */
330 q->result = iris_timebase_scale(devinfo, q->map->start);
331 q->result &= (1ull << TIMESTAMP_BITS) - 1;
332 break;
333 case PIPE_QUERY_TIME_ELAPSED:
334 q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
335 q->result = iris_timebase_scale(devinfo, q->result);
336 q->result &= (1ull << TIMESTAMP_BITS) - 1;
337 break;
338 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
339 q->result = stream_overflowed((void *) q->map, q->index);
340 break;
341 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
342 q->result = false;
343 for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
344 q->result |= stream_overflowed((void *) q->map, i);
345 break;
346 case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
347 q->result = q->map->end - q->map->start;
348
349 /* WaDividePSInvocationCountBy4:HSW,BDW */
350 if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
351 q->result /= 4;
352 break;
353 case PIPE_QUERY_OCCLUSION_COUNTER:
354 case PIPE_QUERY_PRIMITIVES_GENERATED:
355 case PIPE_QUERY_PRIMITIVES_EMITTED:
356 default:
357 q->result = q->map->end - q->map->start;
358 break;
359 }
360
361 q->ready = true;
362 }
363
364 static void
365 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
366 unsigned reg_a, unsigned reg_b)
367 {
368 uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
369
370 math[0] = MI_MATH | (5 - 2);
371 math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
372 math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
373 math[3] = _MI_ALU0(ADD);
374 math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
375 }
376
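/* Emit a left shift of src_reg by `shift` bits into dst_reg.  The MI ALU ops
 * defined above (ADD/SUB/AND/OR/XOR) include no shift, so the shift is built
 * from `shift` successive self-additions, each of which doubles the value.
 */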
377 static void
378 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
379 unsigned src_reg, unsigned shift)
380 {
381 assert(shift > 0);
382
383 int dwords = 1 + 4 * shift;
384
385 uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
386
387 math[0] = MI_MATH | ((1 + 4 * shift) - 2);
388
389 for (unsigned i = 0; i < shift; i++) {
390 unsigned add_src = (i == 0) ? src_reg : dst_reg;
391 math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
392 math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
393 math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
394 math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
395 }
396 }
397
398 /* Emit dwords to multiply GPR0 by N */
399 static void
400 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
401 {
402 VK_OUTARRAY_MAKE(out, dw, dw_count);
403
404 #define APPEND_ALU(op, x, y) \
405 vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
406
407 assert(N > 0);
408 unsigned top_bit = 31 - __builtin_clz(N);
409 for (int i = top_bit - 1; i >= 0; i--) {
410 /* We get our initial data in GPR0 and we write the final data out to
411 * GPR0 but we use GPR1 as our scratch register.
412 */
413 unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
414 unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
415
416 /* Shift the current value left by 1 */
417 APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
418 APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
419 APPEND_ALU(ADD, 0, 0);
420
421 if (N & (1 << i)) {
422 /* Store ACCU to R1 and add R0 to R1 */
423 APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
424 APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
425 APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
426 APPEND_ALU(ADD, 0, 0);
427 }
428
429 APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
430 }
431
432 #undef APPEND_ALU
433 }
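/* Illustrative trace of build_alu_multiply_gpr0(): for N = 10 (0b1010),
 * top_bit = 3 and the loop runs i = 2, 1, 0.  With the initial value v in R0:
 *   i = 2: double R0 into R1            -> R1 = 2v   (bit 2 of N is clear)
 *   i = 1: double R1, then add R0 (= v) -> R1 = 5v   (bit 1 of N is set)
 *   i = 0: double R1 into R0            -> R0 = 10v  (bit 0 of N is clear)
 * i.e. a shift-and-add walk over the multiplier's binary digits.
 */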
434
435 static void
436 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
437 {
438 uint32_t num_dwords;
439 build_alu_multiply_gpr0(NULL, &num_dwords, N);
440
441 uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
442 math[0] = MI_MATH | (num_dwords - 2);
443 build_alu_multiply_gpr0(&math[1], &num_dwords, N);
444 }
445
446 void
447 iris_math_div32_gpr0(struct iris_context *ice,
448 struct iris_batch *batch,
449 uint32_t D)
450 {
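   /* Compute GPR0 = GPR0 / D with a reciprocal multiplication.  In terms of
    * the operations emitted below, using the parameters returned by
    * util_compute_fast_udiv_info(), this is roughly:
    *
    *   GPR0 = (((GPR0 >> pre_shift) * multiplier
    *             + (increment ? multiplier : 0)) >> 32) >> post_shift
    *
    * where adding the multiplier stands in for incrementing the numerator,
    * since m * (n + 1) == m * n + m.
    */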
451 /* Zero out the top of GPR0 */
452 emit_lri32(batch, CS_GPR(0) + 4, 0);
453
454 if (D == 0) {
455       /* This is invalid, but we should do something, so set GPR0 to 0. */
456 emit_lri32(batch, CS_GPR(0), 0);
457 } else if (util_is_power_of_two_or_zero(D)) {
458 unsigned log2_D = util_logbase2(D);
459 assert(log2_D < 32);
460 /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
461 * the top 32 bits of the result.
462 */
463 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
464 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
465 emit_lri32(batch, CS_GPR(0) + 4, 0);
466 } else {
467 struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
468 assert(m.multiplier <= UINT32_MAX);
469
470 if (m.pre_shift) {
471          /* We right-shift by pre_shift by left-shifting by 32 - pre_shift and
472           * taking the top 32 bits of the result.
473 */
474 if (m.pre_shift < 32)
475 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
476 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
477 emit_lri32(batch, CS_GPR(0) + 4, 0);
478 }
479
480 /* Do the 32x32 multiply into gpr0 */
481 emit_mul_gpr0(batch, m.multiplier);
482
483 if (m.increment) {
484          /* Apply the increment by adding the multiplier to the product: m * (n + 1) == m * n + m. */
485 emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
486 emit_lri32(batch, CS_GPR(1) + 4, 0);
487 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
488 }
489
490       /* Shift right by 32 by moving the high dword of GPR0 into the low dword. */
491 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
492 emit_lri32(batch, CS_GPR(0) + 4, 0);
493
494 if (m.post_shift) {
495          /* We right-shift by post_shift by left-shifting by 32 - post_shift and
496           * taking the top 32 bits of the result.
497 */
498 if (m.post_shift < 32)
499 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
500 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
501 emit_lri32(batch, CS_GPR(0) + 4, 0);
502 }
503 }
504 }
505
506 /*
507 * GPR0 = (GPR0 == 0) ? 0 : 1;
508 */
509 static void
510 gpr0_to_bool(struct iris_context *ice)
511 {
512 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
513
514 ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
515
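   /* The ADD of R0 and 0 below sets the ALU zero flag iff R0 == 0.  STOREINV
    * then writes the inverse of ZF into R0, and ANDing with the 1 preloaded
    * into GPR1 keeps only bit 0, leaving exactly 0 or 1.
    */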
516 static const uint32_t math[] = {
517 MI_MATH | (9 - 2),
518 MI_ALU2(LOAD, SRCA, R0),
519 MI_ALU1(LOAD0, SRCB),
520 MI_ALU0(ADD),
521 MI_ALU2(STOREINV, R0, ZF),
522 MI_ALU2(LOAD, SRCA, R0),
523 MI_ALU2(LOAD, SRCB, R1),
524 MI_ALU0(AND),
525 MI_ALU2(STORE, R0, ACCU),
526 };
527 iris_batch_emit(batch, math, sizeof(math));
528 }
529
530 static void
531 load_overflow_data_to_cs_gprs(struct iris_context *ice,
532 struct iris_query *q,
533 int idx)
534 {
535 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
536 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
537 uint32_t offset = q->query_state_ref.offset;
538
539 ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo, offset +
540 offsetof(struct iris_query_so_overflow,
541 stream[idx].prim_storage_needed[0]));
542 ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo, offset +
543 offsetof(struct iris_query_so_overflow,
544 stream[idx].prim_storage_needed[1]));
545
546 ice->vtbl.load_register_mem64(batch, CS_GPR(3), bo, offset +
547 offsetof(struct iris_query_so_overflow,
548 stream[idx].num_prims[0]));
549 ice->vtbl.load_register_mem64(batch, CS_GPR(4), bo, offset +
550 offsetof(struct iris_query_so_overflow,
551 stream[idx].num_prims[1]));
552 }
553
554 /*
555 * R3 = R4 - R3;
556 * R1 = R2 - R1;
557 * R1 = R3 - R1;
558 * R0 = R0 | R1;
559 */
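/* With the GPRs as loaded by load_overflow_data_to_cs_gprs() (R1/R2 hold
 * prim_storage_needed[0]/[1], R3/R4 hold num_prims[0]/[1]), this ORs a
 * non-zero value into R0 exactly when the two deltas differ, i.e. when the
 * stream overflowed.  gpr0_to_bool() normalizes R0 to 0/1 afterwards.
 */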
560 static void
561 calc_overflow_for_stream(struct iris_context *ice)
562 {
563 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
564 static const uint32_t maths[] = {
565 MI_MATH | (17 - 2),
566 MI_ALU2(LOAD, SRCA, R4),
567 MI_ALU2(LOAD, SRCB, R3),
568 MI_ALU0(SUB),
569 MI_ALU2(STORE, R3, ACCU),
570 MI_ALU2(LOAD, SRCA, R2),
571 MI_ALU2(LOAD, SRCB, R1),
572 MI_ALU0(SUB),
573 MI_ALU2(STORE, R1, ACCU),
574 MI_ALU2(LOAD, SRCA, R3),
575 MI_ALU2(LOAD, SRCB, R1),
576 MI_ALU0(SUB),
577 MI_ALU2(STORE, R1, ACCU),
578 MI_ALU2(LOAD, SRCA, R1),
579 MI_ALU2(LOAD, SRCB, R0),
580 MI_ALU0(OR),
581 MI_ALU2(STORE, R0, ACCU),
582 };
583
584 iris_batch_emit(batch, maths, sizeof(maths));
585 }
586
587 static void
588 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
589 {
590 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
591
592 ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
593
594 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
595 load_overflow_data_to_cs_gprs(ice, q, q->index);
596 calc_overflow_for_stream(ice);
597 } else {
598 for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
599 load_overflow_data_to_cs_gprs(ice, q, i);
600 calc_overflow_for_stream(ice);
601 }
602 }
603
604 gpr0_to_bool(ice);
605 }
606
607 /*
608 * GPR0 = GPR0 & ((1ull << n) -1);
609 */
610 static void
611 keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
612 {
613 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
614
615 ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
616 static const uint32_t math[] = {
617 MI_MATH | (5 - 2),
618 MI_ALU2(LOAD, SRCA, R0),
619 MI_ALU2(LOAD, SRCB, R1),
620 MI_ALU0(AND),
621 MI_ALU2(STORE, R0, ACCU),
622 };
623 iris_batch_emit(batch, math, sizeof(math));
624 }
625
626 /*
627 * GPR0 = GPR0 << 30;
628 */
629 static void
630 shl_gpr0_by_30_bits(struct iris_context *ice)
631 {
632 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
633    /* First keep only the low 34 bits of GPR0 so the 30-bit shift below cannot overflow 64 bits. */
634 keep_gpr0_lower_n_bits(ice, 34);
635
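   /* Each LOAD/LOAD/ADD/STORE quartet below doubles GPR0.  Five MI_MATH
    * packets of six doublings each gives the 30 doublings that make up the
    * shift.
    */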
636 static const uint32_t shl_math[] = {
637 MI_ALU2(LOAD, SRCA, R0),
638 MI_ALU2(LOAD, SRCB, R0),
639 MI_ALU0(ADD),
640 MI_ALU2(STORE, R0, ACCU),
641 };
642
643 const uint32_t outer_count = 5;
644 const uint32_t inner_count = 6;
645 const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
646 const uint32_t batch_len = cmd_len * outer_count;
647 uint32_t *map = iris_get_command_space(batch, batch_len * 4);
648 uint32_t offset = 0;
649 for (int o = 0; o < outer_count; o++) {
650 map[offset++] = MI_MATH | (cmd_len - 2);
651 for (int i = 0; i < inner_count; i++) {
652 memcpy(&map[offset], shl_math, sizeof(shl_math));
653 offset += 4;
654 }
655 }
656 }
657
658 /*
659 * GPR0 = GPR0 >> 2;
660 *
661 * Note that the upper 30 bits of GPR0 are lost!
662 */
663 static void
664 shr_gpr0_by_2_bits(struct iris_context *ice)
665 {
666 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
667 shl_gpr0_by_30_bits(ice);
668 ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
669 ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
670 }
671
672 /**
673 * Calculate the result and store it to CS_GPR0.
674 */
675 static void
676 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
677 {
678 struct iris_batch *batch = &ice->batches[q->batch_idx];
679 struct iris_screen *screen = (void *) ice->ctx.screen;
680 const struct gen_device_info *devinfo = &batch->screen->devinfo;
681 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
682 uint32_t offset = q->query_state_ref.offset;
683
684 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
685 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
686 overflow_result_to_gpr0(ice, q);
687 return;
688 }
689
690 if (q->type == PIPE_QUERY_TIMESTAMP) {
691 ice->vtbl.load_register_mem64(batch, CS_GPR(0), bo,
692 offset +
693 offsetof(struct iris_query_snapshots, start));
694 /* TODO: This discards any fractional bits of the timebase scale.
695 * We would need to do a bit of fixed point math on the CS ALU, or
696 * launch an actual shader to calculate this with full precision.
697 */
698 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
699 keep_gpr0_lower_n_bits(ice, 36);
700 return;
701 }
702
703 ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo,
704 offset +
705 offsetof(struct iris_query_snapshots, start));
706 ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo,
707 offset +
708 offsetof(struct iris_query_snapshots, end));
709
710 static const uint32_t math[] = {
711 MI_MATH | (5 - 2),
712 MI_ALU2(LOAD, SRCA, R2),
713 MI_ALU2(LOAD, SRCB, R1),
714 MI_ALU0(SUB),
715 MI_ALU2(STORE, R0, ACCU),
716 };
717 iris_batch_emit(batch, math, sizeof(math));
718
719 /* WaDividePSInvocationCountBy4:HSW,BDW */
720 if (devinfo->gen == 8 &&
721 q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
722 q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
723 shr_gpr0_by_2_bits(ice);
724
725 if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
726 q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
727 gpr0_to_bool(ice);
728
729 if (q->type == PIPE_QUERY_TIME_ELAPSED) {
730 /* TODO: This discards fractional bits (see above). */
731 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
732 }
733 }
734
735 static struct pipe_query *
736 iris_create_query(struct pipe_context *ctx,
737 unsigned query_type,
738 unsigned index)
739 {
740 struct iris_query *q = calloc(1, sizeof(struct iris_query));
741
742 q->type = query_type;
743 q->index = index;
744
745 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
746 q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
747 q->batch_idx = IRIS_BATCH_COMPUTE;
748 else
749 q->batch_idx = IRIS_BATCH_RENDER;
750 return (struct pipe_query *) q;
751 }
752
753 static void
754 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
755 {
756 struct iris_query *query = (void *) p_query;
757 struct iris_screen *screen = (void *) ctx->screen;
758 iris_syncpt_reference(screen, &query->syncpt, NULL);
759 free(query);
760 }
761
762
763 static boolean
764 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
765 {
766 struct iris_context *ice = (void *) ctx;
767 struct iris_query *q = (void *) query;
768 void *ptr = NULL;
769 uint32_t size;
770
771 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
772 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
773 size = sizeof(struct iris_query_so_overflow);
774 else
775 size = sizeof(struct iris_query_snapshots);
776
777 u_upload_alloc(ice->query_buffer_uploader, 0,
778 size, size, &q->query_state_ref.offset,
779 &q->query_state_ref.res, &ptr);
780
781 if (!iris_resource_bo(q->query_state_ref.res))
782 return false;
783
784 q->map = ptr;
785 if (!q->map)
786 return false;
787
788 q->result = 0ull;
789 q->ready = false;
790 q->map->snapshots_landed = false;
791
792 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
793 ice->state.prims_generated_query_active = true;
794 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
795 }
796
797 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
798 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
799 write_overflow_values(ice, q, false);
800 else
801 write_value(ice, q,
802 q->query_state_ref.offset +
803 offsetof(struct iris_query_snapshots, start));
804
805 return true;
806 }
807
808 static bool
809 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
810 {
811 struct iris_context *ice = (void *) ctx;
812 struct iris_query *q = (void *) query;
813 struct iris_batch *batch = &ice->batches[q->batch_idx];
814 struct iris_screen *screen = (void *) ctx->screen;
815
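   /* TIMESTAMP queries have no begin; take the single snapshot now, at
    * end_query time.  calculate_result_on_cpu() reads it back from the
    * "start" slot.
    */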
816 if (q->type == PIPE_QUERY_TIMESTAMP) {
817 iris_begin_query(ctx, query);
818 struct iris_syncpt *syncpt =
819 ((struct iris_syncpt **) util_dynarray_begin(&batch->syncpts))[0];
820 iris_syncpt_reference(screen, &q->syncpt, syncpt);
821 mark_available(ice, q);
822 return true;
823 }
824
825 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
826 ice->state.prims_generated_query_active = false;
827 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
828 }
829
830 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
831 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
832 write_overflow_values(ice, q, true);
833 else
834 write_value(ice, q,
835 q->query_state_ref.offset +
836 offsetof(struct iris_query_snapshots, end));
837
838 struct iris_syncpt *syncpt =
839 ((struct iris_syncpt **) util_dynarray_begin(&batch->syncpts))[0];
840 iris_syncpt_reference(screen, &q->syncpt, syncpt);
841 mark_available(ice, q);
842
843 return true;
844 }
845
846 /**
847 * See if the snapshots have landed for a query, and if so, compute the
848 * result and mark it ready. Does not flush (unlike iris_get_query_result).
849 */
850 static void
851 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
852 {
853 struct iris_screen *screen = (void *) ice->ctx.screen;
854 const struct gen_device_info *devinfo = &screen->devinfo;
855
856 if (!q->ready && q->map->snapshots_landed) {
857 calculate_result_on_cpu(devinfo, q);
858 }
859 }
860
861 static boolean
862 iris_get_query_result(struct pipe_context *ctx,
863 struct pipe_query *query,
864 boolean wait,
865 union pipe_query_result *result)
866 {
867 struct iris_context *ice = (void *) ctx;
868 struct iris_query *q = (void *) query;
869 struct iris_screen *screen = (void *) ctx->screen;
870 const struct gen_device_info *devinfo = &screen->devinfo;
871 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
872
873 if (!q->ready) {
874 if (iris_batch_references(&ice->batches[q->batch_idx], bo))
875 iris_batch_flush(&ice->batches[q->batch_idx]);
876
877 while (!q->map->snapshots_landed) {
878 if (wait)
879 iris_wait_syncpt(ctx->screen, q->syncpt, INT64_MAX);
880 else
881 return false;
882 }
883
884 assert(q->map->snapshots_landed);
885 calculate_result_on_cpu(devinfo, q);
886 }
887
888 assert(q->ready);
889
890 result->u64 = q->result;
891
892 return true;
893 }
894
895 static void
896 iris_get_query_result_resource(struct pipe_context *ctx,
897 struct pipe_query *query,
898 boolean wait,
899 enum pipe_query_value_type result_type,
900 int index,
901 struct pipe_resource *p_res,
902 unsigned offset)
903 {
904 struct iris_context *ice = (void *) ctx;
905 struct iris_query *q = (void *) query;
906 struct iris_batch *batch = &ice->batches[q->batch_idx];
907 const struct gen_device_info *devinfo = &batch->screen->devinfo;
908 struct iris_resource *res = (void *) p_res;
909 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
910    unsigned snapshots_landed_offset = q->query_state_ref.offset +
911       offsetof(struct iris_query_snapshots, snapshots_landed);
912
913 res->bind_history |= PIPE_BIND_QUERY_BUFFER;
914
915 if (index == -1) {
916 /* They're asking for the availability of the result. If we still
917 * have commands queued up which produce the result, submit them
918 * now so that progress happens. Either way, copy the snapshots
919 * landed field to the destination resource.
920 */
921 if (iris_batch_references(batch, bo))
922 iris_batch_flush(batch);
923
924 ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
925 bo, snapshots_landed_offset,
926 result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
927 return;
928 }
929
930 if (!q->ready && q->map->snapshots_landed) {
931 /* The final snapshots happen to have landed, so let's just compute
932 * the result on the CPU now...
933 */
934 calculate_result_on_cpu(devinfo, q);
935 }
936
937 if (q->ready) {
938 /* We happen to have the result on the CPU, so just copy it. */
939 if (result_type <= PIPE_QUERY_TYPE_U32) {
940 ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
941 q->result);
942 } else {
943 ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
944 q->result);
945 }
946
947       /* Make sure the result lands before they bind the QBO elsewhere
948 * and use the result.
949 */
950 // XXX: Why? i965 doesn't do this.
951 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
952 return;
953 }
954
955    /* Calculate the result into CS_GPR0 */
956 calculate_result_on_gpu(ice, q);
957
958 bool predicated = !wait && !q->stalled;
959
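   /* If the caller isn't willing to wait and we haven't already stalled,
    * predicate the result write on snapshots_landed; the intent is that a
    * not-yet-finished query leaves the destination unmodified rather than
    * storing a stale GPR0 value.
    */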
960 if (predicated) {
961 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
962 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
963 snapshots_landed_offset);
964 uint32_t predicate = MI_PREDICATE |
965 MI_PREDICATE_LOADOP_LOADINV |
966 MI_PREDICATE_COMBINEOP_SET |
967 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
968 iris_batch_emit(batch, &predicate, sizeof(uint32_t));
969 }
970
971 if (result_type <= PIPE_QUERY_TYPE_U32) {
972 ice->vtbl.store_register_mem32(batch, CS_GPR(0),
973 iris_resource_bo(p_res),
974 offset, predicated);
975 } else {
976 ice->vtbl.store_register_mem64(batch, CS_GPR(0),
977 iris_resource_bo(p_res),
978 offset, predicated);
979 }
980 }
981
982 static void
983 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
984 {
985 struct iris_context *ice = (void *) ctx;
986
987 if (ice->state.statistics_counters_enabled == enable)
988 return;
989
990 // XXX: most packets aren't paying attention to this yet, because it'd
991 // have to be done dynamically at draw time, which is a pain
992 ice->state.statistics_counters_enabled = enable;
993 ice->state.dirty |= IRIS_DIRTY_CLIP |
994 IRIS_DIRTY_GS |
995 IRIS_DIRTY_RASTER |
996 IRIS_DIRTY_STREAMOUT |
997 IRIS_DIRTY_TCS |
998 IRIS_DIRTY_TES |
999 IRIS_DIRTY_VS |
1000 IRIS_DIRTY_WM;
1001 }
1002
1003 static void
1004 set_predicate_enable(struct iris_context *ice, bool value)
1005 {
1006 if (value)
1007 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1008 else
1009 ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
1010 }
1011
1012 static void
1013 set_predicate_for_result(struct iris_context *ice,
1014 struct iris_query *q,
1015 bool inverted)
1016 {
1017 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
1018 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
1019
1020 /* The CPU doesn't have the query result yet; use hardware predication */
1021 ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
1022
1023 /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
1024 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
1025 q->stalled = true;
1026
1027 switch (q->type) {
1028 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1029 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1030 overflow_result_to_gpr0(ice, q);
1031
1032 ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
1033 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
1034 break;
1035 default:
1036 /* PIPE_QUERY_OCCLUSION_* */
1037 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
1038 offsetof(struct iris_query_snapshots, start) +
1039 q->query_state_ref.offset);
1040 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
1041 offsetof(struct iris_query_snapshots, end) +
1042 q->query_state_ref.offset);
1043 break;
1044 }
1045
1046 uint32_t mi_predicate = MI_PREDICATE |
1047 MI_PREDICATE_COMBINEOP_SET |
1048 MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
1049 (inverted ? MI_PREDICATE_LOADOP_LOAD
1050 : MI_PREDICATE_LOADOP_LOADINV);
1051 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
1052
1053 /* We immediately set the predicate on the render batch, as all the
1054 * counters come from 3D operations. However, we may need to predicate
1055 * a compute dispatch, which executes in a different GEM context and has
1056 * a different MI_PREDICATE_DATA register. So, we save the result to
1057 * memory and reload it in iris_launch_grid.
1058 */
1059 unsigned offset = q->query_state_ref.offset +
1060 offsetof(struct iris_query_snapshots, predicate_data);
1061 ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
1062 bo, offset, false);
1063 ice->state.compute_predicate = bo;
1064 }
1065
1066 static void
1067 iris_render_condition(struct pipe_context *ctx,
1068 struct pipe_query *query,
1069 boolean condition,
1070 enum pipe_render_cond_flag mode)
1071 {
1072 struct iris_context *ice = (void *) ctx;
1073 struct iris_query *q = (void *) query;
1074
1075 /* The old condition isn't relevant; we'll update it if necessary */
1076 ice->state.compute_predicate = NULL;
1077
1078 if (!q) {
1079 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1080 return;
1081 }
1082
1083 iris_check_query_no_flush(ice, q);
1084
1085 if (q->result || q->ready) {
1086 set_predicate_enable(ice, (q->result != 0) ^ condition);
1087 } else {
1088 if (mode == PIPE_RENDER_COND_NO_WAIT ||
1089 mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
1090 perf_debug(&ice->dbg, "Conditional rendering demoted from "
1091 "\"no wait\" to \"wait\".");
1092 }
1093 set_predicate_for_result(ice, q, condition);
1094 }
1095 }
1096
1097 void
1098 iris_init_query_functions(struct pipe_context *ctx)
1099 {
1100 ctx->create_query = iris_create_query;
1101 ctx->destroy_query = iris_destroy_query;
1102 ctx->begin_query = iris_begin_query;
1103 ctx->end_query = iris_end_query;
1104 ctx->get_query_result = iris_get_query_result;
1105 ctx->get_query_result_resource = iris_get_query_result_resource;
1106 ctx->set_active_query_state = iris_set_active_query_state;
1107 ctx->render_condition = iris_render_condition;
1108 }