1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_query.c
25 *
26 * Query object support. This allows measuring various simple statistics
27 * via counters on the GPU.
28 */
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "util/u_upload_mgr.h"
39 #include "iris_context.h"
40 #include "iris_defines.h"
41 #include "iris_fence.h"
42 #include "iris_resource.h"
43 #include "iris_screen.h"
44 #include "vulkan/util/vk_util.h"
45
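/* MMIO addresses of the pipeline statistics and streamout counter registers
 * that we snapshot with MI_STORE_REGISTER_MEM for statistics queries.
 */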
46 #define IA_VERTICES_COUNT 0x2310
47 #define IA_PRIMITIVES_COUNT 0x2318
48 #define VS_INVOCATION_COUNT 0x2320
49 #define HS_INVOCATION_COUNT 0x2300
50 #define DS_INVOCATION_COUNT 0x2308
51 #define GS_INVOCATION_COUNT 0x2328
52 #define GS_PRIMITIVES_COUNT 0x2330
53 #define CL_INVOCATION_COUNT 0x2338
54 #define CL_PRIMITIVES_COUNT 0x2340
55 #define PS_INVOCATION_COUNT 0x2348
56 #define CS_INVOCATION_COUNT 0x2290
57 #define PS_DEPTH_COUNT 0x2350
58
59 #define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
60
61 #define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
62
63 #define MI_MATH (0x1a << 23)
64
65 #define MI_ALU_LOAD 0x080
66 #define MI_ALU_LOADINV 0x480
67 #define MI_ALU_LOAD0 0x081
68 #define MI_ALU_LOAD1 0x481
69 #define MI_ALU_ADD 0x100
70 #define MI_ALU_SUB 0x101
71 #define MI_ALU_AND 0x102
72 #define MI_ALU_OR 0x103
73 #define MI_ALU_XOR 0x104
74 #define MI_ALU_STORE 0x180
75 #define MI_ALU_STOREINV 0x580
76
77 #define MI_ALU_SRCA 0x20
78 #define MI_ALU_SRCB 0x21
79 #define MI_ALU_ACCU 0x31
80 #define MI_ALU_ZF 0x32
81 #define MI_ALU_CF 0x33
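/* Each MI_MATH ALU instruction is a single dword: an opcode (the MI_ALU_*
 * values above) in bits 31:20, operand 1 in bits 19:10, and operand 2 in
 * bits 9:0.  The MI_ALU*() / _MI_ALU*() helpers used below (from the
 * included defines header) pack these fields; see the MI_MATH description
 * in the PRMs for details.
 */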
82
83 #define emit_lri32 ice->vtbl.load_register_imm32
84 #define emit_lri64 ice->vtbl.load_register_imm64
85 #define emit_lrr32 ice->vtbl.load_register_reg32
86
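/**
 * Per-query CPU-side state: the query type/index, a cached result once it
 * is known ("ready"), and a reference to the snapshot buffer (mapped at
 * "map").  "stalled" records that the snapshot writes were accompanied by a
 * stall, so no predication is needed when copying results on the GPU.
 * "batch_idx" selects the render or compute batch the query executes in.
 */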
87 struct iris_query {
88 enum pipe_query_type type;
89 int index;
90
91 bool ready;
92
93 bool stalled;
94
95 uint64_t result;
96
97 struct iris_state_ref query_state_ref;
98 struct iris_query_snapshots *map;
99 struct iris_syncpt *syncpt;
100
101 int batch_idx;
102 };
103
104 struct iris_query_snapshots {
105 /** iris_render_condition's saved MI_PREDICATE_RESULT value. */
106 uint64_t predicate_result;
107
108 /** Have the start/end snapshots landed? */
109 uint64_t snapshots_landed;
110
111 /** Starting and ending counter snapshots */
112 uint64_t start;
113 uint64_t end;
114 };
115
116 struct iris_query_so_overflow {
117 uint64_t predicate_result;
118 uint64_t snapshots_landed;
119
120 struct {
121 uint64_t prim_storage_needed[2];
122 uint64_t num_prims[2];
123 } stream[4];
124 };
125
126 /**
127 * Is this type of query written by PIPE_CONTROL?
128 */
129 static bool
130 iris_is_query_pipelined(struct iris_query *q)
131 {
132 switch (q->type) {
133 case PIPE_QUERY_OCCLUSION_COUNTER:
134 case PIPE_QUERY_OCCLUSION_PREDICATE:
135 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
136 case PIPE_QUERY_TIMESTAMP:
137 case PIPE_QUERY_TIMESTAMP_DISJOINT:
138 case PIPE_QUERY_TIME_ELAPSED:
139 return true;
140
141 default:
142 return false;
143 }
144 }
145
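/**
 * Write the "availability" flag (snapshots_landed) for a query.  Pipelined
 * queries use a PIPE_CONTROL post-sync write with FLUSH_ENABLE so the flag
 * lands only after the result snapshots; others can use MI_STORE_DATA_IMM.
 */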
146 static void
147 mark_available(struct iris_context *ice, struct iris_query *q)
148 {
149 struct iris_batch *batch = &ice->batches[q->batch_idx];
150 unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
151 unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
152 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
153 offset += q->query_state_ref.offset;
154
155 if (!iris_is_query_pipelined(q)) {
156 ice->vtbl.store_data_imm64(batch, bo, offset, true);
157 } else {
158 /* Order available *after* the query results. */
159 flags |= PIPE_CONTROL_FLUSH_ENABLE;
160 iris_emit_pipe_control_write(batch, "query: mark available",
161 flags, bo, offset, true);
162 }
163 }
164
165 /**
166 * Write a pipelined counter snapshot (PS_DEPTH_COUNT or TIMESTAMP, per "flags") to the given offset in the query's buffer via a PIPE_CONTROL.
167 */
168 static void
169 iris_pipelined_write(struct iris_batch *batch,
170 struct iris_query *q,
171 enum pipe_control_flags flags,
172 unsigned offset)
173 {
174 const struct gen_device_info *devinfo = &batch->screen->devinfo;
175 const unsigned optional_cs_stall =
176 devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
177 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
178
179 iris_emit_pipe_control_write(batch, "query: pipelined snapshot write",
180 flags | optional_cs_stall,
181 bo, offset, 0ull);
182 }
183
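/**
 * Record the begin or end snapshot for a query at the given buffer offset,
 * either via a pipelined PIPE_CONTROL write (occlusion/timestamp queries)
 * or by storing the relevant counter register with MI_STORE_REGISTER_MEM
 * after stalling the pipeline.
 */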
184 static void
185 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
186 {
187 struct iris_batch *batch = &ice->batches[q->batch_idx];
188 const struct gen_device_info *devinfo = &batch->screen->devinfo;
189 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
190
191 if (!iris_is_query_pipelined(q)) {
192 iris_emit_pipe_control_flush(batch,
193 "query: non-pipelined snapshot write",
194 PIPE_CONTROL_CS_STALL |
195 PIPE_CONTROL_STALL_AT_SCOREBOARD);
196 q->stalled = true;
197 }
198
199 switch (q->type) {
200 case PIPE_QUERY_OCCLUSION_COUNTER:
201 case PIPE_QUERY_OCCLUSION_PREDICATE:
202 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
203 if (devinfo->gen >= 10) {
204 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
205 * bit set prior to programming a PIPE_CONTROL with Write PS Depth
206 * Count sync operation."
207 */
208 iris_emit_pipe_control_flush(batch,
209 "workaround: depth stall before writing "
210 "PS_DEPTH_COUNT",
211 PIPE_CONTROL_DEPTH_STALL);
212 }
213 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
214 PIPE_CONTROL_WRITE_DEPTH_COUNT |
215 PIPE_CONTROL_DEPTH_STALL,
216 offset);
217 break;
218 case PIPE_QUERY_TIME_ELAPSED:
219 case PIPE_QUERY_TIMESTAMP:
220 case PIPE_QUERY_TIMESTAMP_DISJOINT:
221 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
222 PIPE_CONTROL_WRITE_TIMESTAMP,
223 offset);
224 break;
225 case PIPE_QUERY_PRIMITIVES_GENERATED:
226 ice->vtbl.store_register_mem64(batch,
227 q->index == 0 ? CL_INVOCATION_COUNT :
228 SO_PRIM_STORAGE_NEEDED(q->index),
229 bo, offset, false);
230 break;
231 case PIPE_QUERY_PRIMITIVES_EMITTED:
232 ice->vtbl.store_register_mem64(batch,
233 SO_NUM_PRIMS_WRITTEN(q->index),
234 bo, offset, false);
235 break;
236 case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
237 static const uint32_t index_to_reg[] = {
238 IA_VERTICES_COUNT,
239 IA_PRIMITIVES_COUNT,
240 VS_INVOCATION_COUNT,
241 GS_INVOCATION_COUNT,
242 GS_PRIMITIVES_COUNT,
243 CL_INVOCATION_COUNT,
244 CL_PRIMITIVES_COUNT,
245 PS_INVOCATION_COUNT,
246 HS_INVOCATION_COUNT,
247 DS_INVOCATION_COUNT,
248 CS_INVOCATION_COUNT,
249 };
250 const uint32_t reg = index_to_reg[q->index];
251
252 ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
253 break;
254 }
255 default:
256 assert(false);
257 }
258 }
259
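/**
 * Snapshot the SO_NUM_PRIMS_WRITTEN and SO_PRIM_STORAGE_NEEDED counters for
 * every stream covered by an overflow query (a single stream, or all four
 * for SO_OVERFLOW_ANY_PREDICATE).  "end" selects the begin/end slot.
 */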
260 static void
261 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
262 {
263 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
264 uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
265 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
266 uint32_t offset = q->query_state_ref.offset;
267
268 iris_emit_pipe_control_flush(batch,
269 "query: write SO overflow snapshots",
270 PIPE_CONTROL_CS_STALL |
271 PIPE_CONTROL_STALL_AT_SCOREBOARD);
272 for (uint32_t i = 0; i < count; i++) {
273 int s = q->index + i;
274 int g_idx = offset + offsetof(struct iris_query_so_overflow,
275 stream[s].num_prims[end]);
276 int w_idx = offset + offsetof(struct iris_query_so_overflow,
277 stream[s].prim_storage_needed[end]);
278 ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
279 bo, g_idx, false);
280 ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
281 bo, w_idx, false);
282 }
283 }
284
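/**
 * Return time1 - time0, allowing for a single wrap of the raw
 * TIMESTAMP_BITS-wide GPU timestamp counter in between.
 */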
285 static uint64_t
286 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
287 {
288 if (time0 > time1) {
289 return (1ULL << TIMESTAMP_BITS) + time1 - time0;
290 } else {
291 return time1 - time0;
292 }
293 }
294
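/**
 * A stream overflowed if the primitives needing storage and the primitives
 * actually written diverged while the query was active, i.e. some generated
 * primitives did not fit in the streamout buffers.
 */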
295 static bool
296 stream_overflowed(struct iris_query_so_overflow *so, int s)
297 {
298 return (so->stream[s].prim_storage_needed[1] -
299 so->stream[s].prim_storage_needed[0]) !=
300 (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
301 }
302
303 static void
304 calculate_result_on_cpu(const struct gen_device_info *devinfo,
305 struct iris_query *q)
306 {
307 switch (q->type) {
308 case PIPE_QUERY_OCCLUSION_PREDICATE:
309 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
310 q->result = q->map->end != q->map->start;
311 break;
312 case PIPE_QUERY_TIMESTAMP:
313 case PIPE_QUERY_TIMESTAMP_DISJOINT:
314 /* The timestamp is the single starting snapshot. */
315 q->result = gen_device_info_timebase_scale(devinfo, q->map->start);
316 q->result &= (1ull << TIMESTAMP_BITS) - 1;
317 break;
318 case PIPE_QUERY_TIME_ELAPSED:
319 q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
320 q->result = gen_device_info_timebase_scale(devinfo, q->result);
321 q->result &= (1ull << TIMESTAMP_BITS) - 1;
322 break;
323 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
324 q->result = stream_overflowed((void *) q->map, q->index);
325 break;
326 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
327 q->result = false;
328 for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
329 q->result |= stream_overflowed((void *) q->map, i);
330 break;
331 case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
332 q->result = q->map->end - q->map->start;
333
334 /* WaDividePSInvocationCountBy4:HSW,BDW */
335 if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
336 q->result /= 4;
337 break;
338 case PIPE_QUERY_OCCLUSION_COUNTER:
339 case PIPE_QUERY_PRIMITIVES_GENERATED:
340 case PIPE_QUERY_PRIMITIVES_EMITTED:
341 default:
342 q->result = q->map->end - q->map->start;
343 break;
344 }
345
346 q->ready = true;
347 }
348
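/* Emit an MI_MATH computing dst_reg = reg_a + reg_b using the CS ALU. */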
349 static void
350 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
351 unsigned reg_a, unsigned reg_b)
352 {
353 uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
354
355 math[0] = MI_MATH | (5 - 2);
356 math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
357 math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
358 math[3] = _MI_ALU0(ADD);
359 math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
360 }
361
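/* The CS ALU on these GPUs has no shift operation, so synthesize a left
 * shift by repeated doubling: each LOAD/LOAD/ADD/STORE round adds the value
 * to itself, so "shift" rounds compute dst_reg = src_reg << shift.  For
 * example, a shift of 3 multiplies the source by 8.
 */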
362 static void
363 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
364 unsigned src_reg, unsigned shift)
365 {
366 assert(shift > 0);
367
368 int dwords = 1 + 4 * shift;
369
370 uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
371
372 math[0] = MI_MATH | ((1 + 4 * shift) - 2);
373
374 for (unsigned i = 0; i < shift; i++) {
375 unsigned add_src = (i == 0) ? src_reg : dst_reg;
376 math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
377 math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
378 math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
379 math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
380 }
381 }
382
383 /* Emit dwords to multiply GPR0 by N */
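/* This is classic double-and-add: walk N's bits from the top, doubling the
 * running value each step and adding the original GPR0 whenever the bit is
 * set.  Callers invoke it twice: once with dw == NULL just to learn the
 * dword count, then again to actually fill in the MI_MATH payload.
 */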
384 static void
385 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
386 {
387 VK_OUTARRAY_MAKE(out, dw, dw_count);
388
389 #define APPEND_ALU(op, x, y) \
390 vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
391
392 assert(N > 0);
393 unsigned top_bit = 31 - __builtin_clz(N);
394 for (int i = top_bit - 1; i >= 0; i--) {
395 /* We get our initial data in GPR0 and we write the final data out to
396 * GPR0 but we use GPR1 as our scratch register.
397 */
398 unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
399 unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
400
401 /* Shift the current value left by 1 */
402 APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
403 APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
404 APPEND_ALU(ADD, 0, 0);
405
406 if (N & (1 << i)) {
407 /* Store ACCU to R1 and add R0 to R1 */
408 APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
409 APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
410 APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
411 APPEND_ALU(ADD, 0, 0);
412 }
413
414 APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
415 }
416
417 #undef APPEND_ALU
418 }
419
420 static void
421 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
422 {
423 uint32_t num_dwords;
424 build_alu_multiply_gpr0(NULL, &num_dwords, N);
425
426 uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
427 math[0] = MI_MATH | (num_dwords - 2);
428 build_alu_multiply_gpr0(&math[1], &num_dwords, N);
429 }
430
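/**
 * GPR0 = GPR0 / D (unsigned 32-bit division by a constant), evaluated on
 * the command streamer.  Powers of two become a shift; otherwise we use the
 * precomputed multiply-by-reciprocal parameters from
 * util_compute_fast_udiv_info (pre-shift, 32x32 multiply taking the high
 * 32 bits, optional increment, post-shift).
 */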
431 void
432 iris_math_div32_gpr0(struct iris_context *ice,
433 struct iris_batch *batch,
434 uint32_t D)
435 {
436 /* Zero out the top of GPR0 */
437 emit_lri32(batch, CS_GPR(0) + 4, 0);
438
439 if (D == 0) {
440 /* Dividing by zero is invalid, but we should do something, so set GPR0 to 0. */
441 emit_lri32(batch, CS_GPR(0), 0);
442 } else if (util_is_power_of_two_or_zero(D)) {
443 unsigned log2_D = util_logbase2(D);
444 assert(log2_D < 32);
445 /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
446 * the top 32 bits of the result.
447 */
448 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
449 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
450 emit_lri32(batch, CS_GPR(0) + 4, 0);
451 } else {
452 struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
453 assert(m.multiplier <= UINT32_MAX);
454
455 if (m.pre_shift) {
456 /* We right-shift by m.pre_shift by left-shifting by 32 - m.pre_shift and
457 * taking the top 32 bits of the result.
458 */
459 if (m.pre_shift < 32)
460 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
461 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
462 emit_lri32(batch, CS_GPR(0) + 4, 0);
463 }
464
465 /* Do the 32x32 multiply into gpr0 */
466 emit_mul_gpr0(batch, m.multiplier);
467
468 if (m.increment) {
469 /* The "increment" case needs (GPR0 + 1) * multiplier, which is the same
 * as GPR0 * multiplier + multiplier, so we just add the multiplier to the
 * 64-bit product computed above. */
470 emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
471 emit_lri32(batch, CS_GPR(1) + 4, 0);
472 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
473 }
474
475 /* Shift by 32 */
476 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
477 emit_lri32(batch, CS_GPR(0) + 4, 0);
478
479 if (m.post_shift) {
480 /* We right-shift by m.post_shift by left-shifting by 32 - m.post_shift and
481 * taking the top 32 bits of the result.
482 */
483 if (m.post_shift < 32)
484 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
485 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
486 emit_lri32(batch, CS_GPR(0) + 4, 0);
487 }
488 }
489 }
490
491 void
492 iris_math_add32_gpr0(struct iris_context *ice,
493 struct iris_batch *batch,
494 uint32_t x)
495 {
496 emit_lri32(batch, CS_GPR(1), x);
497 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
498 }
499
500 /*
501 * GPR0 = (GPR0 == 0) ? 0 : 1;
502 */
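/* Adding zero to GPR0 sets the ALU's zero flag; ZF reads back as all ones
 * when set, so STOREINV puts ~0 in GPR0 exactly when it was nonzero, and
 * the final AND with GPR1 (preloaded with 1) leaves a clean 0 or 1.
 */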
503 static void
504 gpr0_to_bool(struct iris_context *ice)
505 {
506 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
507
508 ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
509
510 static const uint32_t math[] = {
511 MI_MATH | (9 - 2),
512 MI_ALU2(LOAD, SRCA, R0),
513 MI_ALU1(LOAD0, SRCB),
514 MI_ALU0(ADD),
515 MI_ALU2(STOREINV, R0, ZF),
516 MI_ALU2(LOAD, SRCA, R0),
517 MI_ALU2(LOAD, SRCB, R1),
518 MI_ALU0(AND),
519 MI_ALU2(STORE, R0, ACCU),
520 };
521 iris_batch_emit(batch, math, sizeof(math));
522 }
523
524 static void
525 load_overflow_data_to_cs_gprs(struct iris_context *ice,
526 struct iris_query *q,
527 int idx)
528 {
529 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
530 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
531 uint32_t offset = q->query_state_ref.offset;
532
533 ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo, offset +
534 offsetof(struct iris_query_so_overflow,
535 stream[idx].prim_storage_needed[0]));
536 ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo, offset +
537 offsetof(struct iris_query_so_overflow,
538 stream[idx].prim_storage_needed[1]));
539
540 ice->vtbl.load_register_mem64(batch, CS_GPR(3), bo, offset +
541 offsetof(struct iris_query_so_overflow,
542 stream[idx].num_prims[0]));
543 ice->vtbl.load_register_mem64(batch, CS_GPR(4), bo, offset +
544 offsetof(struct iris_query_so_overflow,
545 stream[idx].num_prims[1]));
546 }
547
548 /*
549 * R3 = R4 - R3;
550 * R1 = R2 - R1;
551 * R1 = R3 - R1;
552 * R0 = R0 | R1;
553 */
554 static void
555 calc_overflow_for_stream(struct iris_context *ice)
556 {
557 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
558 static const uint32_t maths[] = {
559 MI_MATH | (17 - 2),
560 MI_ALU2(LOAD, SRCA, R4),
561 MI_ALU2(LOAD, SRCB, R3),
562 MI_ALU0(SUB),
563 MI_ALU2(STORE, R3, ACCU),
564 MI_ALU2(LOAD, SRCA, R2),
565 MI_ALU2(LOAD, SRCB, R1),
566 MI_ALU0(SUB),
567 MI_ALU2(STORE, R1, ACCU),
568 MI_ALU2(LOAD, SRCA, R3),
569 MI_ALU2(LOAD, SRCB, R1),
570 MI_ALU0(SUB),
571 MI_ALU2(STORE, R1, ACCU),
572 MI_ALU2(LOAD, SRCA, R1),
573 MI_ALU2(LOAD, SRCB, R0),
574 MI_ALU0(OR),
575 MI_ALU2(STORE, R0, ACCU),
576 };
577
578 iris_batch_emit(batch, maths, sizeof(maths));
579 }
580
581 static void
582 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
583 {
584 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
585
586 ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
587
588 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
589 load_overflow_data_to_cs_gprs(ice, q, q->index);
590 calc_overflow_for_stream(ice);
591 } else {
592 for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
593 load_overflow_data_to_cs_gprs(ice, q, i);
594 calc_overflow_for_stream(ice);
595 }
596 }
597
598 gpr0_to_bool(ice);
599 }
600
601 /*
602 * GPR0 = GPR0 & ((1ull << n) -1);
603 */
604 static void
605 keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
606 {
607 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
608
609 ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
610 static const uint32_t math[] = {
611 MI_MATH | (5 - 2),
612 MI_ALU2(LOAD, SRCA, R0),
613 MI_ALU2(LOAD, SRCB, R1),
614 MI_ALU0(AND),
615 MI_ALU2(STORE, R0, ACCU),
616 };
617 iris_batch_emit(batch, math, sizeof(math));
618 }
619
620 /*
621 * GPR0 = GPR0 << 30;
622 */
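/* Emitted as five MI_MATH packets of six self-additions each (5 * 6 = 30
 * doublings).  GPR0 is first masked to its low 34 bits so the shifted value
 * still fits in the 64-bit register.
 */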
623 static void
624 shl_gpr0_by_30_bits(struct iris_context *ice)
625 {
626 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
627 /* First we mask 34 bits of GPR0 to prevent overflow */
628 keep_gpr0_lower_n_bits(ice, 34);
629
630 static const uint32_t shl_math[] = {
631 MI_ALU2(LOAD, SRCA, R0),
632 MI_ALU2(LOAD, SRCB, R0),
633 MI_ALU0(ADD),
634 MI_ALU2(STORE, R0, ACCU),
635 };
636
637 const uint32_t outer_count = 5;
638 const uint32_t inner_count = 6;
639 const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
640 const uint32_t batch_len = cmd_len * outer_count;
641 uint32_t *map = iris_get_command_space(batch, batch_len * 4);
642 uint32_t offset = 0;
643 for (int o = 0; o < outer_count; o++) {
644 map[offset++] = MI_MATH | (cmd_len - 2);
645 for (int i = 0; i < inner_count; i++) {
646 memcpy(&map[offset], shl_math, sizeof(shl_math));
647 offset += 4;
648 }
649 }
650 }
651
652 /*
653 * GPR0 = GPR0 >> 2;
654 *
655 * Note that the upper 30 bits of GPR0 are lost!
656 */
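/* There is no right-shift either, so we shift left by 30 and then move the
 * upper dword of GPR0 into the lower dword (load_register_reg32 takes the
 * destination first, as in iris_math_div32_gpr0 above), zeroing the upper
 * half afterwards.
 */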
657 static void
658 shr_gpr0_by_2_bits(struct iris_context *ice)
659 {
660 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
661 shl_gpr0_by_30_bits(ice);
662 ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
663 ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
664 }
665
666 /**
667 * Calculate the result and store it to CS_GPR0.
668 */
669 static void
670 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
671 {
672 struct iris_batch *batch = &ice->batches[q->batch_idx];
673 const struct gen_device_info *devinfo = &batch->screen->devinfo;
674 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
675 uint32_t offset = q->query_state_ref.offset;
676
677 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
678 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
679 overflow_result_to_gpr0(ice, q);
680 return;
681 }
682
683 if (q->type == PIPE_QUERY_TIMESTAMP) {
684 ice->vtbl.load_register_mem64(batch, CS_GPR(0), bo,
685 offset +
686 offsetof(struct iris_query_snapshots, start));
687 /* TODO: This discards any fractional bits of the timebase scale.
688 * We would need to do a bit of fixed point math on the CS ALU, or
689 * launch an actual shader to calculate this with full precision.
690 */
691 emit_mul_gpr0(batch, (1000000000ull / devinfo->timestamp_frequency));
692 keep_gpr0_lower_n_bits(ice, 36);
693 return;
694 }
695
696 ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo,
697 offset +
698 offsetof(struct iris_query_snapshots, start));
699 ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo,
700 offset +
701 offsetof(struct iris_query_snapshots, end));
702
703 static const uint32_t math[] = {
704 MI_MATH | (5 - 2),
705 MI_ALU2(LOAD, SRCA, R2),
706 MI_ALU2(LOAD, SRCB, R1),
707 MI_ALU0(SUB),
708 MI_ALU2(STORE, R0, ACCU),
709 };
710 iris_batch_emit(batch, math, sizeof(math));
711
712 /* WaDividePSInvocationCountBy4:HSW,BDW */
713 if (devinfo->gen == 8 &&
714 q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
715 q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
716 shr_gpr0_by_2_bits(ice);
717
718 if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
719 q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
720 gpr0_to_bool(ice);
721
722 if (q->type == PIPE_QUERY_TIME_ELAPSED) {
723 /* TODO: This discards fractional bits (see above). */
724 emit_mul_gpr0(batch, (1000000000ull / devinfo->timestamp_frequency));
725 }
726 }
727
728 static struct pipe_query *
729 iris_create_query(struct pipe_context *ctx,
730 unsigned query_type,
731 unsigned index)
732 {
733 struct iris_query *q = calloc(1, sizeof(struct iris_query));
734
735 q->type = query_type;
736 q->index = index;
737
738 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
739 q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
740 q->batch_idx = IRIS_BATCH_COMPUTE;
741 else
742 q->batch_idx = IRIS_BATCH_RENDER;
743 return (struct pipe_query *) q;
744 }
745
746 static void
747 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
748 {
749 struct iris_query *query = (void *) p_query;
750 struct iris_screen *screen = (void *) ctx->screen;
751 iris_syncpt_reference(screen, &query->syncpt, NULL);
752 free(query);
753 }
754
755
756 static boolean
757 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
758 {
759 struct iris_context *ice = (void *) ctx;
760 struct iris_query *q = (void *) query;
761 void *ptr = NULL;
762 uint32_t size;
763
764 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
765 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
766 size = sizeof(struct iris_query_so_overflow);
767 else
768 size = sizeof(struct iris_query_snapshots);
769
770 u_upload_alloc(ice->query_buffer_uploader, 0,
771 size, size, &q->query_state_ref.offset,
772 &q->query_state_ref.res, &ptr);
773
774 if (!iris_resource_bo(q->query_state_ref.res))
775 return false;
776
777 q->map = ptr;
778 if (!q->map)
779 return false;
780
781 q->result = 0ull;
782 q->ready = false;
783 WRITE_ONCE(q->map->snapshots_landed, false);
784
785 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
786 ice->state.prims_generated_query_active = true;
787 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
788 }
789
790 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
791 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
792 write_overflow_values(ice, q, false);
793 else
794 write_value(ice, q,
795 q->query_state_ref.offset +
796 offsetof(struct iris_query_snapshots, start));
797
798 return true;
799 }
800
801 static bool
802 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
803 {
804 struct iris_context *ice = (void *) ctx;
805 struct iris_query *q = (void *) query;
806 struct iris_batch *batch = &ice->batches[q->batch_idx];
807
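/* Timestamp queries have no begin; take the single snapshot now, at end
 * time, reusing iris_begin_query() to allocate the buffer and write it.
 */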
808 if (q->type == PIPE_QUERY_TIMESTAMP) {
809 iris_begin_query(ctx, query);
810 iris_batch_reference_signal_syncpt(batch, &q->syncpt);
811 mark_available(ice, q);
812 return true;
813 }
814
815 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
816 ice->state.prims_generated_query_active = false;
817 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
818 }
819
820 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
821 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
822 write_overflow_values(ice, q, true);
823 else
824 write_value(ice, q,
825 q->query_state_ref.offset +
826 offsetof(struct iris_query_snapshots, end));
827
828 iris_batch_reference_signal_syncpt(batch, &q->syncpt);
829 mark_available(ice, q);
830
831 return true;
832 }
833
834 /**
835 * See if the snapshots have landed for a query, and if so, compute the
836 * result and mark it ready. Does not flush (unlike iris_get_query_result).
837 */
838 static void
839 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
840 {
841 struct iris_screen *screen = (void *) ice->ctx.screen;
842 const struct gen_device_info *devinfo = &screen->devinfo;
843
844 if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
845 calculate_result_on_cpu(devinfo, q);
846 }
847 }
848
849 static boolean
850 iris_get_query_result(struct pipe_context *ctx,
851 struct pipe_query *query,
852 boolean wait,
853 union pipe_query_result *result)
854 {
855 struct iris_context *ice = (void *) ctx;
856 struct iris_query *q = (void *) query;
857 struct iris_screen *screen = (void *) ctx->screen;
858 const struct gen_device_info *devinfo = &screen->devinfo;
859
860 if (unlikely(screen->no_hw)) {
861 result->u64 = 0;
862 return true;
863 }
864
865 if (!q->ready) {
866 struct iris_batch *batch = &ice->batches[q->batch_idx];
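/* If the query's commands are still sitting in the open batch (its syncpt
 * is the batch's signalling syncpt), flush so the snapshots can land.
 */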
867 if (q->syncpt == iris_batch_get_signal_syncpt(batch))
868 iris_batch_flush(batch);
869
870 while (!READ_ONCE(q->map->snapshots_landed)) {
871 if (wait)
872 iris_wait_syncpt(ctx->screen, q->syncpt, INT64_MAX);
873 else
874 return false;
875 }
876
877 assert(READ_ONCE(q->map->snapshots_landed));
878 calculate_result_on_cpu(devinfo, q);
879 }
880
881 assert(q->ready);
882
883 result->u64 = q->result;
884
885 return true;
886 }
887
888 static void
889 iris_get_query_result_resource(struct pipe_context *ctx,
890 struct pipe_query *query,
891 boolean wait,
892 enum pipe_query_value_type result_type,
893 int index,
894 struct pipe_resource *p_res,
895 unsigned offset)
896 {
897 struct iris_context *ice = (void *) ctx;
898 struct iris_query *q = (void *) query;
899 struct iris_batch *batch = &ice->batches[q->batch_idx];
900 const struct gen_device_info *devinfo = &batch->screen->devinfo;
901 struct iris_resource *res = (void *) p_res;
902 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
903 unsigned snapshots_landed_offset =
904 offsetof(struct iris_query_snapshots, snapshots_landed);
905
906 res->bind_history |= PIPE_BIND_QUERY_BUFFER;
907
908 if (index == -1) {
909 /* They're asking for the availability of the result. If we still
910 * have commands queued up which produce the result, submit them
911 * now so that progress happens. Either way, copy the snapshots
912 * landed field to the destination resource.
913 */
914 if (q->syncpt == iris_batch_get_signal_syncpt(batch))
915 iris_batch_flush(batch);
916
917 ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
918 bo, snapshots_landed_offset,
919 result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
920 return;
921 }
922
923 if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
924 /* The final snapshots happen to have landed, so let's just compute
925 * the result on the CPU now...
926 */
927 calculate_result_on_cpu(devinfo, q);
928 }
929
930 if (q->ready) {
931 /* We happen to have the result on the CPU, so just copy it. */
932 if (result_type <= PIPE_QUERY_TYPE_U32) {
933 ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
934 q->result);
935 } else {
936 ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
937 q->result);
938 }
939
940 /* Make sure the result lands before they bind the QBO elsewhere
941 * and use the result.
942 */
943 // XXX: Why? i965 doesn't do this.
944 iris_emit_pipe_control_flush(batch,
945 "query: unknown QBO flushing hack",
946 PIPE_CONTROL_CS_STALL);
947 return;
948 }
949
950 /* Calculate the result to CS_GPR0 */
951 calculate_result_on_gpu(ice, q);
952
953 bool predicated = !wait && !q->stalled;
954
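/* If we are neither waiting for the result nor guaranteed it has landed,
 * predicate the register write below on snapshots_landed being nonzero, so
 * the destination is left untouched if the results are not available yet.
 */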
955 if (predicated) {
956 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
957 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
958 snapshots_landed_offset);
959 uint32_t predicate = MI_PREDICATE |
960 MI_PREDICATE_LOADOP_LOADINV |
961 MI_PREDICATE_COMBINEOP_SET |
962 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
963 iris_batch_emit(batch, &predicate, sizeof(uint32_t));
964 }
965
966 if (result_type <= PIPE_QUERY_TYPE_U32) {
967 ice->vtbl.store_register_mem32(batch, CS_GPR(0),
968 iris_resource_bo(p_res),
969 offset, predicated);
970 } else {
971 ice->vtbl.store_register_mem64(batch, CS_GPR(0),
972 iris_resource_bo(p_res),
973 offset, predicated);
974 }
975 }
976
977 static void
978 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
979 {
980 struct iris_context *ice = (void *) ctx;
981
982 if (ice->state.statistics_counters_enabled == enable)
983 return;
984
985 // XXX: most packets aren't paying attention to this yet, because it'd
986 // have to be done dynamically at draw time, which is a pain
987 ice->state.statistics_counters_enabled = enable;
988 ice->state.dirty |= IRIS_DIRTY_CLIP |
989 IRIS_DIRTY_GS |
990 IRIS_DIRTY_RASTER |
991 IRIS_DIRTY_STREAMOUT |
992 IRIS_DIRTY_TCS |
993 IRIS_DIRTY_TES |
994 IRIS_DIRTY_VS |
995 IRIS_DIRTY_WM;
996 }
997
998 static void
999 set_predicate_enable(struct iris_context *ice, bool value)
1000 {
1001 if (value)
1002 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1003 else
1004 ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
1005 }
1006
1007 static void
1008 set_predicate_for_result(struct iris_context *ice,
1009 struct iris_query *q,
1010 bool inverted)
1011 {
1012 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
1013 struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
1014
1015 /* The CPU doesn't have the query result yet; use hardware predication */
1016 ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
1017
1018 /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
1019 iris_emit_pipe_control_flush(batch,
1020 "conditional rendering: set predicate",
1021 PIPE_CONTROL_FLUSH_ENABLE);
1022 q->stalled = true;
1023
1024 switch (q->type) {
1025 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1026 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1027 overflow_result_to_gpr0(ice, q);
1028
1029 ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
1030 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
1031 break;
1032 default:
1033 /* PIPE_QUERY_OCCLUSION_* */
1034 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
1035 offsetof(struct iris_query_snapshots, start) +
1036 q->query_state_ref.offset);
1037 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
1038 offsetof(struct iris_query_snapshots, end) +
1039 q->query_state_ref.offset);
1040 break;
1041 }
1042
1043 uint32_t mi_predicate = MI_PREDICATE |
1044 MI_PREDICATE_COMBINEOP_SET |
1045 MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
1046 (inverted ? MI_PREDICATE_LOADOP_LOAD
1047 : MI_PREDICATE_LOADOP_LOADINV);
1048 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
1049
1050 /* We immediately set the predicate on the render batch, as all the
1051 * counters come from 3D operations. However, we may need to predicate
1052 * a compute dispatch, which executes in a different GEM context and has
1053 * a different MI_PREDICATE_RESULT register. So, we save the result to
1054 * memory and reload it in iris_launch_grid.
1055 */
1056 unsigned offset = q->query_state_ref.offset +
1057 offsetof(struct iris_query_snapshots, predicate_result);
1058 ice->vtbl.store_register_mem64(batch, MI_PREDICATE_RESULT,
1059 bo, offset, false);
1060 ice->state.compute_predicate = bo;
1061 }
1062
1063 static void
1064 iris_render_condition(struct pipe_context *ctx,
1065 struct pipe_query *query,
1066 boolean condition,
1067 enum pipe_render_cond_flag mode)
1068 {
1069 struct iris_context *ice = (void *) ctx;
1070 struct iris_query *q = (void *) query;
1071
1072 /* The old condition isn't relevant; we'll update it if necessary */
1073 ice->state.compute_predicate = NULL;
1074 ice->condition.query = q;
1075 ice->condition.condition = condition;
1076
1077 if (!q) {
1078 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1079 return;
1080 }
1081
1082 iris_check_query_no_flush(ice, q);
1083
1084 if (q->result || q->ready) {
1085 set_predicate_enable(ice, (q->result != 0) ^ condition);
1086 } else {
1087 if (mode == PIPE_RENDER_COND_NO_WAIT ||
1088 mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
1089 perf_debug(&ice->dbg, "Conditional rendering demoted from "
1090 "\"no wait\" to \"wait\".");
1091 }
1092 set_predicate_for_result(ice, q, condition);
1093 }
1094 }
1095
1096 void
1097 iris_resolve_conditional_render(struct iris_context *ice)
1098 {
1099 struct pipe_context *ctx = (void *) ice;
1100 struct iris_query *q = ice->condition.query;
1101 struct pipe_query *query = (void *) q;
1102 union pipe_query_result result;
1103
1104 if (ice->state.predicate != IRIS_PREDICATE_STATE_USE_BIT)
1105 return;
1106
1107 assert(q);
1108
1109 iris_get_query_result(ctx, query, true, &result);
1110 set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);
1111 }
1112
1113 void
1114 iris_init_query_functions(struct pipe_context *ctx)
1115 {
1116 ctx->create_query = iris_create_query;
1117 ctx->destroy_query = iris_destroy_query;
1118 ctx->begin_query = iris_begin_query;
1119 ctx->end_query = iris_end_query;
1120 ctx->get_query_result = iris_get_query_result;
1121 ctx->get_query_result_resource = iris_get_query_result_resource;
1122 ctx->set_active_query_state = iris_set_active_query_state;
1123 ctx->render_condition = iris_render_condition;
1124 }