src/gallium/drivers/iris/iris_query.c

   1 /*
   2  * Copyright © 2017 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included
  12  * in all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  20  * DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 /**
  24  * @file iris_query.c
  25  *
  26  * Query object support.  This allows measuring various simple statistics
  27  * via counters on the GPU.
  28  */
  29
  30 #include <stdio.h>
  31 #include <errno.h>
  32 #include "pipe/p_defines.h"
  33 #include "pipe/p_state.h"
  34 #include "pipe/p_context.h"
  35 #include "pipe/p_screen.h"
  36 #include "util/fast_idiv_by_const.h"
  37 #include "util/u_inlines.h"
  38 #include "util/u_upload_mgr.h"
  39 #include "iris_context.h"
  40 #include "iris_defines.h"
  41 #include "iris_fence.h"
  42 #include "iris_resource.h"
  43 #include "iris_screen.h"
  44 #include "vulkan/util/vk_util.h"
  45
  46 #define IA_VERTICES_COUNT          0x2310
  47 #define IA_PRIMITIVES_COUNT        0x2318
  48 #define VS_INVOCATION_COUNT        0x2320
  49 #define HS_INVOCATION_COUNT        0x2300
  50 #define DS_INVOCATION_COUNT        0x2308
  51 #define GS_INVOCATION_COUNT        0x2328
  52 #define GS_PRIMITIVES_COUNT        0x2330
  53 #define CL_INVOCATION_COUNT        0x2338
  54 #define CL_PRIMITIVES_COUNT        0x2340
  55 #define PS_INVOCATION_COUNT        0x2348
  56 #define CS_INVOCATION_COUNT        0x2290
  57 #define PS_DEPTH_COUNT             0x2350
  58
  59 #define SO_PRIM_STORAGE_NEEDED(n)  (0x5240 + (n) * 8)
  60
  61 #define SO_NUM_PRIMS_WRITTEN(n)    (0x5200 + (n) * 8)
  62
  63 #define MI_MATH (0x1a << 23)
  64
  65 #define MI_ALU_LOAD      0x080
  66 #define MI_ALU_LOADINV   0x480
  67 #define MI_ALU_LOAD0     0x081
  68 #define MI_ALU_LOAD1     0x481
  69 #define MI_ALU_ADD       0x100
  70 #define MI_ALU_SUB       0x101
  71 #define MI_ALU_AND       0x102
  72 #define MI_ALU_OR        0x103
  73 #define MI_ALU_XOR       0x104
  74 #define MI_ALU_STORE     0x180
  75 #define MI_ALU_STOREINV  0x580
  76
  77 #define MI_ALU_SRCA      0x20
  78 #define MI_ALU_SRCB      0x21
  79 #define MI_ALU_ACCU      0x31
  80 #define MI_ALU_ZF        0x32
  81 #define MI_ALU_CF        0x33
  82
  83 #define emit_lri32 ice->vtbl.load_register_imm32
  84 #define emit_lri64 ice->vtbl.load_register_imm64
  85 #define emit_lrr32 ice->vtbl.load_register_reg32
  86
  87 struct iris_query {
  88    enum pipe_query_type type;
  89    int index;
  90
  91    bool ready;
  92
  93    bool stalled;
  94
  95    uint64_t result;
  96
  97    struct iris_state_ref query_state_ref;
  98    struct iris_query_snapshots *map;
  99    struct iris_syncpt *syncpt;
 100
 101    int batch_idx;
 102 };
 103
 104 struct iris_query_snapshots {
 105    /** iris_render_condition's saved MI_PREDICATE_RESULT value. */
 106    uint64_t predicate_result;
 107
 108    /** Have the start/end snapshots landed? */
 109    uint64_t snapshots_landed;
 110
 111    /** Starting and ending counter snapshots */
 112    uint64_t start;
 113    uint64_t end;
 114 };
 115
 116 struct iris_query_so_overflow {
 117    uint64_t predicate_result;
 118    uint64_t snapshots_landed;
 119
 120    struct {
 121       uint64_t prim_storage_needed[2];
 122       uint64_t num_prims[2];
 123    } stream[4];
 124 };
 125
 126 /**
 127  * Is this type of query written by PIPE_CONTROL?
 128  */
 129 static bool
 130 iris_is_query_pipelined(struct iris_query *q)
 131 {
 132    switch (q->type) {
 133    case PIPE_QUERY_OCCLUSION_COUNTER:
 134    case PIPE_QUERY_OCCLUSION_PREDICATE:
 135    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
 136    case PIPE_QUERY_TIMESTAMP:
 137    case PIPE_QUERY_TIMESTAMP_DISJOINT:
 138    case PIPE_QUERY_TIME_ELAPSED:
 139       return true;
 140
 141    default:
 142       return false;
 143    }
 144 }
 145
 146 static void
 147 mark_available(struct iris_context *ice, struct iris_query *q)
 148 {
 149    struct iris_batch *batch = &ice->batches[q->batch_idx];
 150    unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
 151    unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
 152    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
 153    offset += q->query_state_ref.offset;
 154
 155    if (!iris_is_query_pipelined(q)) {
 156       ice->vtbl.store_data_imm64(batch, bo, offset, true);
 157    } else {
 158       /* Order available *after* the query results. */
 159       flags |= PIPE_CONTROL_FLUSH_ENABLE;
 160       iris_emit_pipe_control_write(batch, flags, bo, offset, true);
 161    }
 162 }
 163
 164 /**
 165  * Write PS_DEPTH_COUNT to q->(dest) via a PIPE_CONTROL.
 166  */
 167 static void
 168 iris_pipelined_write(struct iris_batch *batch,
 169                      struct iris_query *q,
 170                      enum pipe_control_flags flags,
 171                      unsigned offset)
 172 {
 173    const struct gen_device_info *devinfo = &batch->screen->devinfo;
 174    const unsigned optional_cs_stall =
 175       devinfo->gen == 9 && devinfo->gt == 4 ?  PIPE_CONTROL_CS_STALL : 0;
 176    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
 177
 178    iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
 179                                 bo, offset, 0ull);
 180 }
 181
 182 static void
 183 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
 184 {
 185    struct iris_batch *batch = &ice->batches[q->batch_idx];
 186    const struct gen_device_info *devinfo = &batch->screen->devinfo;
 187    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
 188
 189    if (!iris_is_query_pipelined(q)) {
 190       iris_emit_pipe_control_flush(batch,
 191                                    PIPE_CONTROL_CS_STALL |
 192                                    PIPE_CONTROL_STALL_AT_SCOREBOARD);
 193       q->stalled = true;
 194    }
 195
 196    switch (q->type) {
 197    case PIPE_QUERY_OCCLUSION_COUNTER:
 198    case PIPE_QUERY_OCCLUSION_PREDICATE:
 199    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
 200       if (devinfo->gen >= 10) {
 201          /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
 202           *  bit set prior to programming a PIPE_CONTROL with Write PS Depth
 203           *  Count sync operation."
 204           */
 205          iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
 206       }
 207       iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
 208                            PIPE_CONTROL_WRITE_DEPTH_COUNT |
 209                            PIPE_CONTROL_DEPTH_STALL,
 210                            offset);
 211       break;
 212    case PIPE_QUERY_TIME_ELAPSED:
 213    case PIPE_QUERY_TIMESTAMP:
 214    case PIPE_QUERY_TIMESTAMP_DISJOINT:
 215       iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
 216                            PIPE_CONTROL_WRITE_TIMESTAMP,
 217                            offset);
 218       break;
 219    case PIPE_QUERY_PRIMITIVES_GENERATED:
 220       ice->vtbl.store_register_mem64(batch,
 221                                      q->index == 0 ? CL_INVOCATION_COUNT :
 222                                      SO_PRIM_STORAGE_NEEDED(q->index),
 223                                      bo, offset, false);
 224       break;
 225    case PIPE_QUERY_PRIMITIVES_EMITTED:
 226       ice->vtbl.store_register_mem64(batch,
 227                                      SO_NUM_PRIMS_WRITTEN(q->index),
 228                                      bo, offset, false);
 229       break;
 230    case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
 231       static const uint32_t index_to_reg[] = {
 232          IA_VERTICES_COUNT,
 233          IA_PRIMITIVES_COUNT,
 234          VS_INVOCATION_COUNT,
 235          GS_INVOCATION_COUNT,
 236          GS_PRIMITIVES_COUNT,
 237          CL_INVOCATION_COUNT,
 238          CL_PRIMITIVES_COUNT,
 239          PS_INVOCATION_COUNT,
 240          HS_INVOCATION_COUNT,
 241          DS_INVOCATION_COUNT,
 242          CS_INVOCATION_COUNT,
 243       };
 244       const uint32_t reg = index_to_reg[q->index];
 245
 246       ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
 247       break;
 248    }
 249    default:
 250       assert(false);
 251    }
 252 }
 253
 254 static void
 255 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
 256 {
 257    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
 258    uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
 259    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
 260    uint32_t offset = q->query_state_ref.offset;
 261
 262    iris_emit_pipe_control_flush(batch,
 263                                 PIPE_CONTROL_CS_STALL |
 264                                 PIPE_CONTROL_STALL_AT_SCOREBOARD);
 265    for (uint32_t i = 0; i < count; i++) {
 266       int s = q->index + i;
 267       int g_idx = offset + offsetof(struct iris_query_so_overflow,
 268                            stream[s].num_prims[end]);
 269       int w_idx = offset + offsetof(struct iris_query_so_overflow,
 270                            stream[s].prim_storage_needed[end]);
 271       ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
 272                                      bo, g_idx, false);
 273       ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
 274                                      bo, w_idx, false);
 275    }
 276 }
 277
 278 uint64_t
 279 iris_timebase_scale(const struct gen_device_info *devinfo,
 280                     uint64_t gpu_timestamp)
 281 {
 282    return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
 283 }
 284
 285 static uint64_t
 286 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
 287 {
 288    if (time0 > time1) {
 289       return (1ULL << TIMESTAMP_BITS) + time1 - time0;
 290    } else {
 291       return time1 - time0;
 292    }
 293 }
 294
 295 static bool
 296 stream_overflowed(struct iris_query_so_overflow *so, int s)
 297 {
 298    return (so->stream[s].prim_storage_needed[1] -
 299            so->stream[s].prim_storage_needed[0]) !=
 300           (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
 301 }
 302
 303 static void
 304 calculate_result_on_cpu(const struct gen_device_info *devinfo,
 305                         struct iris_query *q)
 306 {
 307    switch (q->type) {
 308    case PIPE_QUERY_OCCLUSION_PREDICATE:
 309    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
 310       q->result = q->map->end != q->map->start;
 311       break;
 312    case PIPE_QUERY_TIMESTAMP:
 313    case PIPE_QUERY_TIMESTAMP_DISJOINT:
 314       /* The timestamp is the single starting snapshot. */
 315       q->result = iris_timebase_scale(devinfo, q->map->start);
 316       q->result &= (1ull << TIMESTAMP_BITS) - 1;
 317       break;
 318    case PIPE_QUERY_TIME_ELAPSED:
 319       q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
 320       q->result = iris_timebase_scale(devinfo, q->result);
 321       q->result &= (1ull << TIMESTAMP_BITS) - 1;
 322       break;
 323    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
 324       q->result = stream_overflowed((void *) q->map, q->index);
 325       break;
 326    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
 327       q->result = false;
 328       for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
 329          q->result |= stream_overflowed((void *) q->map, i);
 330       break;
 331    case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
 332       q->result = q->map->end - q->map->start;
 333
 334       /* WaDividePSInvocationCountBy4:HSW,BDW */
 335       if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
 336          q->result /= 4;
 337       break;
 338    case PIPE_QUERY_OCCLUSION_COUNTER:
 339    case PIPE_QUERY_PRIMITIVES_GENERATED:
 340    case PIPE_QUERY_PRIMITIVES_EMITTED:
 341    default:
 342       q->result = q->map->end - q->map->start;
 343       break;
 344    }
 345
 346    q->ready = true;
 347 }
 348
 349 static void
 350 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
 351              unsigned reg_a, unsigned reg_b)
 352 {
 353    uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
 354
 355    math[0] = MI_MATH | (5 - 2);
 356    math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
 357    math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
 358    math[3] = _MI_ALU0(ADD);
 359    math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
 360 }
 361
 362 static void
 363 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
 364              unsigned src_reg, unsigned shift)
 365 {
 366    assert(shift > 0);
 367
 368    int dwords = 1 + 4 * shift;
 369
 370    uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
 371
 372    math[0] = MI_MATH | ((1 + 4 * shift) - 2);
 373
 374    for (unsigned i = 0; i < shift; i++) {
 375       unsigned add_src = (i == 0) ? src_reg : dst_reg;
 376       math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
 377       math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
 378       math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
 379       math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
 380    }
 381 }
 382
 383 /* Emit dwords to multiply GPR0 by N */
 384 static void
 385 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
 386 {
 387    VK_OUTARRAY_MAKE(out, dw, dw_count);
 388
 389 #define APPEND_ALU(op, x, y) \
 390    vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
 391
 392    assert(N > 0);
 393    unsigned top_bit = 31 - __builtin_clz(N);
 394    for (int i = top_bit - 1; i >= 0; i--) {
 395       /* We get our initial data in GPR0 and we write the final data out to
 396        * GPR0 but we use GPR1 as our scratch register.
 397        */
 398       unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
 399       unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
 400
 401       /* Shift the current value left by 1 */
 402       APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
 403       APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
 404       APPEND_ALU(ADD, 0, 0);
 405
 406       if (N & (1 << i)) {
 407          /* Store ACCU to R1 and add R0 to R1 */
 408          APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
 409          APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
 410          APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
 411          APPEND_ALU(ADD, 0, 0);
 412       }
 413
 414       APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
 415    }
 416
 417 #undef APPEND_ALU
 418 }
 419
 420 static void
 421 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
 422 {
 423    uint32_t num_dwords;
 424    build_alu_multiply_gpr0(NULL, &num_dwords, N);
 425
 426    uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
 427    math[0] = MI_MATH | (num_dwords - 2);
 428    build_alu_multiply_gpr0(&math[1], &num_dwords, N);
 429 }
 430
 431 void
 432 iris_math_div32_gpr0(struct iris_context *ice,
 433                      struct iris_batch *batch,
 434                      uint32_t D)
 435 {
 436    /* Zero out the top of GPR0 */
 437    emit_lri32(batch, CS_GPR(0) + 4, 0);
 438
 439    if (D == 0) {
 440       /* This invalid, but we should do something so we set GPR0 to 0. */
 441       emit_lri32(batch, CS_GPR(0), 0);
 442    } else if (util_is_power_of_two_or_zero(D)) {
 443       unsigned log2_D = util_logbase2(D);
 444       assert(log2_D < 32);
 445       /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
 446        * the top 32 bits of the result.
 447        */
 448       emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
 449       emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
 450       emit_lri32(batch, CS_GPR(0) + 4, 0);
 451    } else {
 452       struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
 453       assert(m.multiplier <= UINT32_MAX);
 454
 455       if (m.pre_shift) {
 456          /* We right-shift by L by left-shifting by 32 - l and taking the top
 457           * 32 bits of the result.
 458           */
 459          if (m.pre_shift < 32)
 460             emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
 461          emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
 462          emit_lri32(batch, CS_GPR(0) + 4, 0);
 463       }
 464
 465       /* Do the 32x32 multiply into gpr0 */
 466       emit_mul_gpr0(batch, m.multiplier);
 467
 468       if (m.increment) {
 469          /* If we need to increment, save off a copy of GPR0 */
 470          emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
 471          emit_lri32(batch, CS_GPR(1) + 4, 0);
 472          emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
 473       }
 474
 475       /* Shift by 32 */
 476       emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
 477       emit_lri32(batch, CS_GPR(0) + 4, 0);
 478
 479       if (m.post_shift) {
 480          /* We right-shift by L by left-shifting by 32 - l and taking the top
 481           * 32 bits of the result.
 482           */
 483          if (m.post_shift < 32)
 484             emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
 485          emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
 486          emit_lri32(batch, CS_GPR(0) + 4, 0);
 487       }
 488    }
 489 }
 490
 491 void
 492 iris_math_add32_gpr0(struct iris_context *ice,
 493                      struct iris_batch *batch,
 494                      uint32_t x)
 495 {
 496    emit_lri32(batch, CS_GPR(1), x);
 497    emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
 498 }
 499
 500 /*
 501  * GPR0 = (GPR0 == 0) ? 0 : 1;
 502  */
 503 static void
 504 gpr0_to_bool(struct iris_context *ice)
 505 {
 506    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
 507
 508    ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
 509
 510    static const uint32_t math[] = {
 511       MI_MATH | (9 - 2),
 512       MI_ALU2(LOAD, SRCA, R0),
 513       MI_ALU1(LOAD0, SRCB),
 514       MI_ALU0(ADD),
 515       MI_ALU2(STOREINV, R0, ZF),
 516       MI_ALU2(LOAD, SRCA, R0),
 517       MI_ALU2(LOAD, SRCB, R1),
 518       MI_ALU0(AND),
 519       MI_ALU2(STORE, R0, ACCU),
 520    };
 521    iris_batch_emit(batch, math, sizeof(math));
 522 }
 523
 524 static void
 525 load_overflow_data_to_cs_gprs(struct iris_context *ice,
 526                               struct iris_query *q,
 527                               int idx)
 528 {
 529    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
 530    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
 531    uint32_t offset = q->query_state_ref.offset;
 532
 533    ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo, offset +
 534                                  offsetof(struct iris_query_so_overflow,
 535                                           stream[idx].prim_storage_needed[0]));
 536    ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo, offset +
 537                                  offsetof(struct iris_query_so_overflow,
 538                                           stream[idx].prim_storage_needed[1]));
 539
 540    ice->vtbl.load_register_mem64(batch, CS_GPR(3), bo, offset +
 541                                  offsetof(struct iris_query_so_overflow,
 542                                           stream[idx].num_prims[0]));
 543    ice->vtbl.load_register_mem64(batch, CS_GPR(4), bo, offset +
 544                                  offsetof(struct iris_query_so_overflow,
 545                                           stream[idx].num_prims[1]));
 546 }
 547
 548 /*
 549  * R3 = R4 - R3;
 550  * R1 = R2 - R1;
 551  * R1 = R3 - R1;
 552  * R0 = R0 | R1;
 553  */
 554 static void
 555 calc_overflow_for_stream(struct iris_context *ice)
 556 {
 557    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
 558    static const uint32_t maths[] = {
 559       MI_MATH | (17 - 2),
 560       MI_ALU2(LOAD, SRCA, R4),
 561       MI_ALU2(LOAD, SRCB, R3),
 562       MI_ALU0(SUB),
 563       MI_ALU2(STORE, R3, ACCU),
 564       MI_ALU2(LOAD, SRCA, R2),
 565       MI_ALU2(LOAD, SRCB, R1),
 566       MI_ALU0(SUB),
 567       MI_ALU2(STORE, R1, ACCU),
 568       MI_ALU2(LOAD, SRCA, R3),
 569       MI_ALU2(LOAD, SRCB, R1),
 570       MI_ALU0(SUB),
 571       MI_ALU2(STORE, R1, ACCU),
 572       MI_ALU2(LOAD, SRCA, R1),
 573       MI_ALU2(LOAD, SRCB, R0),
 574       MI_ALU0(OR),
 575       MI_ALU2(STORE, R0, ACCU),
 576    };
 577
 578    iris_batch_emit(batch, maths, sizeof(maths));
 579 }
 580
 581 static void
 582 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
 583 {
 584    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
 585
 586    ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
 587
 588    if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
 589       load_overflow_data_to_cs_gprs(ice, q, q->index);
 590       calc_overflow_for_stream(ice);
 591    } else {
 592       for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
 593          load_overflow_data_to_cs_gprs(ice, q, i);
 594          calc_overflow_for_stream(ice);
 595       }
 596    }
 597
 598    gpr0_to_bool(ice);
 599 }
 600
 601 /*
 602  * GPR0 = GPR0 & ((1ull << n) -1);
 603  */
 604 static void
 605 keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
 606 {
 607    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
 608
 609    ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
 610    static const uint32_t math[] = {
 611       MI_MATH | (5 - 2),
 612       MI_ALU2(LOAD, SRCA, R0),
 613       MI_ALU2(LOAD, SRCB, R1),
 614       MI_ALU0(AND),
 615       MI_ALU2(STORE, R0, ACCU),
 616    };
 617    iris_batch_emit(batch, math, sizeof(math));
 618 }
 619
 620 /*
 621  * GPR0 = GPR0 << 30;
 622  */
 623 static void
 624 shl_gpr0_by_30_bits(struct iris_context *ice)
 625 {
 626    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
 627    /* First we mask 34 bits of GPR0 to prevent overflow */
 628    keep_gpr0_lower_n_bits(ice, 34);
 629
 630    static const uint32_t shl_math[] = {
 631       MI_ALU2(LOAD, SRCA, R0),
 632       MI_ALU2(LOAD, SRCB, R0),
 633       MI_ALU0(ADD),
 634       MI_ALU2(STORE, R0, ACCU),
 635    };
 636
 637    const uint32_t outer_count = 5;
 638    const uint32_t inner_count = 6;
 639    const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
 640    const uint32_t batch_len = cmd_len * outer_count;
 641    uint32_t *map = iris_get_command_space(batch, batch_len * 4);
 642    uint32_t offset = 0;
 643    for (int o = 0; o < outer_count; o++) {
 644       map[offset++] = MI_MATH | (cmd_len - 2);
 645       for (int i = 0; i < inner_count; i++) {
 646          memcpy(&map[offset], shl_math, sizeof(shl_math));
 647          offset += 4;
 648       }
 649    }
 650 }
 651
 652 /*
 653  * GPR0 = GPR0 >> 2;
 654  *
 655  * Note that the upper 30 bits of GPR0 are lost!
 656  */
 657 static void
 658 shr_gpr0_by_2_bits(struct iris_context *ice)
 659 {
 660    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
 661    shl_gpr0_by_30_bits(ice);
 662    ice->vtbl.load_register_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
 663    ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
 664 }
 665
 666 /**
 667  * Calculate the result and store it to CS_GPR0.
 668  */
 669 static void
 670 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
 671 {
 672    struct iris_batch *batch = &ice->batches[q->batch_idx];
 673    struct iris_screen *screen = (void *) ice->ctx.screen;
 674    const struct gen_device_info *devinfo = &batch->screen->devinfo;
 675    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
 676    uint32_t offset = q->query_state_ref.offset;
 677
 678    if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
 679        q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
 680       overflow_result_to_gpr0(ice, q);
 681       return;
 682    }
 683
 684    if (q->type == PIPE_QUERY_TIMESTAMP) {
 685       ice->vtbl.load_register_mem64(batch, CS_GPR(0), bo,
 686                                     offset +
 687                                     offsetof(struct iris_query_snapshots, start));
 688       /* TODO: This discards any fractional bits of the timebase scale.
 689        * We would need to do a bit of fixed point math on the CS ALU, or
 690        * launch an actual shader to calculate this with full precision.
 691        */
 692       emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
 693       keep_gpr0_lower_n_bits(ice, 36);
 694       return;
 695    }
 696
 697    ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo,
 698                                  offset +
 699                                  offsetof(struct iris_query_snapshots, start));
 700    ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo,
 701                                  offset +
 702                                  offsetof(struct iris_query_snapshots, end));
 703
 704    static const uint32_t math[] = {
 705       MI_MATH | (5 - 2),
 706       MI_ALU2(LOAD, SRCA, R2),
 707       MI_ALU2(LOAD, SRCB, R1),
 708       MI_ALU0(SUB),
 709       MI_ALU2(STORE, R0, ACCU),
 710    };
 711    iris_batch_emit(batch, math, sizeof(math));
 712
 713    /* WaDividePSInvocationCountBy4:HSW,BDW */
 714    if (devinfo->gen == 8 &&
 715        q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
 716        q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
 717       shr_gpr0_by_2_bits(ice);
 718
 719    if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
 720        q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
 721       gpr0_to_bool(ice);
 722
 723    if (q->type == PIPE_QUERY_TIME_ELAPSED) {
 724       /* TODO: This discards fractional bits (see above). */
 725       emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
 726    }
 727 }
 728
 729 static struct pipe_query *
 730 iris_create_query(struct pipe_context *ctx,
 731                   unsigned query_type,
 732                   unsigned index)
 733 {
 734    struct iris_query *q = calloc(1, sizeof(struct iris_query));
 735
 736    q->type = query_type;
 737    q->index = index;
 738
 739    if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
 740        q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
 741       q->batch_idx = IRIS_BATCH_COMPUTE;
 742    else
 743       q->batch_idx = IRIS_BATCH_RENDER;
 744    return (struct pipe_query *) q;
 745 }
 746
 747 static void
 748 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
 749 {
 750    struct iris_query *query = (void *) p_query;
 751    struct iris_screen *screen = (void *) ctx->screen;
 752    iris_syncpt_reference(screen, &query->syncpt, NULL);
 753    free(query);
 754 }
 755
 756
 757 static boolean
 758 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
 759 {
 760    struct iris_context *ice = (void *) ctx;
 761    struct iris_query *q = (void *) query;
 762    void *ptr = NULL;
 763    uint32_t size;
 764
 765    if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
 766        q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
 767       size = sizeof(struct iris_query_so_overflow);
 768    else
 769       size = sizeof(struct iris_query_snapshots);
 770
 771    u_upload_alloc(ice->query_buffer_uploader, 0,
 772                   size, size, &q->query_state_ref.offset,
 773                   &q->query_state_ref.res, &ptr);
 774
 775    if (!iris_resource_bo(q->query_state_ref.res))
 776       return false;
 777
 778    q->map = ptr;
 779    if (!q->map)
 780       return false;
 781
 782    q->result = 0ull;
 783    q->ready = false;
 784    WRITE_ONCE(q->map->snapshots_landed, false);
 785
 786    if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
 787       ice->state.prims_generated_query_active = true;
 788       ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
 789    }
 790
 791    if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
 792        q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
 793       write_overflow_values(ice, q, false);
 794    else
 795       write_value(ice, q,
 796                   q->query_state_ref.offset +
 797                   offsetof(struct iris_query_snapshots, start));
 798
 799    return true;
 800 }
 801
 802 static bool
 803 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
 804 {
 805    struct iris_context *ice = (void *) ctx;
 806    struct iris_query *q = (void *) query;
 807    struct iris_batch *batch = &ice->batches[q->batch_idx];
 808
 809    if (q->type == PIPE_QUERY_TIMESTAMP) {
 810       iris_begin_query(ctx, query);
 811       iris_batch_reference_signal_syncpt(batch, &q->syncpt);
 812       mark_available(ice, q);
 813       return true;
 814    }
 815
 816    if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
 817       ice->state.prims_generated_query_active = false;
 818       ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
 819    }
 820
 821    if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
 822        q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
 823       write_overflow_values(ice, q, true);
 824    else
 825       write_value(ice, q,
 826                   q->query_state_ref.offset +
 827                   offsetof(struct iris_query_snapshots, end));
 828
 829    iris_batch_reference_signal_syncpt(batch, &q->syncpt);
 830    mark_available(ice, q);
 831
 832    return true;
 833 }
 834
 835 /**
 836  * See if the snapshots have landed for a query, and if so, compute the
 837  * result and mark it ready.  Does not flush (unlike iris_get_query_result).
 838  */
 839 static void
 840 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
 841 {
 842    struct iris_screen *screen = (void *) ice->ctx.screen;
 843    const struct gen_device_info *devinfo = &screen->devinfo;
 844
 845    if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
 846       calculate_result_on_cpu(devinfo, q);
 847    }
 848 }
 849
 850 static boolean
 851 iris_get_query_result(struct pipe_context *ctx,
 852                       struct pipe_query *query,
 853                       boolean wait,
 854                       union pipe_query_result *result)
 855 {
 856    struct iris_context *ice = (void *) ctx;
 857    struct iris_query *q = (void *) query;
 858    struct iris_screen *screen = (void *) ctx->screen;
 859    const struct gen_device_info *devinfo = &screen->devinfo;
 860    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
 861
 862    if (unlikely(screen->no_hw)) {
 863       result->u64 = 0;
 864       return true;
 865    }
 866
 867    if (!q->ready) {
 868       if (iris_batch_references(&ice->batches[q->batch_idx], bo))
 869          iris_batch_flush(&ice->batches[q->batch_idx]);
 870
 871       while (!READ_ONCE(q->map->snapshots_landed)) {
 872          if (wait)
 873             iris_wait_syncpt(ctx->screen, q->syncpt, INT64_MAX);
 874          else
 875             return false;
 876       }
 877
 878       assert(READ_ONCE(q->map->snapshots_landed));
 879       calculate_result_on_cpu(devinfo, q);
 880    }
 881
 882    assert(q->ready);
 883
 884    result->u64 = q->result;
 885
 886    return true;
 887 }
 888
 889 static void
 890 iris_get_query_result_resource(struct pipe_context *ctx,
 891                                struct pipe_query *query,
 892                                boolean wait,
 893                                enum pipe_query_value_type result_type,
 894                                int index,
 895                                struct pipe_resource *p_res,
 896                                unsigned offset)
 897 {
 898    struct iris_context *ice = (void *) ctx;
 899    struct iris_query *q = (void *) query;
 900    struct iris_batch *batch = &ice->batches[q->batch_idx];
 901    const struct gen_device_info *devinfo = &batch->screen->devinfo;
 902    struct iris_resource *res = (void *) p_res;
 903    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
 904    unsigned snapshots_landed_offset =
 905       offsetof(struct iris_query_snapshots, snapshots_landed);
 906
 907    res->bind_history |= PIPE_BIND_QUERY_BUFFER;
 908
 909    if (index == -1) {
 910       /* They're asking for the availability of the result.  If we still
 911        * have commands queued up which produce the result, submit them
 912        * now so that progress happens.  Either way, copy the snapshots
 913        * landed field to the destination resource.
 914        */
 915       if (iris_batch_references(batch, bo))
 916          iris_batch_flush(batch);
 917
 918       ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
 919                              bo, snapshots_landed_offset,
 920                              result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
 921       return;
 922    }
 923
 924    if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
 925       /* The final snapshots happen to have landed, so let's just compute
 926        * the result on the CPU now...
 927        */
 928       calculate_result_on_cpu(devinfo, q);
 929    }
 930
 931    if (q->ready) {
 932       /* We happen to have the result on the CPU, so just copy it. */
 933       if (result_type <= PIPE_QUERY_TYPE_U32) {
 934          ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
 935                                     q->result);
 936       } else {
 937          ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
 938                                     q->result);
 939       }
 940
 941       /* Make sure the result lands before they use bind the QBO elsewhere
 942        * and use the result.
 943        */
 944       // XXX: Why?  i965 doesn't do this.
 945       iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
 946       return;
 947    }
 948
 949    /* Calculate the result to CS_GPR0 */
 950    calculate_result_on_gpu(ice, q);
 951
 952    bool predicated = !wait && !q->stalled;
 953
 954    if (predicated) {
 955       ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
 956       ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
 957                                     snapshots_landed_offset);
 958       uint32_t predicate = MI_PREDICATE |
 959                            MI_PREDICATE_LOADOP_LOADINV |
 960                            MI_PREDICATE_COMBINEOP_SET |
 961                            MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
 962       iris_batch_emit(batch, &predicate, sizeof(uint32_t));
 963    }
 964
 965    if (result_type <= PIPE_QUERY_TYPE_U32) {
 966       ice->vtbl.store_register_mem32(batch, CS_GPR(0),
 967                                      iris_resource_bo(p_res),
 968                                      offset, predicated);
 969    } else {
 970       ice->vtbl.store_register_mem64(batch, CS_GPR(0),
 971                                      iris_resource_bo(p_res),
 972                                      offset, predicated);
 973    }
 974 }
 975
 976 static void
 977 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
 978 {
 979    struct iris_context *ice = (void *) ctx;
 980
 981    if (ice->state.statistics_counters_enabled == enable)
 982       return;
 983
 984    // XXX: most packets aren't paying attention to this yet, because it'd
 985    // have to be done dynamically at draw time, which is a pain
 986    ice->state.statistics_counters_enabled = enable;
 987    ice->state.dirty |= IRIS_DIRTY_CLIP |
 988                        IRIS_DIRTY_GS |
 989                        IRIS_DIRTY_RASTER |
 990                        IRIS_DIRTY_STREAMOUT |
 991                        IRIS_DIRTY_TCS |
 992                        IRIS_DIRTY_TES |
 993                        IRIS_DIRTY_VS |
 994                        IRIS_DIRTY_WM;
 995 }
 996
 997 static void
 998 set_predicate_enable(struct iris_context *ice, bool value)
 999 {
1000    if (value)
1001       ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1002    else
1003       ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
1004 }
1005
1006 static void
1007 set_predicate_for_result(struct iris_context *ice,
1008                          struct iris_query *q,
1009                          bool inverted)
1010 {
1011    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
1012    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
1013
1014    /* The CPU doesn't have the query result yet; use hardware predication */
1015    ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
1016
1017    /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
1018    iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
1019    q->stalled = true;
1020
1021    switch (q->type) {
1022    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1023    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1024       overflow_result_to_gpr0(ice, q);
1025
1026       ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
1027       ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
1028       break;
1029    default:
1030       /* PIPE_QUERY_OCCLUSION_* */
1031       ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
1032          offsetof(struct iris_query_snapshots, start) +
1033          q->query_state_ref.offset);
1034       ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
1035          offsetof(struct iris_query_snapshots, end) +
1036          q->query_state_ref.offset);
1037       break;
1038    }
1039
1040    uint32_t mi_predicate = MI_PREDICATE |
1041                            MI_PREDICATE_COMBINEOP_SET |
1042                            MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
1043                            (inverted ? MI_PREDICATE_LOADOP_LOAD
1044                                      : MI_PREDICATE_LOADOP_LOADINV);
1045    iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
1046
1047    /* We immediately set the predicate on the render batch, as all the
1048     * counters come from 3D operations.  However, we may need to predicate
1049     * a compute dispatch, which executes in a different GEM context and has
1050     * a different MI_PREDICATE_RESULT register.  So, we save the result to
1051     * memory and reload it in iris_launch_grid.
1052     */
1053    unsigned offset = q->query_state_ref.offset +
1054                      offsetof(struct iris_query_snapshots, predicate_result);
1055    ice->vtbl.store_register_mem64(batch, MI_PREDICATE_RESULT,
1056                                   bo, offset, false);
1057    ice->state.compute_predicate = bo;
1058 }
1059
1060 static void
1061 iris_render_condition(struct pipe_context *ctx,
1062                       struct pipe_query *query,
1063                       boolean condition,
1064                       enum pipe_render_cond_flag mode)
1065 {
1066    struct iris_context *ice = (void *) ctx;
1067    struct iris_query *q = (void *) query;
1068
1069    /* The old condition isn't relevant; we'll update it if necessary */
1070    ice->state.compute_predicate = NULL;
1071    ice->condition.query = q;
1072    ice->condition.condition = condition;
1073
1074    if (!q) {
1075       ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1076       return;
1077    }
1078
1079    iris_check_query_no_flush(ice, q);
1080
1081    if (q->result || q->ready) {
1082       set_predicate_enable(ice, (q->result != 0) ^ condition);
1083    } else {
1084       if (mode == PIPE_RENDER_COND_NO_WAIT ||
1085           mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
1086          perf_debug(&ice->dbg, "Conditional rendering demoted from "
1087                     "\"no wait\" to \"wait\".");
1088       }
1089       set_predicate_for_result(ice, q, condition);
1090    }
1091 }
1092
1093 void
1094 iris_resolve_conditional_render(struct iris_context *ice)
1095 {
1096    struct pipe_context *ctx = (void *) ice;
1097    struct iris_query *q = ice->condition.query;
1098    struct pipe_query *query = (void *) q;
1099    union pipe_query_result result;
1100
1101    if (ice->state.predicate != IRIS_PREDICATE_STATE_USE_BIT)
1102       return;
1103
1104    assert(q);
1105
1106    iris_get_query_result(ctx, query, true, &result);
1107    set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);
1108 }
1109
1110 void
1111 iris_init_query_functions(struct pipe_context *ctx)
1112 {
1113    ctx->create_query = iris_create_query;
1114    ctx->destroy_query = iris_destroy_query;
1115    ctx->begin_query = iris_begin_query;
1116    ctx->end_query = iris_end_query;
1117    ctx->get_query_result = iris_get_query_result;
1118    ctx->get_query_result_resource = iris_get_query_result_resource;
1119    ctx->set_active_query_state = iris_set_active_query_state;
1120    ctx->render_condition = iris_render_condition;
1121 }