iris: Implement DrawTransformFeedback()
[mesa.git] / src / gallium / drivers / iris / iris_query.c
1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_query.c
25 *
26 * Query object support. This allows measuring various simple statistics
27 * via counters on the GPU, and also implements conditional rendering.
28 */
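/* As a rough illustration of how these hooks get driven (the exact call
 * sequence depends on the gallium frontend), a typical occlusion query
 * looks like:
 *
 *    struct pipe_query *q = ctx->create_query(ctx, PIPE_QUERY_OCCLUSION_COUNTER, 0);
 *    ctx->begin_query(ctx, q);
 *    ... draw calls ...
 *    ctx->end_query(ctx, q);
 *    union pipe_query_result result;
 *    ctx->get_query_result(ctx, q, true, &result);
 *
 * begin/end snapshot a hardware counter into the query's BO; get_query_result
 * (or get_query_result_resource) then turns the two snapshots into one value.
 */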
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "iris_context.h"
39 #include "iris_defines.h"
40 #include "iris_resource.h"
41 #include "iris_screen.h"
42 #include "vulkan/util/vk_util.h"
43
44 #define IA_VERTICES_COUNT 0x2310
45 #define IA_PRIMITIVES_COUNT 0x2318
46 #define VS_INVOCATION_COUNT 0x2320
47 #define HS_INVOCATION_COUNT 0x2300
48 #define DS_INVOCATION_COUNT 0x2308
49 #define GS_INVOCATION_COUNT 0x2328
50 #define GS_PRIMITIVES_COUNT 0x2330
51 #define CL_INVOCATION_COUNT 0x2338
52 #define CL_PRIMITIVES_COUNT 0x2340
53 #define PS_INVOCATION_COUNT 0x2348
54 #define CS_INVOCATION_COUNT 0x2290
55 #define PS_DEPTH_COUNT 0x2350
56
57 #define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
58
59 #define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
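/* MMIO offsets of the statistics and stream-output counter registers above;
 * query snapshots are taken by storing one of these registers to the query
 * BO (ice->vtbl.store_register_mem64, i.e. MI_STORE_REGISTER_MEM).
 */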
60
61 #define MI_MATH (0x1a << 23)
62
63 #define MI_ALU_LOAD 0x080
64 #define MI_ALU_LOADINV 0x480
65 #define MI_ALU_LOAD0 0x081
66 #define MI_ALU_LOAD1 0x481
67 #define MI_ALU_ADD 0x100
68 #define MI_ALU_SUB 0x101
69 #define MI_ALU_AND 0x102
70 #define MI_ALU_OR 0x103
71 #define MI_ALU_XOR 0x104
72 #define MI_ALU_STORE 0x180
73 #define MI_ALU_STOREINV 0x580
74
75 #define MI_ALU_R0 0x00
76 #define MI_ALU_R1 0x01
77 #define MI_ALU_R2 0x02
78 #define MI_ALU_R3 0x03
79 #define MI_ALU_R4 0x04
80 #define MI_ALU_SRCA 0x20
81 #define MI_ALU_SRCB 0x21
82 #define MI_ALU_ACCU 0x31
83 #define MI_ALU_ZF 0x32
84 #define MI_ALU_CF 0x33
85
86 #define _MI_ALU(op, x, y) (((op) << 20) | ((x) << 10) | (y))
87
88 #define _MI_ALU0(op) _MI_ALU(MI_ALU_##op, 0, 0)
89 #define _MI_ALU1(op, x) _MI_ALU(MI_ALU_##op, x, 0)
90 #define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)
91
92 #define MI_ALU0(op) _MI_ALU0(op)
93 #define MI_ALU1(op, x) _MI_ALU1(op, MI_ALU_##x)
94 #define MI_ALU2(op, x, y) _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
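/* Each MI_MATH ALU instruction is a single dword: operation in bits 31:20,
 * operand 1 in bits 19:10, operand 2 in bits 9:0.  For example:
 *
 *    MI_ALU2(LOAD, SRCA, R0) == (0x080 << 20) | (0x20 << 10) | 0x00 == 0x08008000
 *    MI_ALU0(ADD)            == (0x100 << 20)                       == 0x10000000
 */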
95
96 #define emit_lri32 ice->vtbl.load_register_imm32
97 #define emit_lri64 ice->vtbl.load_register_imm64
98 #define emit_lrr32 ice->vtbl.load_register_reg32
99
100 struct iris_query {
101 enum pipe_query_type type;
102 int index;
103
104 bool ready;
105
106 bool stalled;
107
108 uint64_t result;
109
110 struct iris_bo *bo;
111 struct iris_query_snapshots *map;
112
113 int batch_idx;
114 };
115
116 struct iris_query_snapshots {
117 /** iris_render_condition's saved MI_PREDICATE_DATA value. */
118 uint64_t predicate_data;
119
120 /** Have the start/end snapshots landed? */
121 uint64_t snapshots_landed;
122
123 /** Starting and ending counter snapshots */
124 uint64_t start;
125 uint64_t end;
126 };
127
128 struct iris_query_so_overflow {
129 uint64_t predicate_data;
130 uint64_t snapshots_landed;
131
132 struct {
133 uint64_t prim_storage_needed[2];
134 uint64_t num_prims[2];
135 } stream[4];
136 };
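/* A stream-output overflow query compares two counters over the query
 * interval: SO_PRIM_STORAGE_NEEDED (primitives that needed buffer space)
 * and SO_NUM_PRIMS_WRITTEN (primitives actually written).  The stream
 * overflowed iff
 *
 *    (prim_storage_needed[1] - prim_storage_needed[0]) !=
 *    (num_prims[1] - num_prims[0])
 *
 * which stream_overflowed() evaluates on the CPU and
 * calc_overflow_for_stream() evaluates with MI_MATH on the GPU.
 */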
137
138 /**
139 * Is this type of query written by PIPE_CONTROL?
140 */
141 static bool
142 iris_is_query_pipelined(struct iris_query *q)
143 {
144 switch (q->type) {
145 case PIPE_QUERY_OCCLUSION_COUNTER:
146 case PIPE_QUERY_OCCLUSION_PREDICATE:
147 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
148 case PIPE_QUERY_TIMESTAMP:
149 case PIPE_QUERY_TIMESTAMP_DISJOINT:
150 case PIPE_QUERY_TIME_ELAPSED:
151 return true;
152
153 default:
154 return false;
155 }
156 }
157
158 static void
159 mark_available(struct iris_context *ice, struct iris_query *q)
160 {
161 struct iris_batch *batch = &ice->batches[q->batch_idx];
162 unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
163 unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
164
165 if (!iris_is_query_pipelined(q)) {
166 ice->vtbl.store_data_imm64(batch, q->bo, offset, true);
167 } else {
168 /* Order available *after* the query results. */
169 flags |= PIPE_CONTROL_FLUSH_ENABLE;
170 iris_emit_pipe_control_write(batch, flags, q->bo, offset, true);
171 }
172 }
173
174 /**
175 * Write a pipelined counter snapshot to q->bo at the given offset via a PIPE_CONTROL.
176 */
177 static void
178 iris_pipelined_write(struct iris_batch *batch,
179 struct iris_query *q,
180 enum pipe_control_flags flags,
181 unsigned offset)
182 {
183 const struct gen_device_info *devinfo = &batch->screen->devinfo;
184 const unsigned optional_cs_stall =
185 devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
186
187 iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
188 q->bo, offset, 0ull);
189 }
190
191 static void
192 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
193 {
194 struct iris_batch *batch = &ice->batches[q->batch_idx];
195 const struct gen_device_info *devinfo = &batch->screen->devinfo;
196
197 if (!iris_is_query_pipelined(q)) {
198 iris_emit_pipe_control_flush(batch,
199 PIPE_CONTROL_CS_STALL |
200 PIPE_CONTROL_STALL_AT_SCOREBOARD);
201 q->stalled = true;
202 }
203
204 switch (q->type) {
205 case PIPE_QUERY_OCCLUSION_COUNTER:
206 case PIPE_QUERY_OCCLUSION_PREDICATE:
207 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
208 if (devinfo->gen >= 10) {
209 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
210 * bit set prior to programming a PIPE_CONTROL with Write PS Depth
211 * Count sync operation."
212 */
213 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
214 }
215 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
216 PIPE_CONTROL_WRITE_DEPTH_COUNT |
217 PIPE_CONTROL_DEPTH_STALL,
218 offset);
219 break;
220 case PIPE_QUERY_TIME_ELAPSED:
221 case PIPE_QUERY_TIMESTAMP:
222 case PIPE_QUERY_TIMESTAMP_DISJOINT:
223 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
224 PIPE_CONTROL_WRITE_TIMESTAMP,
225 offset);
226 break;
227 case PIPE_QUERY_PRIMITIVES_GENERATED:
228 ice->vtbl.store_register_mem64(batch,
229 q->index == 0 ? CL_INVOCATION_COUNT :
230 SO_PRIM_STORAGE_NEEDED(q->index),
231 q->bo, offset, false);
232 break;
233 case PIPE_QUERY_PRIMITIVES_EMITTED:
234 ice->vtbl.store_register_mem64(batch,
235 SO_NUM_PRIMS_WRITTEN(q->index),
236 q->bo, offset, false);
237 break;
238 case PIPE_QUERY_PIPELINE_STATISTICS: {
239 static const uint32_t index_to_reg[] = {
240 IA_VERTICES_COUNT,
241 IA_PRIMITIVES_COUNT,
242 VS_INVOCATION_COUNT,
243 GS_INVOCATION_COUNT,
244 GS_PRIMITIVES_COUNT,
245 CL_INVOCATION_COUNT,
246 CL_PRIMITIVES_COUNT,
247 PS_INVOCATION_COUNT,
248 HS_INVOCATION_COUNT,
249 DS_INVOCATION_COUNT,
250 CS_INVOCATION_COUNT,
251 };
252 const uint32_t reg = index_to_reg[q->index];
253
254 ice->vtbl.store_register_mem64(batch, reg, q->bo, offset, false);
255 break;
256 }
257 default:
258 assert(false);
259 }
260 }
261
262 static void
263 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
264 {
265 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
266 uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
267
268 iris_emit_pipe_control_flush(batch,
269 PIPE_CONTROL_CS_STALL |
270 PIPE_CONTROL_STALL_AT_SCOREBOARD);
271 for (uint32_t i = 0; i < count; i++) {
272 int s = q->index + i;
273 int g_idx = offsetof(struct iris_query_so_overflow,
274 stream[s].num_prims[end]);
275 int w_idx = offsetof(struct iris_query_so_overflow,
276 stream[s].prim_storage_needed[end]);
277 ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
278 q->bo, g_idx, false);
279 ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
280 q->bo, w_idx, false);
281 }
282 }
283
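/* Convert raw GPU timestamp ticks to nanoseconds.  As a worked example
 * (assuming a 12 MHz timestamp, i.e. timestamp_frequency == 12000000, which
 * is typical of recent Intel parts): 24 ticks scale to
 * 1000000000 * 24 / 12000000 = 2000 ns.
 */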
284 uint64_t
285 iris_timebase_scale(const struct gen_device_info *devinfo,
286 uint64_t gpu_timestamp)
287 {
288 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
289 }
290
291 static uint64_t
292 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
293 {
294 if (time0 > time1) {
295 return (1ULL << TIMESTAMP_BITS) + time1 - time0;
296 } else {
297 return time1 - time0;
298 }
299 }
300
301 static bool
302 stream_overflowed(struct iris_query_so_overflow *so, int s)
303 {
304 return (so->stream[s].prim_storage_needed[1] -
305 so->stream[s].prim_storage_needed[0]) !=
306 (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
307 }
308
309 static void
310 calculate_result_on_cpu(const struct gen_device_info *devinfo,
311 struct iris_query *q)
312 {
313 switch (q->type) {
314 case PIPE_QUERY_OCCLUSION_PREDICATE:
315 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
316 q->result = q->map->end != q->map->start;
317 break;
318 case PIPE_QUERY_TIMESTAMP:
319 case PIPE_QUERY_TIMESTAMP_DISJOINT:
320 /* The timestamp is the single starting snapshot. */
321 q->result = iris_timebase_scale(devinfo, q->map->start);
322 q->result &= (1ull << TIMESTAMP_BITS) - 1;
323 break;
324 case PIPE_QUERY_TIME_ELAPSED:
325 q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
326 q->result = iris_timebase_scale(devinfo, q->result);
327 q->result &= (1ull << TIMESTAMP_BITS) - 1;
328 break;
329 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
330 q->result = stream_overflowed((void *) q->map, q->index);
331 break;
332 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
333 q->result = false;
334 for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
335 q->result |= stream_overflowed((void *) q->map, i);
336 break;
337 case PIPE_QUERY_OCCLUSION_COUNTER:
338 case PIPE_QUERY_PRIMITIVES_GENERATED:
339 case PIPE_QUERY_PRIMITIVES_EMITTED:
340 case PIPE_QUERY_PIPELINE_STATISTICS:
341 default:
342 q->result = q->map->end - q->map->start;
343 break;
344 }
345
346 q->ready = true;
347 }
348
349 static void
350 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
351 unsigned reg_a, unsigned reg_b)
352 {
353 uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
354
355 math[0] = MI_MATH | (5 - 2);
356 math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
357 math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
358 math[3] = _MI_ALU0(ADD);
359 math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
360 }
361
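/* Left-shift a GPR by repeated doubling, since MI_MATH has no shift op:
 * each round adds the value to itself, so e.g. shift == 3 takes
 * x -> 2x -> 4x -> 8x == x << 3.
 */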
362 static void
363 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
364 unsigned src_reg, unsigned shift)
365 {
366 assert(shift > 0);
367
368 int dwords = 1 + 4 * shift;
369
370 uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
371
372 math[0] = MI_MATH | ((1 + 4 * shift) - 2);
373
374 for (unsigned i = 0; i < shift; i++) {
375 unsigned add_src = (i == 0) ? src_reg : dst_reg;
376 math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
377 math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
378 math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
379 math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
380 }
381 }
382
383 /* Emit dwords to multiply GPR0 by N */
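/* This is binary (shift-and-add) multiplication.  For example, N == 6
 * (binary 110, top_bit == 2) unrolls to:
 *
 *    i == 1:  ACCU = R0 + R0 (2x); bit 1 is set, so add R0 again -> R1 = 3x
 *    i == 0:  R0 = R1 + R1 = 6x;   bit 0 is clear, so no extra add
 */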
384 static void
385 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
386 {
387 VK_OUTARRAY_MAKE(out, dw, dw_count);
388
389 #define APPEND_ALU(op, x, y) \
390 vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
391
392 assert(N > 0);
393 unsigned top_bit = 31 - __builtin_clz(N);
394 for (int i = top_bit - 1; i >= 0; i--) {
395 /* We get our initial data in GPR0 and write the final result back to
396 * GPR0, using GPR1 as a scratch register.
397 */
398 unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
399 unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
400
401 /* Shift the current value left by 1 */
402 APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
403 APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
404 APPEND_ALU(ADD, 0, 0);
405
406 if (N & (1 << i)) {
407 /* Store ACCU to R1 and add R0 to R1 */
408 APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
409 APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
410 APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
411 APPEND_ALU(ADD, 0, 0);
412 }
413
414 APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
415 }
416
417 #undef APPEND_ALU
418 }
419
420 static void
421 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
422 {
423 uint32_t num_dwords;
424 build_alu_multiply_gpr0(NULL, &num_dwords, N);
425
426 uint32_t *math = iris_get_command_space(batch, 4 * (num_dwords + 1)); /* ALU dwords + header */
427 math[0] = MI_MATH | ((num_dwords + 1) - 2);
428 build_alu_multiply_gpr0(&math[1], &num_dwords, N);
429 }
430
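/* Compute GPR0 = GPR0 / D (32-bit unsigned) with the usual
 * multiply-by-reciprocal trick, since MI_MATH has no divide.  As an
 * illustrative (not verified) example: for D == 3, util_compute_fast_udiv_info
 * is expected to return multiplier 0xAAAAAAAB and post_shift 1, so the code
 * below effectively computes n / 3 == (n * 0xAAAAAAABull) >> 33.
 */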
431 void
432 iris_math_div32_gpr0(struct iris_context *ice,
433 struct iris_batch *batch,
434 uint32_t D)
435 {
436 /* Zero out the top 32 bits of GPR0 */
437 emit_lri32(batch, CS_GPR(0) + 4, 0);
438
439 if (D == 0) {
440 /* Dividing by zero is invalid, but we should do something, so set GPR0 to 0. */
441 emit_lri32(batch, CS_GPR(0), 0);
442 } else if (util_is_power_of_two_or_zero(D)) {
443 unsigned log2_D = util_logbase2(D);
444 assert(log2_D < 32);
445 /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
446 * the top 32 bits of the result.
447 */
448 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
449 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
450 emit_lri32(batch, CS_GPR(0) + 4, 0);
451 } else {
452 struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
453 assert(m.multiplier <= UINT32_MAX);
454
455 if (m.pre_shift) {
456 /* We right-shift by pre_shift by left-shifting by (32 - pre_shift) and
457 * taking the top 32 bits of the result.
458 */
459 if (m.pre_shift < 32)
460 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
461 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
462 emit_lri32(batch, CS_GPR(0) + 4, 0);
463 }
464
465 /* Do the 32x32 multiply into gpr0 */
466 emit_mul_gpr0(batch, m.multiplier);
467
468 if (m.increment) {
469 /* Increment variant: ((n + 1) * m) == n * m + m, so add the multiplier (via GPR1). */
470 emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
471 emit_lri32(batch, CS_GPR(1) + 4, 0);
472 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
473 }
474
475 /* Shift by 32 */
476 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
477 emit_lri32(batch, CS_GPR(0) + 4, 0);
478
479 if (m.post_shift) {
480 /* We right-shift by post_shift by left-shifting by (32 - post_shift) and
481 * taking the top 32 bits of the result.
482 */
483 if (m.post_shift < 32)
484 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
485 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
486 emit_lri32(batch, CS_GPR(0) + 4, 0);
487 }
488 }
489 }
490
491 /*
492 * GPR0 = (GPR0 == 0) ? 0 : 1;
493 */
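/* (Adding GPR0 to zero sets the ALU zero flag iff GPR0 == 0; STOREINV then
 * writes the complement of ZF to R0, and ANDing with the 1 preloaded into
 * GPR1 clamps the result to exactly 0 or 1.)
 */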
494 static void
495 gpr0_to_bool(struct iris_context *ice)
496 {
497 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
498
499 ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
500
501 static const uint32_t math[] = {
502 MI_MATH | (9 - 2),
503 MI_ALU2(LOAD, SRCA, R0),
504 MI_ALU1(LOAD0, SRCB),
505 MI_ALU0(ADD),
506 MI_ALU2(STOREINV, R0, ZF),
507 MI_ALU2(LOAD, SRCA, R0),
508 MI_ALU2(LOAD, SRCB, R1),
509 MI_ALU0(AND),
510 MI_ALU2(STORE, R0, ACCU),
511 };
512 iris_batch_emit(batch, math, sizeof(math));
513 }
514
515 static void
516 load_overflow_data_to_cs_gprs(struct iris_context *ice,
517 struct iris_query *q,
518 int idx)
519 {
520 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
521
522 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
523 offsetof(struct iris_query_so_overflow,
524 stream[idx].prim_storage_needed[0]));
525 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
526 offsetof(struct iris_query_so_overflow,
527 stream[idx].prim_storage_needed[1]));
528
529 ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
530 offsetof(struct iris_query_so_overflow,
531 stream[idx].num_prims[0]));
532 ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
533 offsetof(struct iris_query_so_overflow,
534 stream[idx].num_prims[1]));
535 }
536
537 /* With (R1, R2) = prim_storage_needed[0, 1] and (R3, R4) = num_prims[0, 1],
538 * as loaded by load_overflow_data_to_cs_gprs(), this computes:
539 *    R3 = R4 - R3;  R1 = R2 - R1;   (written and needed deltas)
540 *    R1 = R3 - R1;                  (their difference)
541 *    R0 = R0 | R1;                  (accumulate; nonzero means overflow)
542 */
543 static void
544 calc_overflow_for_stream(struct iris_context *ice)
545 {
546 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
547 static const uint32_t maths[] = {
548 MI_MATH | (17 - 2),
549 MI_ALU2(LOAD, SRCA, R4),
550 MI_ALU2(LOAD, SRCB, R3),
551 MI_ALU0(SUB),
552 MI_ALU2(STORE, R3, ACCU),
553 MI_ALU2(LOAD, SRCA, R2),
554 MI_ALU2(LOAD, SRCB, R1),
555 MI_ALU0(SUB),
556 MI_ALU2(STORE, R1, ACCU),
557 MI_ALU2(LOAD, SRCA, R3),
558 MI_ALU2(LOAD, SRCB, R1),
559 MI_ALU0(SUB),
560 MI_ALU2(STORE, R1, ACCU),
561 MI_ALU2(LOAD, SRCA, R1),
562 MI_ALU2(LOAD, SRCB, R0),
563 MI_ALU0(OR),
564 MI_ALU2(STORE, R0, ACCU),
565 };
566
567 iris_batch_emit(batch, maths, sizeof(maths));
568 }
569
570 static void
571 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
572 {
573 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
574
575 ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
576
577 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
578 load_overflow_data_to_cs_gprs(ice, q, q->index);
579 calc_overflow_for_stream(ice);
580 } else {
581 for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
582 load_overflow_data_to_cs_gprs(ice, q, i);
583 calc_overflow_for_stream(ice);
584 }
585 }
586
587 gpr0_to_bool(ice);
588 }
589
590 /**
591 * Calculate the result and store it to CS_GPR0.
592 */
593 static void
594 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
595 {
596 struct iris_batch *batch = &ice->batches[q->batch_idx];
597
598 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
599 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
600 overflow_result_to_gpr0(ice, q);
601 return;
602 }
603
604 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
605 offsetof(struct iris_query_snapshots, start));
606 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
607 offsetof(struct iris_query_snapshots, end));
608
609 static const uint32_t math[] = {
610 MI_MATH | (5 - 2),
611 MI_ALU2(LOAD, SRCA, R2),
612 MI_ALU2(LOAD, SRCB, R1),
613 MI_ALU0(SUB),
614 MI_ALU2(STORE, R0, ACCU),
615 };
616 iris_batch_emit(batch, math, sizeof(math));
617
618 if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
619 q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
620 gpr0_to_bool(ice);
621 }
622
623 static struct pipe_query *
624 iris_create_query(struct pipe_context *ctx,
625 unsigned query_type,
626 unsigned index)
627 {
628 struct iris_query *q = calloc(1, sizeof(struct iris_query));
629
630 q->type = query_type;
631 q->index = index;
632
633 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS && q->index == 10)
634 q->batch_idx = IRIS_BATCH_COMPUTE;
635 else
636 q->batch_idx = IRIS_BATCH_RENDER;
637 return (struct pipe_query *) q;
638 }
639
640 static void
641 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
642 {
643 struct iris_query *query = (void *) p_query;
644 iris_bo_unreference(query->bo);
645 free(query);
646 }
647
648
649 static boolean
650 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
651 {
652 struct iris_screen *screen = (void *) ctx->screen;
653 struct iris_context *ice = (void *) ctx;
654 struct iris_query *q = (void *) query;
655
656 iris_bo_unreference(q->bo);
657 q->bo = iris_bo_alloc(screen->bufmgr, "query object", 4096,
658 IRIS_MEMZONE_OTHER);
659 if (!q->bo)
660 return false;
661
662 q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_WRITE | MAP_ASYNC);
663 if (!q->map)
664 return false;
665
666 q->result = 0ull;
667 q->ready = false;
668 q->map->snapshots_landed = false;
669
670 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
671 ice->state.prims_generated_query_active = true;
672 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
673 }
674
675 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
676 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
677 write_overflow_values(ice, q, false);
678 else
679 write_value(ice, q, offsetof(struct iris_query_snapshots, start));
680
681 return true;
682 }
683
684 static bool
685 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
686 {
687 struct iris_context *ice = (void *) ctx;
688 struct iris_query *q = (void *) query;
689
690 if (q->type == PIPE_QUERY_TIMESTAMP) {
691 iris_begin_query(ctx, query);
692 mark_available(ice, q);
693 return true;
694 }
695
696 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
697 ice->state.prims_generated_query_active = false;
698 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
699 }
700
701 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
702 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
703 write_overflow_values(ice, q, true);
704 else
705 write_value(ice, q, offsetof(struct iris_query_snapshots, end));
706 mark_available(ice, q);
707
708 return true;
709 }
710
711 /**
712 * See if the snapshots have landed for a query, and if so, compute the
713 * result and mark it ready. Does not flush (unlike iris_get_query_result).
714 */
715 static void
716 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
717 {
718 struct iris_screen *screen = (void *) ice->ctx.screen;
719 const struct gen_device_info *devinfo = &screen->devinfo;
720
721 if (!q->ready && q->map->snapshots_landed) {
722 calculate_result_on_cpu(devinfo, q);
723 }
724 }
725
726 static boolean
727 iris_get_query_result(struct pipe_context *ctx,
728 struct pipe_query *query,
729 boolean wait,
730 union pipe_query_result *result)
731 {
732 struct iris_context *ice = (void *) ctx;
733 struct iris_query *q = (void *) query;
734 struct iris_screen *screen = (void *) ctx->screen;
735 const struct gen_device_info *devinfo = &screen->devinfo;
736
737 if (!q->ready) {
738 if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
739 iris_batch_flush(&ice->batches[q->batch_idx]);
740
741 if (!q->map->snapshots_landed) {
742 if (wait)
743 iris_bo_wait_rendering(q->bo);
744 else
745 return false;
746 }
747
748 assert(q->map->snapshots_landed);
749 calculate_result_on_cpu(devinfo, q);
750 }
751
752 assert(q->ready);
753
754 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS) {
755 switch (q->index) {
756 case 0:
757 result->pipeline_statistics.ia_vertices = q->result;
758 break;
759 case 1:
760 result->pipeline_statistics.ia_primitives = q->result;
761 break;
762 case 2:
763 result->pipeline_statistics.vs_invocations = q->result;
764 break;
765 case 3:
766 result->pipeline_statistics.gs_invocations = q->result;
767 break;
768 case 4:
769 result->pipeline_statistics.gs_primitives = q->result;
770 break;
771 case 5:
772 result->pipeline_statistics.c_invocations = q->result;
773 break;
774 case 6:
775 result->pipeline_statistics.c_primitives = q->result;
776 break;
777 case 7:
778 result->pipeline_statistics.ps_invocations = q->result;
779 break;
780 case 8:
781 result->pipeline_statistics.hs_invocations = q->result;
782 break;
783 case 9:
784 result->pipeline_statistics.ds_invocations = q->result;
785 break;
786 case 10:
787 result->pipeline_statistics.cs_invocations = q->result;
788 break;
789 }
790 } else {
791 result->u64 = q->result;
792 }
793
794 return true;
795 }
796
797 static void
798 iris_get_query_result_resource(struct pipe_context *ctx,
799 struct pipe_query *query,
800 boolean wait,
801 enum pipe_query_value_type result_type,
802 int index,
803 struct pipe_resource *p_res,
804 unsigned offset)
805 {
806 struct iris_context *ice = (void *) ctx;
807 struct iris_query *q = (void *) query;
808 struct iris_batch *batch = &ice->batches[q->batch_idx];
809 const struct gen_device_info *devinfo = &batch->screen->devinfo;
810 struct iris_resource *res = (void *) p_res;
811 unsigned snapshots_landed_offset =
812 offsetof(struct iris_query_snapshots, snapshots_landed);
813
814 res->bind_history |= PIPE_BIND_QUERY_BUFFER;
815
816 if (index == -1) {
817 /* They're asking for the availability of the result. If we still
818 * have commands queued up which produce the result, submit them
819 * now so that progress happens. Either way, copy the snapshots
820 * landed field to the destination resource.
821 */
822 if (iris_batch_references(batch, q->bo))
823 iris_batch_flush(batch);
824
825 ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
826 q->bo, snapshots_landed_offset,
827 result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
828 return;
829 }
830
831 if (!q->ready && q->map->snapshots_landed) {
832 /* The final snapshots happen to have landed, so let's just compute
833 * the result on the CPU now...
834 */
835 calculate_result_on_cpu(devinfo, q);
836 }
837
838 if (q->ready) {
839 /* We happen to have the result on the CPU, so just copy it. */
840 if (result_type <= PIPE_QUERY_TYPE_U32) {
841 ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
842 q->result);
843 } else {
844 ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
845 q->result);
846 }
847
848 /* Make sure the result lands before they bind the QBO elsewhere
849 * and use the result.
850 */
851 // XXX: Why? i965 doesn't do this.
852 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
853 return;
854 }
855
856 /* Calculate the result to CS_GPR0 */
857 calculate_result_on_gpu(ice, q);
858
859 bool predicated = !wait && !q->stalled;
860
861 if (predicated) {
862 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
863 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
864 snapshots_landed_offset);
865 uint32_t predicate = MI_PREDICATE |
866 MI_PREDICATE_LOADOP_LOADINV |
867 MI_PREDICATE_COMBINEOP_SET |
868 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
869 iris_batch_emit(batch, &predicate, sizeof(uint32_t));
870 }
871
872 if (result_type <= PIPE_QUERY_TYPE_U32) {
873 ice->vtbl.store_register_mem32(batch, CS_GPR(0),
874 iris_resource_bo(p_res),
875 offset, predicated);
876 } else {
877 ice->vtbl.store_register_mem64(batch, CS_GPR(0),
878 iris_resource_bo(p_res),
879 offset, predicated);
880 }
881 }
882
883 static void
884 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
885 {
886 struct iris_context *ice = (void *) ctx;
887
888 if (ice->state.statistics_counters_enabled == enable)
889 return;
890
891 // XXX: most packets aren't paying attention to this yet, because it'd
892 // have to be done dynamically at draw time, which is a pain
893 ice->state.statistics_counters_enabled = enable;
894 ice->state.dirty |= IRIS_DIRTY_CLIP |
895 IRIS_DIRTY_GS |
896 IRIS_DIRTY_RASTER |
897 IRIS_DIRTY_STREAMOUT |
898 IRIS_DIRTY_TCS |
899 IRIS_DIRTY_TES |
900 IRIS_DIRTY_VS |
901 IRIS_DIRTY_WM;
902 }
903
904 static void
905 set_predicate_enable(struct iris_context *ice, bool value)
906 {
907 if (value)
908 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
909 else
910 ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
911 }
912
913 static void
914 set_predicate_for_result(struct iris_context *ice,
915 struct iris_query *q,
916 bool inverted)
917 {
918 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
919
920 /* The CPU doesn't have the query result yet; use hardware predication */
921 ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
922
923 /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
924 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
925 q->stalled = true;
926
927 switch (q->type) {
928 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
929 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
930 overflow_result_to_gpr0(ice, q);
931
932 ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
933 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
934 break;
935 default:
936 /* PIPE_QUERY_OCCLUSION_* */
937 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
938 offsetof(struct iris_query_snapshots, start));
939 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
940 offsetof(struct iris_query_snapshots, end));
941 break;
942 }
943
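/* Roughly: COMPAREOP_SRCS_EQUAL tests SRC0 == SRC1 (for occlusion queries,
 * start == end, i.e. a zero result; for overflow queries, result == 0), and
 * choosing LOADOP_LOAD vs. LOADINV flips that test so the render condition's
 * 'inverted' flag is honored.
 */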
944 uint32_t mi_predicate = MI_PREDICATE |
945 MI_PREDICATE_COMBINEOP_SET |
946 MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
947 (inverted ? MI_PREDICATE_LOADOP_LOAD
948 : MI_PREDICATE_LOADOP_LOADINV);
949 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
950
951 /* We immediately set the predicate on the render batch, as all the
952 * counters come from 3D operations. However, we may need to predicate
953 * a compute dispatch, which executes in a different GEM context and has
954 * a different MI_PREDICATE_DATA register. So, we save the result to
955 * memory and reload it in iris_launch_grid.
956 */
957 unsigned offset = offsetof(struct iris_query_snapshots, predicate_data);
958 ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
959 q->bo, offset, false);
960 ice->state.compute_predicate = q->bo;
961 }
962
963 static void
964 iris_render_condition(struct pipe_context *ctx,
965 struct pipe_query *query,
966 boolean condition,
967 enum pipe_render_cond_flag mode)
968 {
969 struct iris_context *ice = (void *) ctx;
970 struct iris_query *q = (void *) query;
971
972 if (!q) {
973 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
974 return;
975 }
976
977 iris_check_query_no_flush(ice, q);
978
979 if (q->result || q->ready) {
980 set_predicate_enable(ice, (q->result != 0) ^ condition);
981 } else {
982 if (mode == PIPE_RENDER_COND_NO_WAIT ||
983 mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
984 perf_debug(&ice->dbg, "Conditional rendering demoted from "
985 "\"no wait\" to \"wait\".");
986 }
987 set_predicate_for_result(ice, q, condition);
988 }
989 }
990
991 void
992 iris_init_query_functions(struct pipe_context *ctx)
993 {
994 ctx->create_query = iris_create_query;
995 ctx->destroy_query = iris_destroy_query;
996 ctx->begin_query = iris_begin_query;
997 ctx->end_query = iris_end_query;
998 ctx->get_query_result = iris_get_query_result;
999 ctx->get_query_result_resource = iris_get_query_result_resource;
1000 ctx->set_active_query_state = iris_set_active_query_state;
1001 ctx->render_condition = iris_render_condition;
1002 }