iris: Add a more long term TODO about timebase scaling
1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_query.c
25 *
26 * Query object support. This allows measuring various simple statistics
27 * via counters on the GPU.
28 */
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "pipe/p_context.h"
35 #include "pipe/p_screen.h"
36 #include "util/fast_idiv_by_const.h"
37 #include "util/u_inlines.h"
38 #include "iris_context.h"
39 #include "iris_defines.h"
40 #include "iris_resource.h"
41 #include "iris_screen.h"
42 #include "vulkan/util/vk_util.h"
43
44 #define IA_VERTICES_COUNT 0x2310
45 #define IA_PRIMITIVES_COUNT 0x2318
46 #define VS_INVOCATION_COUNT 0x2320
47 #define HS_INVOCATION_COUNT 0x2300
48 #define DS_INVOCATION_COUNT 0x2308
49 #define GS_INVOCATION_COUNT 0x2328
50 #define GS_PRIMITIVES_COUNT 0x2330
51 #define CL_INVOCATION_COUNT 0x2338
52 #define CL_PRIMITIVES_COUNT 0x2340
53 #define PS_INVOCATION_COUNT 0x2348
54 #define CS_INVOCATION_COUNT 0x2290
55 #define PS_DEPTH_COUNT 0x2350
56
57 #define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
58
59 #define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
60
61 #define MI_MATH (0x1a << 23)
62
63 #define MI_ALU_LOAD 0x080
64 #define MI_ALU_LOADINV 0x480
65 #define MI_ALU_LOAD0 0x081
66 #define MI_ALU_LOAD1 0x481
67 #define MI_ALU_ADD 0x100
68 #define MI_ALU_SUB 0x101
69 #define MI_ALU_AND 0x102
70 #define MI_ALU_OR 0x103
71 #define MI_ALU_XOR 0x104
72 #define MI_ALU_STORE 0x180
73 #define MI_ALU_STOREINV 0x580
74
75 #define MI_ALU_R0 0x00
76 #define MI_ALU_R1 0x01
77 #define MI_ALU_R2 0x02
78 #define MI_ALU_R3 0x03
79 #define MI_ALU_R4 0x04
80 #define MI_ALU_SRCA 0x20
81 #define MI_ALU_SRCB 0x21
82 #define MI_ALU_ACCU 0x31
83 #define MI_ALU_ZF 0x32
84 #define MI_ALU_CF 0x33
85
86 #define _MI_ALU(op, x, y) (((op) << 20) | ((x) << 10) | (y))
87
88 #define _MI_ALU0(op) _MI_ALU(MI_ALU_##op, 0, 0)
89 #define _MI_ALU1(op, x) _MI_ALU(MI_ALU_##op, x, 0)
90 #define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)
91
92 #define MI_ALU0(op) _MI_ALU0(op)
93 #define MI_ALU1(op, x) _MI_ALU1(op, MI_ALU_##x)
94 #define MI_ALU2(op, x, y) _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)
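/* As a quick illustration of how these macros pack an ALU instruction dword:
 * MI_ALU2(LOAD, SRCA, R0) expands to _MI_ALU(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_R0),
 * i.e. (0x080 << 20) | (0x20 << 10) | 0x00 == 0x08008000.
 */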
95
96 #define emit_lri32 ice->vtbl.load_register_imm32
97 #define emit_lri64 ice->vtbl.load_register_imm64
98 #define emit_lrr32 ice->vtbl.load_register_reg32
99
100 struct iris_query {
101 enum pipe_query_type type;
102 int index;
103
104    bool ready;            /* the result has been computed and q->result is valid */
105
106    bool stalled;          /* a stall/flush was emitted, so later GPU commands see the snapshots */
107
108    uint64_t result;       /* accumulated result, valid once ready is set */
109
110    struct iris_bo *bo;                  /* buffer holding the snapshots */
111    struct iris_query_snapshots *map;    /* CPU mapping of bo */
112
113    int batch_idx;         /* IRIS_BATCH_RENDER or IRIS_BATCH_COMPUTE */
114 };
115
116 struct iris_query_snapshots {
117 /** iris_render_condition's saved MI_PREDICATE_DATA value. */
118 uint64_t predicate_data;
119
120 /** Have the start/end snapshots landed? */
121 uint64_t snapshots_landed;
122
123 /** Starting and ending counter snapshots */
124 uint64_t start;
125 uint64_t end;
126 };
127
128 struct iris_query_so_overflow {
129 uint64_t predicate_data;
130 uint64_t snapshots_landed;
131
132 struct {
133 uint64_t prim_storage_needed[2];
134 uint64_t num_prims[2];
135 } stream[4];
136 };
137
138 /**
139 * Is this type of query written by PIPE_CONTROL?
140 */
141 static bool
142 iris_is_query_pipelined(struct iris_query *q)
143 {
144 switch (q->type) {
145 case PIPE_QUERY_OCCLUSION_COUNTER:
146 case PIPE_QUERY_OCCLUSION_PREDICATE:
147 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
148 case PIPE_QUERY_TIMESTAMP:
149 case PIPE_QUERY_TIMESTAMP_DISJOINT:
150 case PIPE_QUERY_TIME_ELAPSED:
151 return true;
152
153 default:
154 return false;
155 }
156 }
157
158 static void
159 mark_available(struct iris_context *ice, struct iris_query *q)
160 {
161 struct iris_batch *batch = &ice->batches[q->batch_idx];
162 unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
163 unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
164
165 if (!iris_is_query_pipelined(q)) {
166 ice->vtbl.store_data_imm64(batch, q->bo, offset, true);
167 } else {
168 /* Order available *after* the query results. */
169 flags |= PIPE_CONTROL_FLUSH_ENABLE;
170 iris_emit_pipe_control_write(batch, flags, q->bo, offset, true);
171 }
172 }
173
174 /**
175  * Write a pipelined counter snapshot (PS_DEPTH_COUNT or a timestamp) to q->bo at the given offset via a PIPE_CONTROL.
176 */
177 static void
178 iris_pipelined_write(struct iris_batch *batch,
179 struct iris_query *q,
180 enum pipe_control_flags flags,
181 unsigned offset)
182 {
183 const struct gen_device_info *devinfo = &batch->screen->devinfo;
184 const unsigned optional_cs_stall =
185 devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
186
187 iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
188 q->bo, offset, 0ull);
189 }
190
191 static void
192 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
193 {
194 struct iris_batch *batch = &ice->batches[q->batch_idx];
195 const struct gen_device_info *devinfo = &batch->screen->devinfo;
196
197 if (!iris_is_query_pipelined(q)) {
198 iris_emit_pipe_control_flush(batch,
199 PIPE_CONTROL_CS_STALL |
200 PIPE_CONTROL_STALL_AT_SCOREBOARD);
201 q->stalled = true;
202 }
203
204 switch (q->type) {
205 case PIPE_QUERY_OCCLUSION_COUNTER:
206 case PIPE_QUERY_OCCLUSION_PREDICATE:
207 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
208 if (devinfo->gen >= 10) {
209 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
210 * bit set prior to programming a PIPE_CONTROL with Write PS Depth
211 * Count sync operation."
212 */
213 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
214 }
215 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
216 PIPE_CONTROL_WRITE_DEPTH_COUNT |
217 PIPE_CONTROL_DEPTH_STALL,
218 offset);
219 break;
220 case PIPE_QUERY_TIME_ELAPSED:
221 case PIPE_QUERY_TIMESTAMP:
222 case PIPE_QUERY_TIMESTAMP_DISJOINT:
223 iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
224 PIPE_CONTROL_WRITE_TIMESTAMP,
225 offset);
226 break;
227 case PIPE_QUERY_PRIMITIVES_GENERATED:
228 ice->vtbl.store_register_mem64(batch,
229 q->index == 0 ? CL_INVOCATION_COUNT :
230 SO_PRIM_STORAGE_NEEDED(q->index),
231 q->bo, offset, false);
232 break;
233 case PIPE_QUERY_PRIMITIVES_EMITTED:
234 ice->vtbl.store_register_mem64(batch,
235 SO_NUM_PRIMS_WRITTEN(q->index),
236 q->bo, offset, false);
237 break;
238 case PIPE_QUERY_PIPELINE_STATISTICS: {
239 static const uint32_t index_to_reg[] = {
240 IA_VERTICES_COUNT,
241 IA_PRIMITIVES_COUNT,
242 VS_INVOCATION_COUNT,
243 GS_INVOCATION_COUNT,
244 GS_PRIMITIVES_COUNT,
245 CL_INVOCATION_COUNT,
246 CL_PRIMITIVES_COUNT,
247 PS_INVOCATION_COUNT,
248 HS_INVOCATION_COUNT,
249 DS_INVOCATION_COUNT,
250 CS_INVOCATION_COUNT,
251 };
252 const uint32_t reg = index_to_reg[q->index];
253
254 ice->vtbl.store_register_mem64(batch, reg, q->bo, offset, false);
255 break;
256 }
257 default:
258 assert(false);
259 }
260 }
261
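/* For SO overflow queries we snapshot both SO_NUM_PRIMS_WRITTEN and
 * SO_PRIM_STORAGE_NEEDED for each stream being tracked, once at begin time
 * (end = false) and once at end time (end = true).  A stream overflowed if
 * the two counters advanced by different amounts (see stream_overflowed()).
 */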
262 static void
263 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
264 {
265 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
266 uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
267
268 iris_emit_pipe_control_flush(batch,
269 PIPE_CONTROL_CS_STALL |
270 PIPE_CONTROL_STALL_AT_SCOREBOARD);
271 for (uint32_t i = 0; i < count; i++) {
272 int s = q->index + i;
273 int g_idx = offsetof(struct iris_query_so_overflow,
274 stream[s].num_prims[end]);
275 int w_idx = offsetof(struct iris_query_so_overflow,
276 stream[s].prim_storage_needed[end]);
277 ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
278 q->bo, g_idx, false);
279 ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
280 q->bo, w_idx, false);
281 }
282 }
283
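/* Convert a raw GPU timestamp (in ticks) to nanoseconds.  For illustration,
 * assuming a 12 MHz timestamp frequency, each tick is
 * 1000000000 / 12000000 = ~83.3 ns, and the integer math below truncates the
 * fractional part of every conversion.
 */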
284 uint64_t
285 iris_timebase_scale(const struct gen_device_info *devinfo,
286 uint64_t gpu_timestamp)
287 {
288 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
289 }
290
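/* Compute the difference between two raw timestamp snapshots.  The hardware
 * counter is only TIMESTAMP_BITS wide and wraps around, so if the end value
 * is numerically smaller than the start we assume a single wrap occurred and
 * add 2^TIMESTAMP_BITS back in.
 */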
291 static uint64_t
292 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
293 {
294 if (time0 > time1) {
295 return (1ULL << TIMESTAMP_BITS) + time1 - time0;
296 } else {
297 return time1 - time0;
298 }
299 }
300
301 static bool
302 stream_overflowed(struct iris_query_so_overflow *so, int s)
303 {
304 return (so->stream[s].prim_storage_needed[1] -
305 so->stream[s].prim_storage_needed[0]) !=
306 (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
307 }
308
309 static void
310 calculate_result_on_cpu(const struct gen_device_info *devinfo,
311 struct iris_query *q)
312 {
313 switch (q->type) {
314 case PIPE_QUERY_OCCLUSION_PREDICATE:
315 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
316 q->result = q->map->end != q->map->start;
317 break;
318 case PIPE_QUERY_TIMESTAMP:
319 case PIPE_QUERY_TIMESTAMP_DISJOINT:
320 /* The timestamp is the single starting snapshot. */
321 q->result = iris_timebase_scale(devinfo, q->map->start);
322 q->result &= (1ull << TIMESTAMP_BITS) - 1;
323 break;
324 case PIPE_QUERY_TIME_ELAPSED:
325 q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
326 q->result = iris_timebase_scale(devinfo, q->result);
327 q->result &= (1ull << TIMESTAMP_BITS) - 1;
328 break;
329 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
330 q->result = stream_overflowed((void *) q->map, q->index);
331 break;
332 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
333 q->result = false;
334 for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
335 q->result |= stream_overflowed((void *) q->map, i);
336 break;
337 case PIPE_QUERY_OCCLUSION_COUNTER:
338 case PIPE_QUERY_PRIMITIVES_GENERATED:
339 case PIPE_QUERY_PRIMITIVES_EMITTED:
340 case PIPE_QUERY_PIPELINE_STATISTICS:
341 default:
342 q->result = q->map->end - q->map->start;
343 break;
344 }
345
346 q->ready = true;
347 }
348
349 static void
350 emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
351 unsigned reg_a, unsigned reg_b)
352 {
353 uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));
354
355 math[0] = MI_MATH | (5 - 2);
356 math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
357 math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
358 math[3] = _MI_ALU0(ADD);
359 math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
360 }
361
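/* Left-shift a GPR using the MI ALU.  The ALU as used here has no shift
 * operation (only the LOAD/ADD/SUB/AND/OR/XOR/STORE variants defined above),
 * so a shift left by N is emitted as N successive self-additions: each ADD of
 * a register to itself doubles it, i.e. shifts it left by one bit.
 */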
362 static void
363 emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
364 unsigned src_reg, unsigned shift)
365 {
366 assert(shift > 0);
367
368 int dwords = 1 + 4 * shift;
369
370 uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);
371
372 math[0] = MI_MATH | ((1 + 4 * shift) - 2);
373
374 for (unsigned i = 0; i < shift; i++) {
375 unsigned add_src = (i == 0) ? src_reg : dst_reg;
376 math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
377 math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
378 math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
379 math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
380 }
381 }
382
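/* Multiply GPR0 by a constant using the classic double-and-add scheme on the
 * MI ALU.  Walking the bits of N from the top down, the running value is
 * doubled at each step, and the original value (kept in GPR0) is added in
 * whenever the corresponding bit of N is set.  For example, for N = 5
 * (binary 101): double, double again, then add the original, giving 5 * R0.
 */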
383 /* Emit dwords to multiply GPR0 by N */
384 static void
385 build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
386 {
387 VK_OUTARRAY_MAKE(out, dw, dw_count);
388
389 #define APPEND_ALU(op, x, y) \
390 vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)
391
392 assert(N > 0);
393 unsigned top_bit = 31 - __builtin_clz(N);
394 for (int i = top_bit - 1; i >= 0; i--) {
395 /* We get our initial data in GPR0 and we write the final data out to
396 * GPR0 but we use GPR1 as our scratch register.
397 */
398 unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
399 unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;
400
401 /* Shift the current value left by 1 */
402 APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
403 APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
404 APPEND_ALU(ADD, 0, 0);
405
406 if (N & (1 << i)) {
407 /* Store ACCU to R1 and add R0 to R1 */
408 APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
409 APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
410 APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
411 APPEND_ALU(ADD, 0, 0);
412 }
413
414 APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
415 }
416
417 #undef APPEND_ALU
418 }
419
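/* Emit an MI_MATH that multiplies GPR0 by the constant N.  The helper above
 * is called twice: a first pass with a NULL buffer just counts how many ALU
 * dwords are needed (via the vk_outarray helper), and a second pass fills
 * them in after the MI_MATH header.
 */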
420 static void
421 emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
422 {
423 uint32_t num_dwords;
424 build_alu_multiply_gpr0(NULL, &num_dwords, N);
425
426 uint32_t *math = iris_get_command_space(batch, 4 * num_dwords);
427 math[0] = MI_MATH | (num_dwords - 2);
428 build_alu_multiply_gpr0(&math[1], &num_dwords, N);
429 }
430
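/* Divide GPR0 by the constant D using the usual multiply-and-shift trick.
 * Roughly, util_compute_fast_udiv_info() picks multiplier/pre_shift/
 * post_shift/increment such that
 *
 *    n / D == (((n >> pre_shift) + increment) * multiplier) >> (32 + post_shift)
 *
 * which is what the non-power-of-two path below emits with MI ALU operations
 * (the increment is folded in by adding the multiplier after the multiply,
 * since (n + 1) * m == n * m + m).
 */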
431 void
432 iris_math_div32_gpr0(struct iris_context *ice,
433 struct iris_batch *batch,
434 uint32_t D)
435 {
436 /* Zero out the top of GPR0 */
437 emit_lri32(batch, CS_GPR(0) + 4, 0);
438
439 if (D == 0) {
440       /* Dividing by zero is invalid, but we should do something, so set GPR0 to 0. */
441 emit_lri32(batch, CS_GPR(0), 0);
442 } else if (util_is_power_of_two_or_zero(D)) {
443 unsigned log2_D = util_logbase2(D);
444 assert(log2_D < 32);
445 /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and taking
446 * the top 32 bits of the result.
447 */
448 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
449 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
450 emit_lri32(batch, CS_GPR(0) + 4, 0);
451 } else {
452 struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
453 assert(m.multiplier <= UINT32_MAX);
454
455 if (m.pre_shift) {
456          /* We right-shift by m.pre_shift by left-shifting by 32 - m.pre_shift
457           * and taking the top 32 bits of the result.
458 */
459 if (m.pre_shift < 32)
460 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
461 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
462 emit_lri32(batch, CS_GPR(0) + 4, 0);
463 }
464
465 /* Do the 32x32 multiply into gpr0 */
466 emit_mul_gpr0(batch, m.multiplier);
467
468 if (m.increment) {
469          /* Fold in the increment by adding the multiplier: n * m + m == (n + 1) * m. */
470 emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
471 emit_lri32(batch, CS_GPR(1) + 4, 0);
472 emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
473 }
474
475 /* Shift by 32 */
476 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
477 emit_lri32(batch, CS_GPR(0) + 4, 0);
478
479 if (m.post_shift) {
480          /* We right-shift by m.post_shift by left-shifting by 32 - m.post_shift
481           * and taking the top 32 bits of the result.
482 */
483 if (m.post_shift < 32)
484 emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
485 emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
486 emit_lri32(batch, CS_GPR(0) + 4, 0);
487 }
488 }
489 }
490
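/* A note on the MI_MATH sequence below: adding R0 + 0 exists only to set the
 * ALU zero flag; STOREINV then writes the bitwise inverse of ZF into R0, and
 * the final AND against R1 (preloaded with 1) clamps the result to exactly
 * 0 or 1.
 */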
491 /*
492 * GPR0 = (GPR0 == 0) ? 0 : 1;
493 */
494 static void
495 gpr0_to_bool(struct iris_context *ice)
496 {
497 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
498
499 ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
500
501 static const uint32_t math[] = {
502 MI_MATH | (9 - 2),
503 MI_ALU2(LOAD, SRCA, R0),
504 MI_ALU1(LOAD0, SRCB),
505 MI_ALU0(ADD),
506 MI_ALU2(STOREINV, R0, ZF),
507 MI_ALU2(LOAD, SRCA, R0),
508 MI_ALU2(LOAD, SRCB, R1),
509 MI_ALU0(AND),
510 MI_ALU2(STORE, R0, ACCU),
511 };
512 iris_batch_emit(batch, math, sizeof(math));
513 }
514
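/* Load the four overflow snapshots for the given stream into CS GPRs:
 * GPR1/GPR2 get the begin/end SO_PRIM_STORAGE_NEEDED values and GPR3/GPR4
 * get the begin/end SO_NUM_PRIMS_WRITTEN values, ready for the MI_MATH in
 * calc_overflow_for_stream().
 */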
515 static void
516 load_overflow_data_to_cs_gprs(struct iris_context *ice,
517 struct iris_query *q,
518 int idx)
519 {
520 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
521
522 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
523 offsetof(struct iris_query_so_overflow,
524 stream[idx].prim_storage_needed[0]));
525 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
526 offsetof(struct iris_query_so_overflow,
527 stream[idx].prim_storage_needed[1]));
528
529 ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
530 offsetof(struct iris_query_so_overflow,
531 stream[idx].num_prims[0]));
532 ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
533 offsetof(struct iris_query_so_overflow,
534 stream[idx].num_prims[1]));
535 }
536
537 /*
538 * R3 = R4 - R3;
539 * R1 = R2 - R1;
540 * R1 = R3 - R1;
541 * R0 = R0 | R1;
542 */
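/* In other words: R1 ends up as (prims written delta) - (storage needed
 * delta), which is non-zero exactly when this stream overflowed, and that is
 * OR'd into the running value in R0.  gpr0_to_bool() later collapses R0 to
 * 0 or 1.
 */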
543 static void
544 calc_overflow_for_stream(struct iris_context *ice)
545 {
546 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
547 static const uint32_t maths[] = {
548 MI_MATH | (17 - 2),
549 MI_ALU2(LOAD, SRCA, R4),
550 MI_ALU2(LOAD, SRCB, R3),
551 MI_ALU0(SUB),
552 MI_ALU2(STORE, R3, ACCU),
553 MI_ALU2(LOAD, SRCA, R2),
554 MI_ALU2(LOAD, SRCB, R1),
555 MI_ALU0(SUB),
556 MI_ALU2(STORE, R1, ACCU),
557 MI_ALU2(LOAD, SRCA, R3),
558 MI_ALU2(LOAD, SRCB, R1),
559 MI_ALU0(SUB),
560 MI_ALU2(STORE, R1, ACCU),
561 MI_ALU2(LOAD, SRCA, R1),
562 MI_ALU2(LOAD, SRCB, R0),
563 MI_ALU0(OR),
564 MI_ALU2(STORE, R0, ACCU),
565 };
566
567 iris_batch_emit(batch, maths, sizeof(maths));
568 }
569
570 static void
571 overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
572 {
573 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
574
575 ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
576
577 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
578 load_overflow_data_to_cs_gprs(ice, q, q->index);
579 calc_overflow_for_stream(ice);
580 } else {
581 for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
582 load_overflow_data_to_cs_gprs(ice, q, i);
583 calc_overflow_for_stream(ice);
584 }
585 }
586
587 gpr0_to_bool(ice);
588 }
589
590 /*
591  * GPR0 = GPR0 & ((1ull << n) - 1);
592 */
593 static void
594 keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
595 {
596 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
597
598 ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
599 static const uint32_t math[] = {
600 MI_MATH | (5 - 2),
601 MI_ALU2(LOAD, SRCA, R0),
602 MI_ALU2(LOAD, SRCB, R1),
603 MI_ALU0(AND),
604 MI_ALU2(STORE, R0, ACCU),
605 };
606 iris_batch_emit(batch, math, sizeof(math));
607 }
608
609 /**
610 * Calculate the result and store it to CS_GPR0.
611 */
612 static void
613 calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
614 {
615 struct iris_batch *batch = &ice->batches[q->batch_idx];
616 struct iris_screen *screen = (void *) ice->ctx.screen;
617 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
618 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
619 overflow_result_to_gpr0(ice, q);
620 return;
621 }
622
623 if (q->type == PIPE_QUERY_TIMESTAMP) {
624 ice->vtbl.load_register_mem64(batch, CS_GPR(0), q->bo,
625 offsetof(struct iris_query_snapshots, start));
626 /* TODO: This discards any fractional bits of the timebase scale.
627 * We would need to do a bit of fixed point math on the CS ALU, or
628 * launch an actual shader to calculate this with full precision.
629 */
630 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
631 keep_gpr0_lower_n_bits(ice, 36);
632 return;
633 }
634
635 ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
636 offsetof(struct iris_query_snapshots, start));
637 ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
638 offsetof(struct iris_query_snapshots, end));
639
640 static const uint32_t math[] = {
641 MI_MATH | (5 - 2),
642 MI_ALU2(LOAD, SRCA, R2),
643 MI_ALU2(LOAD, SRCB, R1),
644 MI_ALU0(SUB),
645 MI_ALU2(STORE, R0, ACCU),
646 };
647 iris_batch_emit(batch, math, sizeof(math));
648
649 if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
650 q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
651 gpr0_to_bool(ice);
652
653 if (q->type == PIPE_QUERY_TIME_ELAPSED) {
654 /* TODO: This discards fractional bits (see above). */
655 emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
656 }
657 }
658
659 static struct pipe_query *
660 iris_create_query(struct pipe_context *ctx,
661 unsigned query_type,
662 unsigned index)
663 {
664 struct iris_query *q = calloc(1, sizeof(struct iris_query));
665
666 q->type = query_type;
667 q->index = index;
668
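   /* Index 10 of the pipeline statistics is CS_INVOCATION_COUNT, which is
    * only written by the compute pipeline, so send it to the compute batch;
    * everything else runs on the render batch.
    */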
669 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS && q->index == 10)
670 q->batch_idx = IRIS_BATCH_COMPUTE;
671 else
672 q->batch_idx = IRIS_BATCH_RENDER;
673 return (struct pipe_query *) q;
674 }
675
676 static void
677 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
678 {
679 struct iris_query *query = (void *) p_query;
680 iris_bo_unreference(query->bo);
681 free(query);
682 }
683
684
685 static boolean
686 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
687 {
688 struct iris_screen *screen = (void *) ctx->screen;
689 struct iris_context *ice = (void *) ctx;
690 struct iris_query *q = (void *) query;
691
692 iris_bo_unreference(q->bo);
693 q->bo = iris_bo_alloc(screen->bufmgr, "query object", 4096,
694 IRIS_MEMZONE_OTHER);
695 if (!q->bo)
696 return false;
697
698 q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_WRITE | MAP_ASYNC);
699 if (!q->map)
700 return false;
701
702 q->result = 0ull;
703 q->ready = false;
704 q->map->snapshots_landed = false;
705
706 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
707 ice->state.prims_generated_query_active = true;
708 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
709 }
710
711 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
712 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
713 write_overflow_values(ice, q, false);
714 else
715 write_value(ice, q, offsetof(struct iris_query_snapshots, start));
716
717 return true;
718 }
719
720 static bool
721 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
722 {
723 struct iris_context *ice = (void *) ctx;
724 struct iris_query *q = (void *) query;
725
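   /* TIMESTAMP is a single point-in-time snapshot rather than a begin/end
    * pair, so "ending" it just takes the snapshot now (by reusing
    * iris_begin_query()) and marks it available.
    */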
726 if (q->type == PIPE_QUERY_TIMESTAMP) {
727 iris_begin_query(ctx, query);
728 mark_available(ice, q);
729 return true;
730 }
731
732 if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
733 ice->state.prims_generated_query_active = false;
734 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
735 }
736
737 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
738 q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
739 write_overflow_values(ice, q, true);
740 else
741 write_value(ice, q, offsetof(struct iris_query_snapshots, end));
742 mark_available(ice, q);
743
744 return true;
745 }
746
747 /**
748 * See if the snapshots have landed for a query, and if so, compute the
749 * result and mark it ready. Does not flush (unlike iris_get_query_result).
750 */
751 static void
752 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
753 {
754 struct iris_screen *screen = (void *) ice->ctx.screen;
755 const struct gen_device_info *devinfo = &screen->devinfo;
756
757 if (!q->ready && q->map->snapshots_landed) {
758 calculate_result_on_cpu(devinfo, q);
759 }
760 }
761
762 static boolean
763 iris_get_query_result(struct pipe_context *ctx,
764 struct pipe_query *query,
765 boolean wait,
766 union pipe_query_result *result)
767 {
768 struct iris_context *ice = (void *) ctx;
769 struct iris_query *q = (void *) query;
770 struct iris_screen *screen = (void *) ctx->screen;
771 const struct gen_device_info *devinfo = &screen->devinfo;
772
773 if (!q->ready) {
774 if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
775 iris_batch_flush(&ice->batches[q->batch_idx]);
776
777 if (!q->map->snapshots_landed) {
778 if (wait)
779 iris_bo_wait_rendering(q->bo);
780 else
781 return false;
782 }
783
784 assert(q->map->snapshots_landed);
785 calculate_result_on_cpu(devinfo, q);
786 }
787
788 assert(q->ready);
789
790 if (q->type == PIPE_QUERY_PIPELINE_STATISTICS) {
791 switch (q->index) {
792 case 0:
793 result->pipeline_statistics.ia_vertices = q->result;
794 break;
795 case 1:
796 result->pipeline_statistics.ia_primitives = q->result;
797 break;
798 case 2:
799 result->pipeline_statistics.vs_invocations = q->result;
800 break;
801 case 3:
802 result->pipeline_statistics.gs_invocations = q->result;
803 break;
804 case 4:
805 result->pipeline_statistics.gs_primitives = q->result;
806 break;
807 case 5:
808 result->pipeline_statistics.c_invocations = q->result;
809 break;
810 case 6:
811 result->pipeline_statistics.c_primitives = q->result;
812 break;
813 case 7:
814 result->pipeline_statistics.ps_invocations = q->result;
815 break;
816 case 8:
817 result->pipeline_statistics.hs_invocations = q->result;
818 break;
819 case 9:
820 result->pipeline_statistics.ds_invocations = q->result;
821 break;
822 case 10:
823 result->pipeline_statistics.cs_invocations = q->result;
824 break;
825 }
826 } else {
827 result->u64 = q->result;
828 }
829
830 return true;
831 }
832
833 static void
834 iris_get_query_result_resource(struct pipe_context *ctx,
835 struct pipe_query *query,
836 boolean wait,
837 enum pipe_query_value_type result_type,
838 int index,
839 struct pipe_resource *p_res,
840 unsigned offset)
841 {
842 struct iris_context *ice = (void *) ctx;
843 struct iris_query *q = (void *) query;
844 struct iris_batch *batch = &ice->batches[q->batch_idx];
845 const struct gen_device_info *devinfo = &batch->screen->devinfo;
846 struct iris_resource *res = (void *) p_res;
847 unsigned snapshots_landed_offset =
848 offsetof(struct iris_query_snapshots, snapshots_landed);
849
850 res->bind_history |= PIPE_BIND_QUERY_BUFFER;
851
852 if (index == -1) {
853 /* They're asking for the availability of the result. If we still
854 * have commands queued up which produce the result, submit them
855 * now so that progress happens. Either way, copy the snapshots
856 * landed field to the destination resource.
857 */
858 if (iris_batch_references(batch, q->bo))
859 iris_batch_flush(batch);
860
861 ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
862 q->bo, snapshots_landed_offset,
863 result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
864 return;
865 }
866
867 if (!q->ready && q->map->snapshots_landed) {
868 /* The final snapshots happen to have landed, so let's just compute
869 * the result on the CPU now...
870 */
871 calculate_result_on_cpu(devinfo, q);
872 }
873
874 if (q->ready) {
875 /* We happen to have the result on the CPU, so just copy it. */
876 if (result_type <= PIPE_QUERY_TYPE_U32) {
877 ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
878 q->result);
879 } else {
880 ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
881 q->result);
882 }
883
884        /* Make sure the result lands before they bind the QBO elsewhere
885 * and use the result.
886 */
887 // XXX: Why? i965 doesn't do this.
888 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
889 return;
890 }
891
892 /* Calculate the result to CS_GPR0 */
893 calculate_result_on_gpu(ice, q);
894
895 bool predicated = !wait && !q->stalled;
896
897 if (predicated) {
898 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
899 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
900 snapshots_landed_offset);
901 uint32_t predicate = MI_PREDICATE |
902 MI_PREDICATE_LOADOP_LOADINV |
903 MI_PREDICATE_COMBINEOP_SET |
904 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
905 iris_batch_emit(batch, &predicate, sizeof(uint32_t));
906 }
907
908 if (result_type <= PIPE_QUERY_TYPE_U32) {
909 ice->vtbl.store_register_mem32(batch, CS_GPR(0),
910 iris_resource_bo(p_res),
911 offset, predicated);
912 } else {
913 ice->vtbl.store_register_mem64(batch, CS_GPR(0),
914 iris_resource_bo(p_res),
915 offset, predicated);
916 }
917 }
918
919 static void
920 iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
921 {
922 struct iris_context *ice = (void *) ctx;
923
924 if (ice->state.statistics_counters_enabled == enable)
925 return;
926
927 // XXX: most packets aren't paying attention to this yet, because it'd
928 // have to be done dynamically at draw time, which is a pain
929 ice->state.statistics_counters_enabled = enable;
930 ice->state.dirty |= IRIS_DIRTY_CLIP |
931 IRIS_DIRTY_GS |
932 IRIS_DIRTY_RASTER |
933 IRIS_DIRTY_STREAMOUT |
934 IRIS_DIRTY_TCS |
935 IRIS_DIRTY_TES |
936 IRIS_DIRTY_VS |
937 IRIS_DIRTY_WM;
938 }
939
940 static void
941 set_predicate_enable(struct iris_context *ice, bool value)
942 {
943 if (value)
944 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
945 else
946 ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
947 }
948
949 static void
950 set_predicate_for_result(struct iris_context *ice,
951 struct iris_query *q,
952 bool inverted)
953 {
954 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
955
956 /* The CPU doesn't have the query result yet; use hardware predication */
957 ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
958
959 /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
960 iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
961 q->stalled = true;
962
963 switch (q->type) {
964 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
965 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
966 overflow_result_to_gpr0(ice, q);
967
968 ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
969 ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
970 break;
971 default:
972 /* PIPE_QUERY_OCCLUSION_* */
973 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
974 offsetof(struct iris_query_snapshots, start));
975 ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
976 offsetof(struct iris_query_snapshots, end));
977 break;
978 }
979
980 uint32_t mi_predicate = MI_PREDICATE |
981 MI_PREDICATE_COMBINEOP_SET |
982 MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
983 (inverted ? MI_PREDICATE_LOADOP_LOAD
984 : MI_PREDICATE_LOADOP_LOADINV);
985 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
986
987 /* We immediately set the predicate on the render batch, as all the
988 * counters come from 3D operations. However, we may need to predicate
989 * a compute dispatch, which executes in a different GEM context and has
990 * a different MI_PREDICATE_DATA register. So, we save the result to
991 * memory and reload it in iris_launch_grid.
992 */
993 unsigned offset = offsetof(struct iris_query_snapshots, predicate_data);
994 ice->vtbl.store_register_mem64(batch, MI_PREDICATE_DATA,
995 q->bo, offset, false);
996 ice->state.compute_predicate = q->bo;
997 }
998
999 static void
1000 iris_render_condition(struct pipe_context *ctx,
1001 struct pipe_query *query,
1002 boolean condition,
1003 enum pipe_render_cond_flag mode)
1004 {
1005 struct iris_context *ice = (void *) ctx;
1006 struct iris_query *q = (void *) query;
1007
1008 if (!q) {
1009 ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
1010 return;
1011 }
1012
1013 iris_check_query_no_flush(ice, q);
1014
1015 if (q->result || q->ready) {
1016 set_predicate_enable(ice, (q->result != 0) ^ condition);
1017 } else {
1018 if (mode == PIPE_RENDER_COND_NO_WAIT ||
1019 mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
1020 perf_debug(&ice->dbg, "Conditional rendering demoted from "
1021 "\"no wait\" to \"wait\".");
1022 }
1023 set_predicate_for_result(ice, q, condition);
1024 }
1025 }
1026
1027 void
1028 iris_init_query_functions(struct pipe_context *ctx)
1029 {
1030 ctx->create_query = iris_create_query;
1031 ctx->destroy_query = iris_destroy_query;
1032 ctx->begin_query = iris_begin_query;
1033 ctx->end_query = iris_end_query;
1034 ctx->get_query_result = iris_get_query_result;
1035 ctx->get_query_result_resource = iris_get_query_result_resource;
1036 ctx->set_active_query_state = iris_set_active_query_state;
1037 ctx->render_condition = iris_render_condition;
1038 }