/**
* @file iris_query.c
*
- * XXX: this file is EMPTY. it will eventually implement query objects!
+ * Query object support. This allows measuring various simple statistics
+ * via counters on the GPU.
*/
#include <stdio.h>
#include "iris_resource.h"
#include "iris_screen.h"
+#define IA_VERTICES_COUNT 0x2310
+#define IA_PRIMITIVES_COUNT 0x2318
+#define VS_INVOCATION_COUNT 0x2320
+#define HS_INVOCATION_COUNT 0x2300
+#define DS_INVOCATION_COUNT 0x2308
+#define GS_INVOCATION_COUNT 0x2328
+#define GS_PRIMITIVES_COUNT 0x2330
+#define CL_INVOCATION_COUNT 0x2338
+#define CL_PRIMITIVES_COUNT 0x2340
+#define PS_INVOCATION_COUNT 0x2348
+#define CS_INVOCATION_COUNT 0x2290
+#define PS_DEPTH_COUNT 0x2350
+
+#define SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8)
+
+#define SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
+
#define CS_GPR(n) (0x2600 + (n) * 8)
#define MI_MATH (0x1a << 23)
struct iris_query {
enum pipe_query_type type;
+ int index;
bool ready;
struct iris_bo *bo;
struct iris_query_snapshots *map;
+
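+ /* Which hardware batch (render or compute) the query runs on. */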
+ int batch_idx;
};
struct iris_query_snapshots {
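+ /* Set to false at begin_query and to true once the end snapshot lands. */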
+ uint64_t snapshots_landed;
uint64_t start;
uint64_t end;
+};
+
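+/* Query buffer layout for stream output overflow queries: begin/end
+ * snapshots of primitives written and storage needed, per stream.
+ */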
+struct iris_query_so_overflow {
uint64_t snapshots_landed;
+ struct {
+ uint64_t prim_storage_needed[2];
+ uint64_t num_prims[2];
+ } stream[4];
};
/**
}
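+/**
+ * Mark a query's result as available by writing snapshots_landed, ordered
+ * after the query's counter snapshots.
+ */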
static void
-write_availability(struct iris_context *ice,
- struct iris_query *q,
- bool available)
+mark_available(struct iris_context *ice, struct iris_query *q)
{
- struct iris_batch *batch = &ice->render_batch;
+ struct iris_batch *batch = &ice->batches[q->batch_idx];
unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
if (!iris_is_query_pipelined(q)) {
- ice->vtbl.store_data_imm64(batch, q->bo, offset, available);
+ ice->vtbl.store_data_imm64(batch, q->bo, offset, true);
} else {
- if (available) {
- /* Order available *after* the query results. */
- flags |= PIPE_CONTROL_FLUSH_ENABLE;
- } else {
- /* Make it unavailable *before* any pipelined reads. */
- flags |= PIPE_CONTROL_CS_STALL;
- }
- iris_emit_pipe_control_write(batch, flags, q->bo, offset, available);
+ /* Order available *after* the query results. */
+ flags |= PIPE_CONTROL_FLUSH_ENABLE;
+ iris_emit_pipe_control_write(batch, flags, q->bo, offset, true);
}
}
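+/**
+ * Write the appropriate counter snapshot for the query type to the given
+ * offset in the query buffer.
+ */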
static void
write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
{
- struct iris_batch *batch = &ice->render_batch;
+ struct iris_batch *batch = &ice->batches[q->batch_idx];
const struct gen_device_info *devinfo = &batch->screen->devinfo;
- iris_use_pinned_bo(batch, q->bo, true);
+ if (!iris_is_query_pipelined(q)) {
+ iris_emit_pipe_control_flush(batch,
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD);
+ }
switch (q->type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
*/
iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
}
- iris_pipelined_write(&ice->render_batch, q,
+ iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
PIPE_CONTROL_WRITE_DEPTH_COUNT |
PIPE_CONTROL_DEPTH_STALL,
offset);
break;
case PIPE_QUERY_TIME_ELAPSED:
- iris_pipelined_write(&ice->render_batch, q,
+ case PIPE_QUERY_TIMESTAMP:
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
PIPE_CONTROL_WRITE_TIMESTAMP,
offset);
break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ ice->vtbl.store_register_mem64(batch,
+ q->index == 0 ? CL_INVOCATION_COUNT :
+ SO_PRIM_STORAGE_NEEDED(q->index),
+ q->bo, offset, false);
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ ice->vtbl.store_register_mem64(batch,
+ SO_NUM_PRIMS_WRITTEN(q->index),
+ q->bo, offset, false);
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS: {
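+ /* Map the statistics query index to the corresponding counter register. */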
+ static const uint32_t index_to_reg[] = {
+ IA_VERTICES_COUNT,
+ IA_PRIMITIVES_COUNT,
+ VS_INVOCATION_COUNT,
+ GS_INVOCATION_COUNT,
+ GS_PRIMITIVES_COUNT,
+ CL_INVOCATION_COUNT,
+ CL_PRIMITIVES_COUNT,
+ PS_INVOCATION_COUNT,
+ HS_INVOCATION_COUNT,
+ DS_INVOCATION_COUNT,
+ CS_INVOCATION_COUNT,
+ };
+ const uint32_t reg = index_to_reg[q->index];
+
+ ice->vtbl.store_register_mem64(batch, reg, q->bo, offset, false);
+ break;
+ }
default:
assert(false);
}
}
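+/**
+ * Write the per-stream SO_NUM_PRIMS_WRITTEN and SO_PRIM_STORAGE_NEEDED
+ * snapshots for an overflow query, at either begin (end = false) or
+ * end (end = true) time.
+ */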
static void
-calculate_result_on_cpu(struct iris_query *q)
+write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
+{
+ struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+ uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
+
+ iris_emit_pipe_control_flush(batch,
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD);
+ for (uint32_t i = 0; i < count; i++) {
+ int s = q->index + i;
+ int g_idx = offsetof(struct iris_query_so_overflow,
+ stream[s].num_prims[end]);
+ int w_idx = offsetof(struct iris_query_so_overflow,
+ stream[s].prim_storage_needed[end]);
+ ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
+ q->bo, g_idx, false);
+ ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
+ q->bo, w_idx, false);
+ }
+}
+
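+/**
+ * Convert a raw GPU timestamp to nanoseconds using the device's timestamp
+ * frequency.
+ */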
+uint64_t
+iris_timebase_scale(const struct gen_device_info *devinfo,
+ uint64_t gpu_timestamp)
+{
+ return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
+}
+
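+/**
+ * Return the difference between two raw timestamps, accounting for the
+ * counter wrapping around at TIMESTAMP_BITS bits.
+ */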
+static uint64_t
+iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
+{
+ if (time0 > time1) {
+ return (1ULL << TIMESTAMP_BITS) + time1 - time0;
+ } else {
+ return time1 - time0;
+ }
+}
+
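+/**
+ * A stream overflowed if more primitives needed storage than were actually
+ * written out (the deltas of the two counters differ).
+ */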
+static bool
+stream_overflowed(struct iris_query_so_overflow *so, int s)
+{
+ return (so->stream[s].prim_storage_needed[1] -
+ so->stream[s].prim_storage_needed[0]) !=
+ (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
+}
+
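+/**
+ * Compute the final query result on the CPU from the mapped snapshots.
+ */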
+static void
+calculate_result_on_cpu(const struct gen_device_info *devinfo,
+ struct iris_query *q)
{
switch (q->type) {
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
q->result = q->map->end != q->map->start;
break;
- case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_TIMESTAMP:
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ /* The timestamp is the single starting snapshot. */
+ q->result = iris_timebase_scale(devinfo, q->map->start);
+ q->result &= (1ull << TIMESTAMP_BITS) - 1;
+ break;
case PIPE_QUERY_TIME_ELAPSED:
+ q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
+ q->result = iris_timebase_scale(devinfo, q->result);
+ q->result &= (1ull << TIMESTAMP_BITS) - 1;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ q->result = stream_overflowed((void *) q->map, q->index);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ q->result = false;
+ for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
+ q->result |= stream_overflowed((void *) q->map, i);
+ break;
+ case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PIPELINE_STATISTICS:
default:
q->result = q->map->end - q->map->start;
break;
static void
gpr0_to_bool(struct iris_context *ice)
{
- struct iris_batch *batch = &ice->render_batch;
+ struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);
iris_batch_emit(batch, math, sizeof(math));
}
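+/**
+ * Load the begin/end overflow snapshots for stream idx into CS_GPR(1)
+ * through CS_GPR(4), for use by the MI_MATH overflow calculation.
+ */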
+static void
+load_overflow_data_to_cs_gprs(struct iris_context *ice,
+ struct iris_query *q,
+ int idx)
+{
+ struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+
+ ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
+ offsetof(struct iris_query_so_overflow,
+ stream[idx].prim_storage_needed[0]));
+ ice->vtbl.load_register_mem64(batch, CS_GPR(2), q->bo,
+ offsetof(struct iris_query_so_overflow,
+ stream[idx].prim_storage_needed[1]));
+
+ ice->vtbl.load_register_mem64(batch, CS_GPR(3), q->bo,
+ offsetof(struct iris_query_so_overflow,
+ stream[idx].num_prims[0]));
+ ice->vtbl.load_register_mem64(batch, CS_GPR(4), q->bo,
+ offsetof(struct iris_query_so_overflow,
+ stream[idx].num_prims[1]));
+}
+
+/*
+ * Compute the stream overflow status in GPR0.  R1/R2 hold the begin/end
+ * prim_storage_needed snapshots and R3/R4 the begin/end num_prims snapshots:
+ *
+ * R3 = R4 - R3;   (primitives actually written)
+ * R1 = R2 - R1;   (primitives that needed storage)
+ * R1 = R3 - R1;   (nonzero if the deltas differ, i.e. the stream overflowed)
+ * R0 = R0 | R1;
+ */
+static void
+calc_overflow_for_stream(struct iris_context *ice)
+{
+ struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+ static const uint32_t maths[] = {
+ MI_MATH | (17 - 2),
+ MI_ALU2(LOAD, SRCA, R4),
+ MI_ALU2(LOAD, SRCB, R3),
+ MI_ALU0(SUB),
+ MI_ALU2(STORE, R3, ACCU),
+ MI_ALU2(LOAD, SRCA, R2),
+ MI_ALU2(LOAD, SRCB, R1),
+ MI_ALU0(SUB),
+ MI_ALU2(STORE, R1, ACCU),
+ MI_ALU2(LOAD, SRCA, R3),
+ MI_ALU2(LOAD, SRCB, R1),
+ MI_ALU0(SUB),
+ MI_ALU2(STORE, R1, ACCU),
+ MI_ALU2(LOAD, SRCA, R1),
+ MI_ALU2(LOAD, SRCB, R0),
+ MI_ALU0(OR),
+ MI_ALU2(STORE, R0, ACCU),
+ };
+
+ iris_batch_emit(batch, maths, sizeof(maths));
+}
+
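+/**
+ * Accumulate the overflow status of the relevant stream(s) into CS_GPR(0)
+ * and convert it to a 0/1 boolean value.
+ */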
+static void
+overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
+{
+ struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+
+ ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);
+
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
+ load_overflow_data_to_cs_gprs(ice, q, q->index);
+ calc_overflow_for_stream(ice);
+ } else {
+ for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
+ load_overflow_data_to_cs_gprs(ice, q, i);
+ calc_overflow_for_stream(ice);
+ }
+ }
+
+ gpr0_to_bool(ice);
+}
+
/**
* Calculate the result and store it to CS_GPR0.
*/
static void
calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
{
- struct iris_batch *batch = &ice->render_batch;
+ struct iris_batch *batch = &ice->batches[q->batch_idx];
+
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+ overflow_result_to_gpr0(ice, q);
+ return;
+ }
ice->vtbl.load_register_mem64(batch, CS_GPR(1), q->bo,
offsetof(struct iris_query_snapshots, start));
struct iris_query *q = calloc(1, sizeof(struct iris_query));
q->type = query_type;
+ q->index = index;
+ if (q->type == PIPE_QUERY_PIPELINE_STATISTICS && q->index == 10)
+ q->batch_idx = IRIS_BATCH_COMPUTE;
+ else
+ q->batch_idx = IRIS_BATCH_RENDER;
return (struct pipe_query *) q;
}
if (!q->bo)
return false;
- q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_ASYNC);
+ q->map = iris_bo_map(&ice->dbg, q->bo, MAP_READ | MAP_WRITE | MAP_ASYNC);
if (!q->map)
return false;
q->result = 0ull;
q->ready = false;
+ q->map->snapshots_landed = false;
+
+ if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
+ ice->state.prims_generated_query_active = true;
+ ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
+ }
- write_availability(ice, q, false);
- write_value(ice, q, offsetof(struct iris_query_snapshots, start));
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ write_overflow_values(ice, q, false);
+ else
+ write_value(ice, q, offsetof(struct iris_query_snapshots, start));
return true;
}
struct iris_context *ice = (void *) ctx;
struct iris_query *q = (void *) query;
- write_value(ice, q, offsetof(struct iris_query_snapshots, end));
- write_availability(ice, q, true);
+ if (q->type == PIPE_QUERY_TIMESTAMP) {
+ iris_begin_query(ctx, query);
+ mark_available(ice, q);
+ return true;
+ }
+
+ if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
+ ice->state.prims_generated_query_active = false;
+ ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
+ }
+
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ write_overflow_values(ice, q, true);
+ else
+ write_value(ice, q, offsetof(struct iris_query_snapshots, end));
+ mark_available(ice, q);
return true;
}
{
struct iris_context *ice = (void *) ctx;
struct iris_query *q = (void *) query;
+ struct iris_screen *screen = (void *) ctx->screen;
+ const struct gen_device_info *devinfo = &screen->devinfo;
if (!q->ready) {
- if (iris_batch_references(&ice->render_batch, q->bo))
- iris_batch_flush(&ice->render_batch);
+ if (iris_batch_references(&ice->batches[q->batch_idx], q->bo))
+ iris_batch_flush(&ice->batches[q->batch_idx]);
if (!q->map->snapshots_landed) {
if (wait)
}
assert(q->map->snapshots_landed);
- calculate_result_on_cpu(q);
+ calculate_result_on_cpu(devinfo, q);
}
assert(q->ready);
- result->u64 = q->result;
+
+ if (q->type == PIPE_QUERY_PIPELINE_STATISTICS) {
+ switch (q->index) {
+ case 0:
+ result->pipeline_statistics.ia_vertices = q->result;
+ break;
+ case 1:
+ result->pipeline_statistics.ia_primitives = q->result;
+ break;
+ case 2:
+ result->pipeline_statistics.vs_invocations = q->result;
+ break;
+ case 3:
+ result->pipeline_statistics.gs_invocations = q->result;
+ break;
+ case 4:
+ result->pipeline_statistics.gs_primitives = q->result;
+ break;
+ case 5:
+ result->pipeline_statistics.c_invocations = q->result;
+ break;
+ case 6:
+ result->pipeline_statistics.c_primitives = q->result;
+ break;
+ case 7:
+ result->pipeline_statistics.ps_invocations = q->result;
+ break;
+ case 8:
+ result->pipeline_statistics.hs_invocations = q->result;
+ break;
+ case 9:
+ result->pipeline_statistics.ds_invocations = q->result;
+ break;
+ case 10:
+ result->pipeline_statistics.cs_invocations = q->result;
+ break;
+ }
+ } else {
+ result->u64 = q->result;
+ }
return true;
}
{
struct iris_context *ice = (void *) ctx;
struct iris_query *q = (void *) query;
- struct iris_batch *batch = &ice->render_batch;
+ struct iris_batch *batch = &ice->batches[q->batch_idx];
+ const struct gen_device_info *devinfo = &batch->screen->devinfo;
+ struct iris_resource *res = (void *) p_res;
unsigned snapshots_landed_offset =
offsetof(struct iris_query_snapshots, snapshots_landed);
+ res->bind_history |= PIPE_BIND_QUERY_BUFFER;
+
if (index == -1) {
/* They're asking for the availability of the result. If we still
* have commands queued up which produce the result, submit them
/* The final snapshots happen to have landed, so let's just compute
* the result on the CPU now...
*/
- calculate_result_on_cpu(q);
+ calculate_result_on_cpu(devinfo, q);
}
if (q->ready) {
}
static void
-iris_set_active_query_state(struct pipe_context *pipe, boolean enable)
+iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
+{
+ struct iris_context *ice = (void *) ctx;
+
+ if (ice->state.statistics_counters_enabled == enable)
+ return;
+
+ // XXX: most packets aren't paying attention to this yet, because it'd
+ // have to be done dynamically at draw time, which is a pain
+ ice->state.statistics_counters_enabled = enable;
+ ice->state.dirty |= IRIS_DIRTY_CLIP |
+ IRIS_DIRTY_GS |
+ IRIS_DIRTY_RASTER |
+ IRIS_DIRTY_STREAMOUT |
+ IRIS_DIRTY_TCS |
+ IRIS_DIRTY_TES |
+ IRIS_DIRTY_VS |
+ IRIS_DIRTY_WM;
+}
+
+static void
+set_predicate_enable(struct iris_context *ice,
+ bool value)
+{
+ if (value)
+ ice->predicate = IRIS_PREDICATE_STATE_RENDER;
+ else
+ ice->predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
+}
+
+static void
+set_predicate_for_overflow(struct iris_context *ice,
+ struct iris_query *q)
+{
+ struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+ ice->predicate = IRIS_PREDICATE_STATE_USE_BIT;
+
+ /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
+ * command when loading the values into the predicate source registers for
+ * conditional rendering.
+ */
+ iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
+
+ overflow_result_to_gpr0(ice, q);
+ ice->vtbl.load_register_reg64(batch, CS_GPR(0), MI_PREDICATE_SRC0);
+ ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
+}
+
+static void
+set_predicate_for_occlusion(struct iris_context *ice,
+ struct iris_query *q)
+{
+ struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+ ice->predicate = IRIS_PREDICATE_STATE_USE_BIT;
+
+ /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
+ * command when loading the values into the predicate source registers for
+ * conditional rendering.
+ */
+ iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
+
+ ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, q->bo,
+ offsetof(struct iris_query_snapshots, start));
+ ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, q->bo,
+ offsetof(struct iris_query_snapshots, end));
+}
+
+static void
+set_predicate_for_result(struct iris_context *ice,
+ struct iris_query *q,
+ bool condition)
+{
+ struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+ int load_op;
+
+ switch (q->type) {
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ set_predicate_for_overflow(ice, q);
+ break;
+ default:
+ set_predicate_for_occlusion(ice, q);
+ break;
+ }
+
+ if (ice->predicate == IRIS_PREDICATE_STATE_USE_BIT) {
+ if (condition)
+ load_op = MI_PREDICATE_LOADOP_LOAD;
+ else
+ load_op = MI_PREDICATE_LOADOP_LOADINV;
+
+ /* Emit MI_PREDICATE, loading the (possibly inverted) SRC0 == SRC1
+ * comparison result as the render predicate.
+ */
+ uint32_t predicate = MI_PREDICATE | load_op |
+ MI_PREDICATE_COMBINEOP_SET |
+ MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+ iris_batch_emit(batch, &predicate, sizeof(uint32_t));
+ }
+}
+
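+/**
+ * Set up conditional rendering: if the query result is already known on the
+ * CPU, simply enable or disable rendering; otherwise program MI_PREDICATE
+ * from the query buffer on the GPU.
+ */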
+static void
+iris_render_condition(struct pipe_context *ctx,
+ struct pipe_query *query,
+ boolean condition,
+ enum pipe_render_cond_flag mode)
{
- /* Do nothing, intentionally - only u_blitter uses this. */
+ struct iris_context *ice = (void *) ctx;
+ struct iris_query *q = (void *) query;
+
+ if (!q) {
+ ice->predicate = IRIS_PREDICATE_STATE_RENDER;
+ return;
+ }
+
+ if (q->result || q->ready)
+ set_predicate_enable(ice, (q->result != 0) ^ condition);
+ else
+ set_predicate_for_result(ice, q, condition);
}
void
ctx->get_query_result = iris_get_query_result;
ctx->get_query_result_resource = iris_get_query_result_resource;
ctx->set_active_query_state = iris_set_active_query_state;
+ ctx->render_condition = iris_render_condition;
}