From: Kenneth Graunke
Date: Thu, 16 May 2013 15:54:47 +0000 (-0700)
Subject: i965: Implement transform feedback query support in hardware on Gen6+.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=f09b91f78247409f54c975f56cb10d5f350fe64e;p=mesa.git

i965: Implement transform feedback query support in hardware on Gen6+.

Now that we have hardware contexts and can use MI_STORE_REGISTER_MEM, we can use the GPU's pipeline statistics counters rather than going out of our way to count primitives in software.

Aside from being simpler, this also paves the way for Geometry Shaders, which can output an arbitrary number of primitives on the GPU. It will also allow us to use hardware primitive restart when these queries are in use.

The GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN query is easy: it corresponds to the SO_NUM_PRIMS_WRITTEN/SO_NUM_PRIMS_WRITTEN0_IVB counters.

The GL_PRIMITIVES_GENERATED query is trickier. Gen provides several statistics registers which /almost/ match the semantics required:

- IA_PRIMITIVES_COUNT
  The number of primitives fetched by the VF or IA (input assembler). This undercounts when GS is enabled, as it can output many primitives.

- GS_PRIMITIVES_COUNT
  The number of primitives output by the GS. Unfortunately, this doesn't increment unless the GS unit is actually enabled, and it usually isn't.

- SO_PRIM_STORAGE_NEEDED*_IVB
  The amount of space needed to write primitives output by transform feedback. These naturally only work when transform feedback is on. We'd also have to add the counters for all four streams.

- CL_INVOCATION_COUNT
  The number of primitives processed by the clipper. This doesn't work if the GS or SOL throw away primitives for rasterizer discard. However, it does increment even if the clipper is in REJECT_ALL mode.

Dynamically switching between counters would be painfully complicated, especially since GS, rasterizer discard, and transform feedback can all be switched on and off repeatedly during a single query.
The most usable counter is CL_INVOCATION_COUNT. The previous two patches reworked rasterizer discard support so that all primitives hit the clipper, making this work. v2: Occlusion query bug fixes removed and squashed in earlier patches. Signed-off-by: Kenneth Graunke Reviewed-by: Eric Anholt Reviewed-by: Paul Berry --- diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c index 3f2ed00f92d..8c38bd5e2c7 100644 --- a/src/mesa/drivers/dri/i965/gen6_queryobj.c +++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c @@ -94,6 +94,57 @@ write_depth_count(struct intel_context *intel, drm_intel_bo *query_bo, int idx) ADVANCE_BATCH(); } +/* + * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM. + * + * Only TIMESTAMP and PS_DEPTH_COUNT have special PIPE_CONTROL support; other + * counters have to be read via the generic MI_STORE_REGISTER_MEM. This + * function also performs a pipeline flush for proper synchronization. + */ +static void +write_reg(struct intel_context *intel, + drm_intel_bo *query_bo, uint32_t reg, int idx) +{ + assert(intel->gen >= 6); + + intel_batchbuffer_emit_mi_flush(intel); + + /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to + * read a full 64-bit register, we need to do two of them. 
+ */ + BEGIN_BATCH(3); + OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); + OUT_BATCH(reg); + OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, + idx * sizeof(uint64_t)); + ADVANCE_BATCH(); + + BEGIN_BATCH(3); + OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); + OUT_BATCH(reg + sizeof(uint32_t)); + OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, + sizeof(uint32_t) + idx * sizeof(uint64_t)); + ADVANCE_BATCH(); +} + +static void +write_primitives_generated(struct intel_context *intel, + drm_intel_bo *query_bo, int idx) +{ + write_reg(intel, query_bo, CL_INVOCATION_COUNT, idx); +} + +static void +write_xfb_primitives_written(struct intel_context *intel, + drm_intel_bo *query_bo, int idx) +{ + if (intel->gen >= 7) { + write_reg(intel, query_bo, SO_NUM_PRIMS_WRITTEN0_IVB, idx); + } else { + write_reg(intel, query_bo, SO_NUM_PRIMS_WRITTEN, idx); + } +} + /** * Wait on the query object's BO and calculate the final result. */ @@ -167,10 +218,7 @@ gen6_queryobj_get_results(struct gl_context *ctx, case GL_PRIMITIVES_GENERATED: case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: - /* We don't actually query the hardware for this value, so query->bo - * should always be NULL and execution should never reach here. - */ - assert(!"Unreachable"); + query->Base.Result = results[1] - results[0]; break; default: @@ -195,10 +243,13 @@ gen6_queryobj_get_results(struct gl_context *ctx, static void gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q) { - struct brw_context *brw = brw_context(ctx); struct intel_context *intel = intel_context(ctx); struct brw_query_object *query = (struct brw_query_object *)q; + /* Since we're starting a new query, we need to throw away old results. 
*/ + drm_intel_bo_unreference(query->bo); + query->bo = drm_intel_bo_alloc(intel->bufmgr, "query results", 4096, 4096); + switch (query->Base.Target) { case GL_TIME_ELAPSED: /* For timestamp queries, we record the starting time right away so that @@ -220,36 +271,21 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q) * obtain the time elapsed. Notably, this includes time elapsed while * the system was doing other work, such as running other applications. */ - drm_intel_bo_unreference(query->bo); - query->bo = drm_intel_bo_alloc(intel->bufmgr, "timer query", 4096, 4096); write_timestamp(intel, query->bo, 0); break; case GL_ANY_SAMPLES_PASSED: case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: case GL_SAMPLES_PASSED_ARB: - /* Since we're starting a new query, we need to be sure to throw away - * any previous occlusion query results. - */ - drm_intel_bo_unreference(query->bo); - query->bo = drm_intel_bo_alloc(intel->bufmgr, "occl. query", 4096, 4096); write_depth_count(intel, query->bo, 0); break; case GL_PRIMITIVES_GENERATED: - /* We don't actually query the hardware for this value; we keep track of - * it a software counter. So just reset the counter. - */ - brw->sol.primitives_generated = 0; - brw->sol.counting_primitives_generated = true; + write_primitives_generated(intel, query->bo, 0); break; case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: - /* We don't actually query the hardware for this value; we keep track of - * it a software counter. So just reset the counter. 
- */ - brw->sol.primitives_written = 0; - brw->sol.counting_primitives_written = true; + write_xfb_primitives_written(intel, query->bo, 0); break; default: @@ -269,7 +305,6 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q) static void gen6_end_query(struct gl_context *ctx, struct gl_query_object *q) { - struct brw_context *brw = brw_context(ctx); struct intel_context *intel = intel_context(ctx); struct brw_query_object *query = (struct brw_query_object *)q; @@ -285,21 +320,11 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q) break; case GL_PRIMITIVES_GENERATED: - /* We don't actually query the hardware for this value; we keep track of - * it in a software counter. So just read the counter and store it in - * the query object. - */ - query->Base.Result = brw->sol.primitives_generated; - brw->sol.counting_primitives_generated = false; + write_primitives_generated(intel, query->bo, 1); break; case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: - /* We don't actually query the hardware for this value; we keep track of - * it in a software counter. So just read the counter and store it in - * the query object. - */ - query->Base.Result = brw->sol.primitives_written; - brw->sol.counting_primitives_written = false; + write_xfb_primitives_written(intel, query->bo, 1); break; default: