X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;ds=sidebyside;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fgen6_queryobj.c;h=f36f095454c6318725daaa0ab6c5b62134f63d82;hb=2db9dd5aeb9566c8480651989981cb1169957748;hp=a3af19e15359f2e8e88089004e0fb9f684e2ad68;hpb=329779a0b45b63be17627f026533c80b2c8f7991;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c index a3af19e1535..f36f095454c 100644 --- a/src/mesa/drivers/dri/i965/gen6_queryobj.c +++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c @@ -37,118 +37,122 @@ #include "brw_defines.h" #include "brw_state.h" #include "intel_batchbuffer.h" +#include "intel_buffer_objects.h" #include "intel_reg.h" -/** - * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. - */ -static void -write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx) +static inline void +set_query_availability(struct brw_context *brw, struct brw_query_object *query, + bool available) { - struct intel_context *intel = &brw->intel; - /* Emit workaround flushes: */ - if (intel->gen == 6) { - /* The timestamp write below is a non-zero post-sync op, which on - * Gen6 necessitates a CS stall. CS stalls need stall at scoreboard - * set. See the comments for intel_emit_post_sync_nonzero_flush(). - */ - BEGIN_BATCH(4); - OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2)); - OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD); - OUT_BATCH(0); - OUT_BATCH(0); - ADVANCE_BATCH(); + /* For platforms that support ARB_query_buffer_object, we write the + * query availability for "pipelined" queries. + * + * Most counter snapshots are written by the command streamer, by + * doing a CS stall and then MI_STORE_REGISTER_MEM. For these + * counters, the CS stall guarantees that the results will be + * available when subsequent CS commands run. So we don't need to + * do any additional tracking. + * + * Other counters (occlusion queries and timestamp) are written by + * PIPE_CONTROL, without a CS stall. This means that we can't be + * sure whether the writes have landed yet or not. Performing a + * PIPE_CONTROL with an immediate write will synchronize with + * those earlier writes, so we write 1 when the value has landed. + */ + if (brw->ctx.Extensions.ARB_query_buffer_object && + brw_is_query_pipelined(query)) { + brw_emit_pipe_control_write(brw, + PIPE_CONTROL_WRITE_IMMEDIATE, + query->bo, 2 * sizeof(uint64_t), + available, 0); } - - BEGIN_BATCH(5); - OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2)); - OUT_BATCH(PIPE_CONTROL_WRITE_TIMESTAMP); - OUT_RELOC(query_bo, - I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - PIPE_CONTROL_GLOBAL_GTT_WRITE | - idx * sizeof(uint64_t)); - OUT_BATCH(0); - OUT_BATCH(0); - ADVANCE_BATCH(); } -/** - * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer. 
- */ static void -write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx) +write_primitives_generated(struct brw_context *brw, + drm_intel_bo *query_bo, int stream, int idx) { - struct intel_context *intel = &brw->intel; - /* Emit Sandybridge workaround flush: */ - if (intel->gen == 6) - intel_emit_post_sync_nonzero_flush(brw); - - BEGIN_BATCH(5); - OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2)); - OUT_BATCH(PIPE_CONTROL_DEPTH_STALL | - PIPE_CONTROL_WRITE_DEPTH_COUNT); - OUT_RELOC(query_bo, - I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - PIPE_CONTROL_GLOBAL_GTT_WRITE | - (idx * sizeof(uint64_t))); - OUT_BATCH(0); - OUT_BATCH(0); - ADVANCE_BATCH(); + brw_emit_mi_flush(brw); + + if (brw->gen >= 7 && stream > 0) { + brw_store_register_mem64(brw, query_bo, + GEN7_SO_PRIM_STORAGE_NEEDED(stream), + idx * sizeof(uint64_t)); + } else { + brw_store_register_mem64(brw, query_bo, CL_INVOCATION_COUNT, + idx * sizeof(uint64_t)); + } } -/* - * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM. - * - * Only TIMESTAMP and PS_DEPTH_COUNT have special PIPE_CONTROL support; other - * counters have to be read via the generic MI_STORE_REGISTER_MEM. This - * function also performs a pipeline flush for proper synchronization. - */ static void -write_reg(struct brw_context *brw, - drm_intel_bo *query_bo, uint32_t reg, int idx) +write_xfb_primitives_written(struct brw_context *brw, + drm_intel_bo *bo, int stream, int idx) { - struct intel_context *intel = &brw->intel; - assert(intel->gen >= 6); - - intel_batchbuffer_emit_mi_flush(brw); + brw_emit_mi_flush(brw); - /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to - * read a full 64-bit register, we need to do two of them. - */ - BEGIN_BATCH(3); - OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); - OUT_BATCH(reg); - OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - idx * sizeof(uint64_t)); - ADVANCE_BATCH(); - - BEGIN_BATCH(3); - OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); - OUT_BATCH(reg + sizeof(uint32_t)); - OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - sizeof(uint32_t) + idx * sizeof(uint64_t)); - ADVANCE_BATCH(); + if (brw->gen >= 7) { + brw_store_register_mem64(brw, bo, GEN7_SO_NUM_PRIMS_WRITTEN(stream), + idx * sizeof(uint64_t)); + } else { + brw_store_register_mem64(brw, bo, GEN6_SO_NUM_PRIMS_WRITTEN, + idx * sizeof(uint64_t)); + } } -static void -write_primitives_generated(struct brw_context *brw, - drm_intel_bo *query_bo, int idx) +static inline const int +pipeline_target_to_index(int target) { - write_reg(brw, query_bo, CL_INVOCATION_COUNT, idx); + if (target == GL_GEOMETRY_SHADER_INVOCATIONS) + return MAX_PIPELINE_STATISTICS - 1; + else + return target - GL_VERTICES_SUBMITTED_ARB; } static void -write_xfb_primitives_written(struct brw_context *brw, - drm_intel_bo *query_bo, int idx) +emit_pipeline_stat(struct brw_context *brw, drm_intel_bo *bo, + int stream, int target, int idx) { - struct intel_context *intel = &brw->intel; - if (intel->gen >= 7) { - write_reg(brw, query_bo, SO_NUM_PRIMS_WRITTEN0_IVB, idx); - } else { - write_reg(brw, query_bo, SO_NUM_PRIMS_WRITTEN, idx); - } + /* One source of confusion is the tessellation shader statistics. The + * hardware has no statistics specific to the TE unit. Ideally we could have + * the HS primitives for TESS_CONTROL_SHADER_PATCHES_ARB, and the DS + * invocations as the register for TESS_CONTROL_SHADER_PATCHES_ARB. + * Unfortunately we don't have HS primitives, we only have HS invocations. 
+ */ + + /* Everything except GEOMETRY_SHADER_INVOCATIONS can be kept in a simple + * lookup table + */ + static const uint32_t target_to_register[] = { + IA_VERTICES_COUNT, /* VERTICES_SUBMITTED */ + IA_PRIMITIVES_COUNT, /* PRIMITIVES_SUBMITTED */ + VS_INVOCATION_COUNT, /* VERTEX_SHADER_INVOCATIONS */ + HS_INVOCATION_COUNT, /* TESS_CONTROL_SHADER_PATCHES */ + DS_INVOCATION_COUNT, /* TESS_EVALUATION_SHADER_INVOCATIONS */ + GS_PRIMITIVES_COUNT, /* GEOMETRY_SHADER_PRIMITIVES_EMITTED */ + PS_INVOCATION_COUNT, /* FRAGMENT_SHADER_INVOCATIONS */ + CS_INVOCATION_COUNT, /* COMPUTE_SHADER_INVOCATIONS */ + CL_INVOCATION_COUNT, /* CLIPPING_INPUT_PRIMITIVES */ + CL_PRIMITIVES_COUNT, /* CLIPPING_OUTPUT_PRIMITIVES */ + GS_INVOCATION_COUNT /* This one is special... */ + }; + STATIC_ASSERT(ARRAY_SIZE(target_to_register) == MAX_PIPELINE_STATISTICS); + uint32_t reg = target_to_register[pipeline_target_to_index(target)]; + /* Gen6 GS code counts full primitives, that is, it won't count individual + * triangles in a triangle strip. Use CL_INVOCATION_COUNT for that. + */ + if (brw->gen == 6 && target == GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB) + reg = CL_INVOCATION_COUNT; + assert(reg != 0); + + /* Emit a flush to make sure various parts of the pipeline are complete and + * we get an accurate value + */ + brw_emit_mi_flush(brw); + + brw_store_register_mem64(brw, bo, reg, idx * sizeof(uint64_t)); } + /** * Wait on the query object's BO and calculate the final result. */ @@ -157,25 +161,11 @@ gen6_queryobj_get_results(struct gl_context *ctx, struct brw_query_object *query) { struct brw_context *brw = brw_context(ctx); - struct intel_context *intel = intel_context(ctx); if (query->bo == NULL) return; - /* If the application has requested the query result, but this batch is - * still contributing to it, flush it now so the results will be present - * when mapped. - */ - if (drm_intel_bo_references(brw->batch.bo, query->bo)) - intel_batchbuffer_flush(brw); - - if (unlikely(intel->perf_debug)) { - if (drm_intel_bo_busy(query->bo)) { - perf_debug("Stalling on the GPU waiting for a query object.\n"); - } - } - - drm_intel_bo_map(query->bo, false); + brw_bo_map(brw, query->bo, false, "query object"); uint64_t *results = query->bo->virtual; switch (query->Base.Target) { case GL_TIME_ELAPSED: @@ -223,12 +213,37 @@ gen6_queryobj_get_results(struct gl_context *ctx, case GL_PRIMITIVES_GENERATED: case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: + case GL_VERTICES_SUBMITTED_ARB: + case GL_PRIMITIVES_SUBMITTED_ARB: + case GL_VERTEX_SHADER_INVOCATIONS_ARB: + case GL_GEOMETRY_SHADER_INVOCATIONS: + case GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB: + case GL_CLIPPING_INPUT_PRIMITIVES_ARB: + case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB: + case GL_COMPUTE_SHADER_INVOCATIONS_ARB: + case GL_TESS_CONTROL_SHADER_PATCHES_ARB: + case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB: query->Base.Result = results[1] - results[0]; break; - default: - assert(!"Unrecognized query target in brw_queryobj_get_results()"); + case GL_FRAGMENT_SHADER_INVOCATIONS_ARB: + query->Base.Result = (results[1] - results[0]); + /* Implement the "WaDividePSInvocationCountBy4:HSW,BDW" workaround: + * "Invocation counter is 4 times actual. WA: SW to divide HW reported + * PS Invocations value by 4." + * + * Prior to Haswell, invocation count was counted by the WM, and it + * buggily counted invocations in units of subspans (2x2 unit). To get the + * correct value, the CS multiplied this by 4. 
With HSW the logic moved, + * and correctly emitted the number of pixel shader invocations, but, + * whomever forgot to undo the multiply by 4. + */ + if (brw->gen == 8 || brw->is_haswell) + query->Base.Result /= 4; break; + + default: + unreachable("Unrecognized query target in brw_queryobj_get_results()"); } drm_intel_bo_unmap(query->bo); @@ -237,6 +252,8 @@ gen6_queryobj_get_results(struct gl_context *ctx, */ drm_intel_bo_unreference(query->bo); query->bo = NULL; + + query->Base.Ready = true; } /** @@ -255,6 +272,9 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q) drm_intel_bo_unreference(query->bo); query->bo = drm_intel_bo_alloc(brw->bufmgr, "query results", 4096, 4096); + /* For ARB_query_buffer_object: The result is not available */ + set_query_availability(brw, query, false); + switch (query->Base.Target) { case GL_TIME_ELAPSED: /* For timestamp queries, we record the starting time right away so that @@ -276,26 +296,39 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q) * obtain the time elapsed. Notably, this includes time elapsed while * the system was doing other work, such as running other applications. */ - write_timestamp(brw, query->bo, 0); + brw_write_timestamp(brw, query->bo, 0); break; case GL_ANY_SAMPLES_PASSED: case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: case GL_SAMPLES_PASSED_ARB: - write_depth_count(brw, query->bo, 0); + brw_write_depth_count(brw, query->bo, 0); break; case GL_PRIMITIVES_GENERATED: - write_primitives_generated(brw, query->bo, 0); + write_primitives_generated(brw, query->bo, query->Base.Stream, 0); break; case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: - write_xfb_primitives_written(brw, query->bo, 0); + write_xfb_primitives_written(brw, query->bo, query->Base.Stream, 0); break; - default: - assert(!"Unrecognized query target in brw_begin_query()"); + case GL_VERTICES_SUBMITTED_ARB: + case GL_PRIMITIVES_SUBMITTED_ARB: + case GL_VERTEX_SHADER_INVOCATIONS_ARB: + case GL_GEOMETRY_SHADER_INVOCATIONS: + case GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB: + case GL_FRAGMENT_SHADER_INVOCATIONS_ARB: + case GL_CLIPPING_INPUT_PRIMITIVES_ARB: + case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB: + case GL_COMPUTE_SHADER_INVOCATIONS_ARB: + case GL_TESS_CONTROL_SHADER_PATCHES_ARB: + case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB: + emit_pipeline_stat(brw, query->bo, query->Base.Stream, query->Base.Target, 0); break; + + default: + unreachable("Unrecognized query target in brw_begin_query()"); } } @@ -315,27 +348,65 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q) switch (query->Base.Target) { case GL_TIME_ELAPSED: - write_timestamp(brw, query->bo, 1); + brw_write_timestamp(brw, query->bo, 1); break; case GL_ANY_SAMPLES_PASSED: case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: case GL_SAMPLES_PASSED_ARB: - write_depth_count(brw, query->bo, 1); + brw_write_depth_count(brw, query->bo, 1); break; case GL_PRIMITIVES_GENERATED: - write_primitives_generated(brw, query->bo, 1); + write_primitives_generated(brw, query->bo, query->Base.Stream, 1); break; case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: - write_xfb_primitives_written(brw, query->bo, 1); + write_xfb_primitives_written(brw, query->bo, query->Base.Stream, 1); break; - default: - assert(!"Unrecognized query target in brw_end_query()"); + case GL_VERTICES_SUBMITTED_ARB: + case GL_PRIMITIVES_SUBMITTED_ARB: + case GL_VERTEX_SHADER_INVOCATIONS_ARB: + case GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB: + case GL_FRAGMENT_SHADER_INVOCATIONS_ARB: + case GL_COMPUTE_SHADER_INVOCATIONS_ARB: + case 
GL_CLIPPING_INPUT_PRIMITIVES_ARB: + case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB: + case GL_GEOMETRY_SHADER_INVOCATIONS: + case GL_TESS_CONTROL_SHADER_PATCHES_ARB: + case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB: + emit_pipeline_stat(brw, query->bo, + query->Base.Stream, query->Base.Target, 1); break; + + default: + unreachable("Unrecognized query target in brw_end_query()"); } + + /* The current batch contains the commands to handle EndQuery(), + * but they won't actually execute until it is flushed. + */ + query->flushed = false; + + /* For ARB_query_buffer_object: The result is now available */ + set_query_availability(brw, query, true); +} + +/** + * Flush the batch if it still references the query object BO. + */ +static void +flush_batch_if_needed(struct brw_context *brw, struct brw_query_object *query) +{ + /* If the batch doesn't reference the BO, it must have been flushed + * (for example, due to being full). Record that it's been flushed. + */ + query->flushed = query->flushed || + !drm_intel_bo_references(brw->batch.bo, query->bo); + + if (!query->flushed) + intel_batchbuffer_flush(brw); } /** @@ -346,10 +417,16 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q) */ static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q) { + struct brw_context *brw = brw_context(ctx); struct brw_query_object *query = (struct brw_query_object *)q; + /* If the application has requested the query result, but this batch is + * still contributing to it, flush it now to finish that work so the + * result will become available (eventually). + */ + flush_batch_if_needed(brw, query); + gen6_queryobj_get_results(ctx, query); - query->Base.Ready = true; } /** @@ -363,6 +440,12 @@ static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q) struct brw_context *brw = brw_context(ctx); struct brw_query_object *query = (struct brw_query_object *)q; + /* If query->bo is NULL, we've already gathered the results - this is a + * redundant CheckQuery call. Ignore it. + */ + if (query->bo == NULL) + return; + /* From the GL_ARB_occlusion_query spec: * * "Instead of allowing for an infinite loop, performing a @@ -370,15 +453,22 @@ static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q) * not ready yet on the first time it is queried. This ensures that * the async query will return true in finite time. */ - if (query->bo && drm_intel_bo_references(brw->batch.bo, query->bo)) - intel_batchbuffer_flush(brw); + flush_batch_if_needed(brw, query); - if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) { + if (!drm_intel_bo_busy(query->bo)) { gen6_queryobj_get_results(ctx, query); - query->Base.Ready = true; } } +static void +gen6_query_counter(struct gl_context *ctx, struct gl_query_object *q) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_query_object *query = (struct brw_query_object *)q; + brw_query_counter(ctx, q); + set_query_availability(brw, query, true); +} + /* Initialize Gen6+-specific query object functions. */ void gen6_init_queryobj_functions(struct dd_function_table *functions) { @@ -386,4 +476,5 @@ void gen6_init_queryobj_functions(struct dd_function_table *functions) functions->EndQuery = gen6_end_query; functions->CheckQuery = gen6_check_query; functions->WaitQuery = gen6_wait_query; + functions->QueryCounter = gen6_query_counter; }
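
Reader's aid, not part of the patch: the changes above all rely on one query BO layout — the begin snapshot at idx 0 (offset 0), the end snapshot at idx 1 (offset 8), and, for ARB_query_buffer_object, an availability word written by set_query_availability() at offset 2 * sizeof(uint64_t). The sketch below shows how a result is derived from that layout, including the divide-by-4 applied to fragment shader invocations on Haswell/Gen8 (the "WaDividePSInvocationCountBy4" workaround in gen6_queryobj_get_results()). The names example_query_layout and example_query_result are hypothetical and exist only for illustration.

/* Illustrative sketch only (hypothetical names); mirrors the query BO
 * layout used by this patch: two 64-bit counter snapshots followed by a
 * 64-bit availability word at offset 2 * sizeof(uint64_t).
 */
#include <stdint.h>
#include <stdbool.h>

struct example_query_layout {
   uint64_t begin;      /* snapshot written at BeginQuery (idx 0) */
   uint64_t end;        /* snapshot written at EndQuery (idx 1) */
   uint64_t available;  /* set to 1 by the availability PIPE_CONTROL write */
};

/* Compute a result the same way gen6_queryobj_get_results() does:
 * end - begin, with the HSW/BDW divide-by-4 workaround applied to
 * fragment shader invocation counts.
 */
uint64_t
example_query_result(const struct example_query_layout *q,
                     bool is_fragment_invocations,
                     bool needs_ps_div4_wa)
{
   uint64_t result = q->end - q->begin;

   if (is_fragment_invocations && needs_ps_div4_wa)
      result /= 4;

   return result;
}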