X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;ds=sidebyside;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fgen6_queryobj.c;h=d508c4c9278f78a7e66bcce301287eb6668833b4;hb=a7e9b31d5bf98bdaabbb8b5c2459eb2c3a0af579;hp=56e9d5db9374f6c4f8cc2c4e709b75923a840158;hpb=32a3f5f6d768e5828be1d1f46b1b3f819f55cba8;p=mesa.git

diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index 56e9d5db937..d508c4c9278 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -39,61 +39,6 @@
 #include "intel_batchbuffer.h"
 #include "intel_reg.h"
 
-/**
- * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer.
- */
-static void
-write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
-{
-   /* Emit workaround flushes: */
-   if (brw->gen == 6) {
-      /* The timestamp write below is a non-zero post-sync op, which on
-       * Gen6 necessitates a CS stall.  CS stalls need stall at scoreboard
-       * set.  See the comments for intel_emit_post_sync_nonzero_flush().
-       */
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
-      OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-
-   BEGIN_BATCH(5);
-   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
-   OUT_BATCH(PIPE_CONTROL_WRITE_TIMESTAMP);
-   OUT_RELOC(query_bo,
-             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-             PIPE_CONTROL_GLOBAL_GTT_WRITE |
-             idx * sizeof(uint64_t));
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
-}
-
-/**
- * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer.
- */
-static void
-write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
-{
-   /* Emit Sandybridge workaround flush: */
-   if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
-
-   BEGIN_BATCH(5);
-   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
-   OUT_BATCH(PIPE_CONTROL_DEPTH_STALL |
-             PIPE_CONTROL_WRITE_DEPTH_COUNT);
-   OUT_RELOC(query_bo,
-             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-             PIPE_CONTROL_GLOBAL_GTT_WRITE |
-             (idx * sizeof(uint64_t)));
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
-}
-
 /*
  * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
  *
@@ -112,43 +57,112 @@ brw_store_register_mem64(struct brw_context *brw,
    /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
     * read a full 64-bit register, we need to do two of them.
     */
-   BEGIN_BATCH(3);
-   OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
-   OUT_BATCH(reg);
-   OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-             idx * sizeof(uint64_t));
-   ADVANCE_BATCH();
-
-   BEGIN_BATCH(3);
-   OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
-   OUT_BATCH(reg + sizeof(uint32_t));
-   OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-             sizeof(uint32_t) + idx * sizeof(uint64_t));
-   ADVANCE_BATCH();
+   if (brw->gen >= 8) {
+      BEGIN_BATCH(8);
+      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
+      OUT_BATCH(reg);
+      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  idx * sizeof(uint64_t));
+      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
+      OUT_BATCH(reg + sizeof(uint32_t));
+      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  sizeof(uint32_t) + idx * sizeof(uint64_t));
+      ADVANCE_BATCH();
+   } else {
+      BEGIN_BATCH(6);
+      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
+      OUT_BATCH(reg);
+      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                idx * sizeof(uint64_t));
+      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
+      OUT_BATCH(reg + sizeof(uint32_t));
+      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                sizeof(uint32_t) + idx * sizeof(uint64_t));
+      ADVANCE_BATCH();
+   }
 }
 
 static void
 write_primitives_generated(struct brw_context *brw,
-                           drm_intel_bo *query_bo, int idx)
+                           drm_intel_bo *query_bo, int stream, int idx)
 {
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
-   brw_store_register_mem64(brw, query_bo, CL_INVOCATION_COUNT, idx);
+   if (brw->gen >= 7 && stream > 0) {
+      brw_store_register_mem64(brw, query_bo,
+                               GEN7_SO_PRIM_STORAGE_NEEDED(stream), idx);
+   } else {
+      brw_store_register_mem64(brw, query_bo, CL_INVOCATION_COUNT, idx);
+   }
 }
 
 static void
 write_xfb_primitives_written(struct brw_context *brw,
-                             drm_intel_bo *bo, int idx)
+                             drm_intel_bo *bo, int stream, int idx)
 {
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    if (brw->gen >= 7) {
-      brw_store_register_mem64(brw, bo, GEN7_SO_NUM_PRIMS_WRITTEN(0), idx);
+      brw_store_register_mem64(brw, bo, GEN7_SO_NUM_PRIMS_WRITTEN(stream), idx);
    } else {
       brw_store_register_mem64(brw, bo, GEN6_SO_NUM_PRIMS_WRITTEN, idx);
    }
 }
 
+static inline int
+pipeline_target_to_index(int target)
+{
+   if (target == GL_GEOMETRY_SHADER_INVOCATIONS)
+      return MAX_PIPELINE_STATISTICS - 1;
+   else
+      return target - GL_VERTICES_SUBMITTED_ARB;
+}
+
+static void
+emit_pipeline_stat(struct brw_context *brw, drm_intel_bo *bo,
+                   int stream, int target, int idx)
+{
+   /* One source of confusion is the tessellation shader statistics. The
+    * hardware has no statistics specific to the TE unit. Ideally we could
+    * use the HS primitives as the register for
+    * TESS_CONTROL_SHADER_PATCHES_ARB, and the DS invocations as the register
+    * for TESS_EVALUATION_SHADER_INVOCATIONS_ARB.
+    * Unfortunately we don't have HS primitives, we only have HS invocations.
+    */
+
+   /* Everything except GEOMETRY_SHADER_INVOCATIONS can be kept in a simple
+    * lookup table.
+    */
+   static const uint32_t target_to_register[] = {
+      IA_VERTICES_COUNT,   /* VERTICES_SUBMITTED */
+      IA_PRIMITIVES_COUNT, /* PRIMITIVES_SUBMITTED */
+      VS_INVOCATION_COUNT, /* VERTEX_SHADER_INVOCATIONS */
+      HS_INVOCATION_COUNT, /* TESS_CONTROL_SHADER_PATCHES */
+      DS_INVOCATION_COUNT, /* TESS_EVALUATION_SHADER_INVOCATIONS */
+      GS_PRIMITIVES_COUNT, /* GEOMETRY_SHADER_PRIMITIVES_EMITTED */
+      PS_INVOCATION_COUNT, /* FRAGMENT_SHADER_INVOCATIONS */
+      CS_INVOCATION_COUNT, /* COMPUTE_SHADER_INVOCATIONS */
+      CL_INVOCATION_COUNT, /* CLIPPING_INPUT_PRIMITIVES */
+      CL_PRIMITIVES_COUNT, /* CLIPPING_OUTPUT_PRIMITIVES */
+      GS_INVOCATION_COUNT  /* This one is special... */
+   };
+   STATIC_ASSERT(ARRAY_SIZE(target_to_register) == MAX_PIPELINE_STATISTICS);
+   uint32_t reg = target_to_register[pipeline_target_to_index(target)];
+   /* Gen6 GS code counts full primitives, that is, it won't count individual
+    * triangles in a triangle strip. Use CL_INVOCATION_COUNT for that.
+    */
+   if (brw->gen == 6 && target == GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB)
+      reg = CL_INVOCATION_COUNT;
+   assert(reg != 0);
+
+   /* Emit a flush to make sure various parts of the pipeline are complete
+    * and we get an accurate value.
+    */
+   brw_emit_mi_flush(brw);
+
+   brw_store_register_mem64(brw, bo, reg, idx);
+}
+
+
 /**
  * Wait on the query object's BO and calculate the final result.
  */
@@ -161,20 +175,7 @@ gen6_queryobj_get_results(struct gl_context *ctx,
    if (query->bo == NULL)
       return;
 
-   /* If the application has requested the query result, but this batch is
-    * still contributing to it, flush it now so the results will be present
-    * when mapped.
-    */
-   if (drm_intel_bo_references(brw->batch.bo, query->bo))
-      intel_batchbuffer_flush(brw);
-
-   if (unlikely(brw->perf_debug)) {
-      if (drm_intel_bo_busy(query->bo)) {
-         perf_debug("Stalling on the GPU waiting for a query object.\n");
-      }
-   }
-
-   drm_intel_bo_map(query->bo, false);
+   brw_bo_map(brw, query->bo, false, "query object");
    uint64_t *results = query->bo->virtual;
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED:
@@ -222,12 +223,37 @@ gen6_queryobj_get_results(struct gl_context *ctx,
 
    case GL_PRIMITIVES_GENERATED:
    case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
+   case GL_VERTICES_SUBMITTED_ARB:
+   case GL_PRIMITIVES_SUBMITTED_ARB:
+   case GL_VERTEX_SHADER_INVOCATIONS_ARB:
+   case GL_GEOMETRY_SHADER_INVOCATIONS:
+   case GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB:
+   case GL_CLIPPING_INPUT_PRIMITIVES_ARB:
+   case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB:
+   case GL_COMPUTE_SHADER_INVOCATIONS_ARB:
+   case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
+   case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
       query->Base.Result = results[1] - results[0];
       break;
 
-   default:
-      assert(!"Unrecognized query target in brw_queryobj_get_results()");
+   case GL_FRAGMENT_SHADER_INVOCATIONS_ARB:
+      query->Base.Result = results[1] - results[0];
+      /* Implement the "WaDividePSInvocationCountBy4:HSW,BDW" workaround:
+       * "Invocation counter is 4 times actual.  WA: SW to divide HW reported
+       *  PS Invocations value by 4."
+       *
+       * Prior to Haswell, invocation count was counted by the WM, and it
+       * buggily counted invocations in units of subspans (2x2 unit). To get
+       * the correct value, the CS multiplied this by 4. With HSW the logic
+       * moved, and correctly emitted the number of pixel shader invocations,
+       * but whoever moved it forgot to undo the multiply by 4.
+       */
+      if (brw->gen == 8 || brw->is_haswell)
+         query->Base.Result /= 4;
       break;
+
+   default:
+      unreachable("Unrecognized query target in brw_queryobj_get_results()");
    }
 
    drm_intel_bo_unmap(query->bo);
@@ -236,6 +262,8 @@
     */
    drm_intel_bo_unreference(query->bo);
    query->bo = NULL;
+
+   query->Base.Ready = true;
 }
 
 /**
@@ -275,26 +303,39 @@
     * obtain the time elapsed.  Notably, this includes time elapsed while
     * the system was doing other work, such as running other applications.
     */
-      write_timestamp(brw, query->bo, 0);
+      brw_write_timestamp(brw, query->bo, 0);
       break;
 
    case GL_ANY_SAMPLES_PASSED:
    case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
    case GL_SAMPLES_PASSED_ARB:
-      write_depth_count(brw, query->bo, 0);
+      brw_write_depth_count(brw, query->bo, 0);
       break;
 
    case GL_PRIMITIVES_GENERATED:
-      write_primitives_generated(brw, query->bo, 0);
+      write_primitives_generated(brw, query->bo, query->Base.Stream, 0);
       break;
 
    case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
-      write_xfb_primitives_written(brw, query->bo, 0);
+      write_xfb_primitives_written(brw, query->bo, query->Base.Stream, 0);
       break;
 
-   default:
-      assert(!"Unrecognized query target in brw_begin_query()");
+   case GL_VERTICES_SUBMITTED_ARB:
+   case GL_PRIMITIVES_SUBMITTED_ARB:
+   case GL_VERTEX_SHADER_INVOCATIONS_ARB:
+   case GL_GEOMETRY_SHADER_INVOCATIONS:
+   case GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB:
+   case GL_FRAGMENT_SHADER_INVOCATIONS_ARB:
+   case GL_CLIPPING_INPUT_PRIMITIVES_ARB:
+   case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB:
+   case GL_COMPUTE_SHADER_INVOCATIONS_ARB:
+   case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
+   case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
+      emit_pipeline_stat(brw, query->bo, query->Base.Stream, query->Base.Target, 0);
       break;
+
+   default:
+      unreachable("Unrecognized query target in brw_begin_query()");
    }
 }
 
@@ -314,27 +355,62 @@
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED:
-      write_timestamp(brw, query->bo, 1);
+      brw_write_timestamp(brw, query->bo, 1);
       break;
 
    case GL_ANY_SAMPLES_PASSED:
    case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
    case GL_SAMPLES_PASSED_ARB:
-      write_depth_count(brw, query->bo, 1);
+      brw_write_depth_count(brw, query->bo, 1);
       break;
 
    case GL_PRIMITIVES_GENERATED:
-      write_primitives_generated(brw, query->bo, 1);
+      write_primitives_generated(brw, query->bo, query->Base.Stream, 1);
       break;
 
    case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
-      write_xfb_primitives_written(brw, query->bo, 1);
+      write_xfb_primitives_written(brw, query->bo, query->Base.Stream, 1);
       break;
 
-   default:
-      assert(!"Unrecognized query target in brw_end_query()");
+   case GL_VERTICES_SUBMITTED_ARB:
+   case GL_PRIMITIVES_SUBMITTED_ARB:
+   case GL_VERTEX_SHADER_INVOCATIONS_ARB:
+   case GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB:
+   case GL_FRAGMENT_SHADER_INVOCATIONS_ARB:
+   case GL_COMPUTE_SHADER_INVOCATIONS_ARB:
+   case GL_CLIPPING_INPUT_PRIMITIVES_ARB:
+   case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB:
+   case GL_GEOMETRY_SHADER_INVOCATIONS:
+   case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
+   case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
+      emit_pipeline_stat(brw, query->bo,
+                         query->Base.Stream, query->Base.Target, 1);
       break;
+
+   default:
+      unreachable("Unrecognized query target in brw_end_query()");
    }
+
+   /* The current batch contains the commands to handle EndQuery(),
+    * but they won't actually execute until it is flushed.
+    */
+   query->flushed = false;
+}
+
+/**
+ * Flush the batch if it still references the query object BO.
+ */
+static void
+flush_batch_if_needed(struct brw_context *brw, struct brw_query_object *query)
+{
+   /* If the batch doesn't reference the BO, it must have been flushed
+    * (for example, due to being full).  Record that it's been flushed.
+    */
+   query->flushed = query->flushed ||
+      !drm_intel_bo_references(brw->batch.bo, query->bo);
+
+   if (!query->flushed)
+      intel_batchbuffer_flush(brw);
 }
 
 /**
@@ -345,10 +421,16 @@
 static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q)
 {
+   struct brw_context *brw = brw_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
+   /* If the application has requested the query result, but this batch is
+    * still contributing to it, flush it now to finish that work so the
+    * result will become available (eventually).
+    */
+   flush_batch_if_needed(brw, query);
+
    gen6_queryobj_get_results(ctx, query);
-   query->Base.Ready = true;
 }
 
 /**
@@ -362,6 +444,12 @@ static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
    struct brw_context *brw = brw_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
+   /* If query->bo is NULL, we've already gathered the results - this is a
+    * redundant CheckQuery call.  Ignore it.
+    */
+   if (query->bo == NULL)
+      return;
+
    /* From the GL_ARB_occlusion_query spec:
     *
     *     "Instead of allowing for an infinite loop, performing a
     *      QUERY_RESULT_AVAILABLE_ARB will perform a flush if the result is
     *      not ready yet on the first time it is queried.  This ensures that
     *      the async query will return true in finite time."
    */
-   if (query->bo && drm_intel_bo_references(brw->batch.bo, query->bo))
-      intel_batchbuffer_flush(brw);
+   flush_batch_if_needed(brw, query);
 
-   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
+   if (!drm_intel_bo_busy(query->bo)) {
       gen6_queryobj_get_results(ctx, query);
-      query->Base.Ready = true;
    }
 }
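
Two standalone C sketches follow. They are editorial illustrations, not part of the patch, and every value and helper name in them is invented.

brw_store_register_mem64() works around the 32-bit payload of MI_STORE_REGISTER_MEM by emitting two stores per counter slot: the low dword of the register goes to byte offset idx * 8 in the query BO, the high dword to idx * 8 + 4 (on Gen8+ each command is a dword longer because relocations become 64-bit, hence OUT_RELOC64 and the (4 - 2) length field). The sketch below models that layout with a plain byte buffer standing in for the drm_intel_bo, assuming a little-endian host, which is what i965 hardware provides:

/* Standalone sketch: emulate the BO layout produced by the two 32-bit
 * MI_STORE_REGISTER_MEM stores in brw_store_register_mem64(), and check
 * that reading a slot back as a uint64_t recombines the halves.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void
store_register_mem64(uint8_t *bo, int idx, uint64_t reg_value)
{
   uint32_t lo = (uint32_t) reg_value;          /* first store: low dword */
   uint32_t hi = (uint32_t) (reg_value >> 32);  /* second store: +4 bytes */

   memcpy(bo + idx * sizeof(uint64_t), &lo, sizeof(lo));
   memcpy(bo + idx * sizeof(uint64_t) + sizeof(uint32_t), &hi, sizeof(hi));
}

int
main(void)
{
   uint8_t bo[2 * sizeof(uint64_t)] = {0};   /* begin/end snapshot slots */
   uint64_t results[2];

   /* Invented counter values for the BeginQuery/EndQuery snapshots. */
   store_register_mem64(bo, 0, 0x123456789abcdef0ull);
   store_register_mem64(bo, 1, 0x123456789abcdff4ull);

   memcpy(results, bo, sizeof(results));
   assert(results[1] - results[0] == 0x104);   /* halves recombined */
   return 0;
}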
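The query BO therefore holds a pair of uint64_t snapshots per counter: BeginQuery() writes slot 0, EndQuery() writes slot 1, and gen6_queryobj_get_results() reports the difference. For GL_FRAGMENT_SHADER_INVOCATIONS_ARB the patch additionally applies the "WaDividePSInvocationCountBy4:HSW,BDW" fixup, since Haswell and Gen8 report four times the real PS invocation count. A sketch of that arithmetic, with made-up snapshot values and a simplified (gen, is_haswell) pair standing in for the brw_context fields:

/* Standalone sketch: the result computation performed by
 * gen6_queryobj_get_results() for the PS invocations counter,
 * including the HSW/BDW divide-by-4 workaround.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t
ps_invocations_result(const uint64_t results[2], int gen, bool is_haswell)
{
   /* EndQuery snapshot minus BeginQuery snapshot. */
   uint64_t result = results[1] - results[0];

   /* HSW and BDW count 4x the actual PS invocations; undo it here. */
   if (gen == 8 || is_haswell)
      result /= 4;

   return result;
}

int
main(void)
{
   /* Invented snapshots: 1000 invocations before, 9000 after. */
   const uint64_t results[2] = { 1000, 9000 };

   /* Haswell is gen 7 with is_haswell set: (9000 - 1000) / 4 = 2000. */
   printf("%llu\n",
          (unsigned long long) ps_invocations_result(results, 7, true));
   return 0;
}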