From 1973845fbd0af05dc252ead57fae39d1f866dd9e Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 16 Oct 2012 11:13:49 -0700
Subject: [PATCH] i965: Don't flush the batch immediately on EndQuery.

The theory I had when I wrote the code was that you wanted to minimize latency
on your queries because the app was going to ask soon.  Only, it turns out
that everybody batches up their queries and asks for the results later (often
after the next SwapBuffers!), so this was a pessimization.

Until now, I had no workload where it mattered enough to benchmark.  Recently
I started playing some Minecraft, which uses tons of queries to decide whether
to render chunks of the terrain.  For that app, avoiding the flush in the
query-generation loop improves performance 22.7% +/- 4.7% (n=3) on an apitrace
capture of it (confirmed in game by watching the fps meter found by pressing
F3, 15/16 -> 20/21 fps).

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_queryobj.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index d5c4fdf2725..89420e95320 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -142,6 +142,9 @@ brw_queryobj_get_results(struct gl_context *ctx,
    if (query->bo == NULL)
       return;
 
+   if (drm_intel_bo_references(intel->batch.bo, query->bo))
+      intel_batchbuffer_flush(intel);
+
    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
       if (drm_intel_bo_busy(query->bo)) {
          perf_debug("Stalling on the GPU waiting for a query object.\n");
@@ -303,13 +306,8 @@ brw_end_query(struct gl_context *ctx, struct gl_query_object *q)
       break;
 
    case GL_SAMPLES_PASSED_ARB:
-      /* Flush the batchbuffer in case it has writes to our query BO.
-       * Have later queries write to a new query BO so that further rendering
-       * doesn't delay the collection of our results.
-       */
       if (query->bo) {
 	 brw_emit_query_end(brw);
-	 intel_batchbuffer_flush(intel);
 
 	 drm_intel_bo_unreference(brw->query.bo);
 	 brw->query.bo = NULL;
@@ -364,8 +362,19 @@ static void brw_wait_query(struct gl_context *ctx, struct gl_query_object *q)
 
 static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q)
 {
+   struct intel_context *intel = intel_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
+   /* From the GL_ARB_occlusion_query spec:
+    *
+    *     "Instead of allowing for an infinite loop, performing a
+    *      QUERY_RESULT_AVAILABLE_ARB will perform a flush if the result is
+    *      not ready yet on the first time it is queried.  This ensures that
+    *      the async query will return true in finite time.
+    */
+   if (query->bo && drm_intel_bo_references(intel->batch.bo, query->bo))
+      intel_batchbuffer_flush(intel);
+
    if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
       brw_queryobj_get_results(ctx, query);
       query->Base.Ready = true;
-- 
2.30.2