intel: Add a batch flush between front-buffer downsample and X protocol.

[mesa.git] / src / mesa / drivers / dri / i965 / gen6_queryobj.c
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c

index 3f2ed00f92dfc9cfed7c82c400119870406e9b42..498b1877b6d5abf8952d1fa9ba6c0d3ddc387d17 100644 (file)
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -43,10 +43,10 @@
   * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer.
   */
  static void
-write_timestamp(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
+write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
  {
     /* Emit workaround flushes: */
-   if (intel->gen == 6) {
+   if (brw->gen == 6) {
        /* The timestamp write below is a non-zero post-sync op, which on
         * Gen6 necessitates a CS stall.  CS stalls need stall at scoreboard
         * set.  See the comments for intel_emit_post_sync_nonzero_flush().
@@ -75,11 +75,11 @@ write_timestamp(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
   * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer.
   */
  static void
-write_depth_count(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
+write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
  {
     /* Emit Sandybridge workaround flush: */
-   if (intel->gen == 6)
-      intel_emit_post_sync_nonzero_flush(intel);
+   if (brw->gen == 6)
+      intel_emit_post_sync_nonzero_flush(brw);
  
     BEGIN_BATCH(5);
     OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
@@ -94,6 +94,57 @@ write_depth_count(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
     ADVANCE_BATCH();
  }
  
+/*
+ * Store 64-bit register `reg` into slot idx (byte offset idx * 8) of
+ * query_bo with MI_STORE_REGISTER_MEM, after an MI flush.  Only TIMESTAMP
+ * and PS_DEPTH_COUNT have special PIPE_CONTROL support; other counters
+ * have to be read this way.
+ */
+static void
+write_reg(struct brw_context *brw,
+          drm_intel_bo *query_bo, uint32_t reg, int idx)
+{
+   assert(brw->gen >= 6);
+
+   intel_batchbuffer_emit_mi_flush(brw);
+
+   /* MI_STORE_REGISTER_MEM only stores 32 bits, so a 64-bit register takes
+    * two.  Relocate in the INSTRUCTION domain: on Gen6 the kernel keys the
+    * global GTT binding that MI writes require off that write domain.
+    */
+   BEGIN_BATCH(3);
+   OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
+   OUT_BATCH(reg);
+   OUT_RELOC(query_bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+             idx * sizeof(uint64_t));
+   ADVANCE_BATCH();
+
+   BEGIN_BATCH(3);
+   OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
+   OUT_BATCH(reg + sizeof(uint32_t));
+   OUT_RELOC(query_bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+             sizeof(uint32_t) + idx * sizeof(uint64_t));
+   ADVANCE_BATCH();
+}
+
+/* GL_PRIMITIVES_GENERATED: snapshot CL_INVOCATION_COUNT into slot idx. */
+static void
+write_primitives_generated(struct brw_context *brw,
+                           drm_intel_bo *query_bo, int idx)
+{
+   write_reg(brw, query_bo, CL_INVOCATION_COUNT, idx);
+}
+
+/* Snapshot this generation's SO_NUM_PRIMS_WRITTEN register into slot idx. */
+static void
+write_xfb_primitives_written(struct brw_context *brw,
+                             drm_intel_bo *query_bo, int idx)
+{
+   const uint32_t reg = brw->gen >= 7 ? GEN7_SO_NUM_PRIMS_WRITTEN(0)
+                                      : GEN6_SO_NUM_PRIMS_WRITTEN;
+   write_reg(brw, query_bo, reg, idx);
+}
+
  /**
   * Wait on the query object's BO and calculate the final result.
   */
@@ -101,7 +152,7 @@ static void
  gen6_queryobj_get_results(struct gl_context *ctx,
                            struct brw_query_object *query)
  {
-   struct intel_context *intel = intel_context(ctx);
+   struct brw_context *brw = brw_context(ctx);
  
     if (query->bo == NULL)
        return;
@@ -110,10 +161,10 @@ gen6_queryobj_get_results(struct gl_context *ctx,
      * still contributing to it, flush it now so the results will be present
      * when mapped.
      */
-   if (drm_intel_bo_references(intel->batch.bo, query->bo))
-      intel_batchbuffer_flush(intel);
+   if (drm_intel_bo_references(brw->batch.bo, query->bo))
+      intel_batchbuffer_flush(brw);
  
-   if (unlikely(intel->perf_debug)) {
+   if (unlikely(brw->perf_debug)) {
        if (drm_intel_bo_busy(query->bo)) {
           perf_debug("Stalling on the GPU waiting for a query object.\n");
        }
@@ -167,10 +218,7 @@ gen6_queryobj_get_results(struct gl_context *ctx,
  
     case GL_PRIMITIVES_GENERATED:
     case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
-      /* We don't actually query the hardware for this value, so query->bo
-       * should always be NULL and execution should never reach here.
-       */
-      assert(!"Unreachable");
+      query->Base.Result = results[1] - results[0];
        break;
  
     default:
@@ -196,9 +244,12 @@ static void
  gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
  {
     struct brw_context *brw = brw_context(ctx);
-   struct intel_context *intel = intel_context(ctx);
     struct brw_query_object *query = (struct brw_query_object *)q;
  
+   /* Since we're starting a new query, we need to throw away old results. */
+   drm_intel_bo_unreference(query->bo);
+   query->bo = drm_intel_bo_alloc(brw->bufmgr, "query results", 4096, 4096);
+
     switch (query->Base.Target) {
     case GL_TIME_ELAPSED:
        /* For timestamp queries, we record the starting time right away so that
@@ -220,36 +271,21 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
         * obtain the time elapsed.  Notably, this includes time elapsed while
         * the system was doing other work, such as running other applications.
         */
-      drm_intel_bo_unreference(query->bo);
-      query->bo = drm_intel_bo_alloc(intel->bufmgr, "timer query", 4096, 4096);
-      write_timestamp(intel, query->bo, 0);
+      write_timestamp(brw, query->bo, 0);
        break;
  
     case GL_ANY_SAMPLES_PASSED:
     case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
     case GL_SAMPLES_PASSED_ARB:
-      /* Since we're starting a new query, we need to be sure to throw away
-       * any previous occlusion query results.
-       */
-      drm_intel_bo_unreference(query->bo);
-      query->bo = drm_intel_bo_alloc(intel->bufmgr, "occl. query", 4096, 4096);
-      write_depth_count(intel, query->bo, 0);
+      write_depth_count(brw, query->bo, 0);
        break;
  
     case GL_PRIMITIVES_GENERATED:
-      /* We don't actually query the hardware for this value; we keep track of
-       * it a software counter.  So just reset the counter.
-       */
-      brw->sol.primitives_generated = 0;
-      brw->sol.counting_primitives_generated = true;
+      write_primitives_generated(brw, query->bo, 0);
        break;
  
     case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
-      /* We don't actually query the hardware for this value; we keep track of
-       * it a software counter.  So just reset the counter.
-       */
-      brw->sol.primitives_written = 0;
-      brw->sol.counting_primitives_written = true;
+      write_xfb_primitives_written(brw, query->bo, 0);
        break;
  
     default:
@@ -270,36 +306,25 @@ static void
  gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
  {
     struct brw_context *brw = brw_context(ctx);
-   struct intel_context *intel = intel_context(ctx);
     struct brw_query_object *query = (struct brw_query_object *)q;
  
     switch (query->Base.Target) {
     case GL_TIME_ELAPSED:
-      write_timestamp(intel, query->bo, 1);
+      write_timestamp(brw, query->bo, 1);
        break;
  
     case GL_ANY_SAMPLES_PASSED:
     case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
     case GL_SAMPLES_PASSED_ARB:
-      write_depth_count(intel, query->bo, 1);
+      write_depth_count(brw, query->bo, 1);
        break;
  
     case GL_PRIMITIVES_GENERATED:
-      /* We don't actually query the hardware for this value; we keep track of
-       * it in a software counter.  So just read the counter and store it in
-       * the query object.
-       */
-      query->Base.Result = brw->sol.primitives_generated;
-      brw->sol.counting_primitives_generated = false;
+      write_primitives_generated(brw, query->bo, 1);
        break;
  
     case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
-      /* We don't actually query the hardware for this value; we keep track of
-       * it in a software counter.  So just read the counter and store it in
-       * the query object.
-       */
-      query->Base.Result = brw->sol.primitives_written;
-      brw->sol.counting_primitives_written = false;
+      write_xfb_primitives_written(brw, query->bo, 1);
        break;
  
     default:
@@ -330,7 +355,7 @@ static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q)
   */
  static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
  {
-   struct intel_context *intel = intel_context(ctx);
+   struct brw_context *brw = brw_context(ctx);
     struct brw_query_object *query = (struct brw_query_object *)q;
  
     /* From the GL_ARB_occlusion_query spec:
@@ -340,8 +365,8 @@ static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
      *      not ready yet on the first time it is queried.  This ensures that
      *      the async query will return true in finite time.
      */
-   if (query->bo && drm_intel_bo_references(intel->batch.bo, query->bo))
-      intel_batchbuffer_flush(intel);
+   if (query->bo && drm_intel_bo_references(brw->batch.bo, query->bo))
+      intel_batchbuffer_flush(brw);
  
     if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
        gen6_queryobj_get_results(ctx, query);