i965: Use state streaming on programs, and state base address on gen5+.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_state.c
index d3373ea79e80fb9b333adc3ad8da01b8f992ef39..506e2bdff5bf835bfe3b88e8c37d58f7fd5e21ad 100644 (file)
@@ -31,6 +31,7 @@
                    
 
 
+#include "intel_fbo.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
  * WM unit - fragment programs and rasterization
  */
 
-struct brw_wm_unit_key {
-   unsigned int total_grf, total_scratch;
-   unsigned int urb_entry_read_length;
-   unsigned int curb_entry_read_length;
-   unsigned int dispatch_grf_start_reg;
-
-   unsigned int curbe_offset;
-   unsigned int urb_size;
-
-   unsigned int nr_surfaces, sampler_count;
-   GLboolean uses_depth, computes_depth, uses_kill, is_glsl;
-   GLboolean polygon_stipple, stats_wm, line_stipple, offset_enable;
-   GLfloat offset_units, offset_factor;
-};
-
-static void
-wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
+bool
+brw_color_buffer_write_enabled(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
+   struct gl_context *ctx = &brw->intel.ctx;
    const struct gl_fragment_program *fp = brw->fragment_program;
-   const struct brw_fragment_program *bfp = (struct brw_fragment_program *) fp;
-   struct intel_context *intel = &brw->intel;
-
-   memset(key, 0, sizeof(*key));
-
-   /* CACHE_NEW_WM_PROG */
-   key->total_grf = brw->wm.prog_data->total_grf;
-   key->urb_entry_read_length = brw->wm.prog_data->urb_read_length;
-   key->curb_entry_read_length = brw->wm.prog_data->curb_read_length;
-   key->dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
-   key->total_scratch = ALIGN(brw->wm.prog_data->total_scratch, 1024);
-
-   /* BRW_NEW_URB_FENCE */
-   key->urb_size = brw->urb.vsize;
-
-   /* BRW_NEW_CURBE_OFFSETS */
-   key->curbe_offset = brw->curbe.wm_start;
-
-   /* BRW_NEW_NR_SURFACEs */
-   key->nr_surfaces = brw->wm.nr_surfaces;
-
-   /* CACHE_NEW_SAMPLER */
-   key->sampler_count = brw->wm.sampler_count;
-
-   /* _NEW_POLYGONSTIPPLE */
-   key->polygon_stipple = ctx->Polygon.StippleFlag;
-
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   key->uses_depth = (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
-
-   /* as far as we can tell */
-   key->computes_depth =
-      (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) != 0;
-   /* BRW_NEW_DEPTH_BUFFER
-    * Override for NULL depthbuffer case, required by the Pixel Shader Computed
-    * Depth field.
-    */
-   if (brw->state.depth_region == NULL)
-      key->computes_depth = 0;
-
-   /* _NEW_COLOR */
-   key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled;
-   key->is_glsl = bfp->isGLSL;
-
-   /* temporary sanity check assertion */
-   ASSERT(bfp->isGLSL == brw_wm_is_glsl(fp));
-
-   /* _NEW_DEPTH */
-   key->stats_wm = intel->stats_wm;
-
-   /* _NEW_LINE */
-   key->line_stipple = ctx->Line.StippleFlag;
+   int i;
+
+   /* _NEW_BUFFERS */
+   for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+
+      /* _NEW_COLOR */
+      if (rb &&
+         (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_COLOR) ||
+          fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DATA0 + i)) &&
+         (ctx->Color.ColorMask[i][0] ||
+          ctx->Color.ColorMask[i][1] ||
+          ctx->Color.ColorMask[i][2] ||
+          ctx->Color.ColorMask[i][3])) {
+        return true;
+      }
+   }
 
-   /* _NEW_POLYGON */
-   key->offset_enable = ctx->Polygon.OffsetFill;
-   key->offset_units = ctx->Polygon.OffsetUnits;
-   key->offset_factor = ctx->Polygon.OffsetFactor;
+   return false;
 }
 
 /**
  * Setup wm hardware state.  See page 225 of Volume 2
  */
-static dri_bo *
-wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
-                       dri_bo **reloc_bufs)
+static void
+brw_prepare_wm_unit(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
-   struct brw_wm_unit_state wm;
-   dri_bo *bo;
+   struct gl_context *ctx = &intel->ctx;
+   const struct gl_fragment_program *fp = brw->fragment_program;
+   struct brw_wm_unit_state *wm;
 
-   memset(&wm, 0, sizeof(wm));
+   wm = brw_state_batch(brw, sizeof(*wm), 32, &brw->wm.state_offset);
+   memset(wm, 0, sizeof(*wm));
 
-   wm.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
-   wm.thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */
-   wm.thread1.depth_coef_urb_read_offset = 1;
-   wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+   if (brw->wm.prog_data->prog_offset_16) {
+      /* These two fields should be the same pre-gen6, which is why we
+       * only have one hardware field to program for both dispatch
+       * widths.
+       */
+      assert(brw->wm.prog_data->first_curbe_grf ==
+            brw->wm.prog_data->first_curbe_grf_16);
+   }
 
-   if (intel->is_ironlake)
-      wm.thread1.binding_table_entry_count = 0; /* hardware requirement */
-   else
-      wm.thread1.binding_table_entry_count = key->nr_surfaces;
+   /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_WM_PROG */
+   wm->thread0.grf_reg_count = brw->wm.prog_data->reg_blocks;
+   wm->wm9.grf_reg_count_2 = brw->wm.prog_data->reg_blocks_16;
+
+   wm->thread0.kernel_start_pointer =
+      brw_program_reloc(brw,
+                       brw->wm.state_offset +
+                       offsetof(struct brw_wm_unit_state, thread0),
+                       brw->wm.prog_offset +
+                       (wm->thread0.grf_reg_count << 1)) >> 6;
+
+   wm->wm9.kernel_start_pointer_2 =
+      brw_program_reloc(brw,
+                       brw->wm.state_offset +
+                       offsetof(struct brw_wm_unit_state, wm9),
+                       brw->wm.prog_offset +
+                       brw->wm.prog_data->prog_offset_16 +
+                       (wm->wm9.grf_reg_count_2 << 1)) >> 6;
+
+   wm->thread1.depth_coef_urb_read_offset = 1;
+   wm->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+
+   if (intel->gen == 5)
+      wm->thread1.binding_table_entry_count = 0; /* hardware requirement */
+   else {
+      /* BRW_NEW_NR_WM_SURFACES */
+      wm->thread1.binding_table_entry_count = brw->wm.nr_surfaces;
+   }
 
-   if (key->total_scratch != 0) {
-      wm.thread2.scratch_space_base_pointer =
+   if (brw->wm.prog_data->total_scratch != 0) {
+      wm->thread2.scratch_space_base_pointer =
         brw->wm.scratch_bo->offset >> 10; /* reloc */
-      wm.thread2.per_thread_scratch_space = key->total_scratch / 1024 - 1;
+      wm->thread2.per_thread_scratch_space =
+        ffs(brw->wm.prog_data->total_scratch) - 11;
    } else {
-      wm.thread2.scratch_space_base_pointer = 0;
-      wm.thread2.per_thread_scratch_space = 0;
+      wm->thread2.scratch_space_base_pointer = 0;
+      wm->thread2.per_thread_scratch_space = 0;
    }
 
-   wm.thread3.dispatch_grf_start_reg = key->dispatch_grf_start_reg;
-   wm.thread3.urb_entry_read_length = key->urb_entry_read_length;
-   wm.thread3.urb_entry_read_offset = 0;
-   wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
-   wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
+   wm->thread3.dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
+   wm->thread3.urb_entry_read_length = brw->wm.prog_data->urb_read_length;
+   wm->thread3.urb_entry_read_offset = 0;
+   wm->thread3.const_urb_entry_read_length =
+      brw->wm.prog_data->curb_read_length;
+   /* BRW_NEW_CURBE_OFFSETS */
+   wm->thread3.const_urb_entry_read_offset = brw->curbe.wm_start * 2;
 
-   if (intel->is_ironlake)
-      wm.wm4.sampler_count = 0; /* hardware requirement */
-   else
-      wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
+   if (intel->gen == 5)
+      wm->wm4.sampler_count = 0; /* hardware requirement */
+   else {
+      /* CACHE_NEW_SAMPLER */
+      wm->wm4.sampler_count = (brw->wm.sampler_count + 1) / 4;
+   }
 
-   if (brw->wm.sampler_bo != NULL) {
+   if (brw->wm.sampler_count) {
       /* reloc */
-      wm.wm4.sampler_state_pointer = brw->wm.sampler_bo->offset >> 5;
+      wm->wm4.sampler_state_pointer = (intel->batch.bo->offset +
+                                      brw->wm.sampler_offset) >> 5;
    } else {
-      wm.wm4.sampler_state_pointer = 0;
+      wm->wm4.sampler_state_pointer = 0;
+   }
+
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   wm->wm5.program_uses_depth = (fp->Base.InputsRead &
+                                (1 << FRAG_ATTRIB_WPOS)) != 0;
+   wm->wm5.program_computes_depth = (fp->Base.OutputsWritten &
+                                    BITFIELD64_BIT(FRAG_RESULT_DEPTH)) != 0;
+   /* _NEW_BUFFERS
+    * Override for NULL depthbuffer case, required by the Pixel Shader Computed
+    * Depth field.
+    */
+   if (!intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH))
+      wm->wm5.program_computes_depth = 0;
+
+   /* _NEW_COLOR */
+   wm->wm5.program_uses_killpixel = fp->UsesKill || ctx->Color.AlphaEnabled;
+
+
+   /* BRW_NEW_FRAGMENT_PROGRAM
+    *
+    * If using the fragment shader backend, 8-wide dispatch is always
+    * enabled, and 16-wide dispatch is enabled too when prog_offset_16
+    * is nonzero.  If not, only 16-wide dispatch is enabled.
+    */
+   if (ctx->Shader.CurrentFragmentProgram) {
+      struct brw_shader *shader = (struct brw_shader *)
+        ctx->Shader.CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT];
+
+      if (shader != NULL && shader->ir != NULL) {
+        wm->wm5.enable_8_pix = 1;
+        if (brw->wm.prog_data->prog_offset_16)
+           wm->wm5.enable_16_pix = 1;
+      }
    }
+   if (!wm->wm5.enable_8_pix)
+      wm->wm5.enable_16_pix = 1;
 
-   wm.wm5.program_uses_depth = key->uses_depth;
-   wm.wm5.program_computes_depth = key->computes_depth;
-   wm.wm5.program_uses_killpixel = key->uses_kill;
+   wm->wm5.max_threads = brw->wm_max_threads - 1;
 
-   if (key->is_glsl)
-      wm.wm5.enable_8_pix = 1;
-   else
-      wm.wm5.enable_16_pix = 1;
+   /* _NEW_BUFFERS | _NEW_COLOR */
+   if (brw_color_buffer_write_enabled(brw) ||
+       wm->wm5.program_uses_killpixel ||
+       wm->wm5.program_computes_depth) {
+      wm->wm5.thread_dispatch_enable = 1;
+   }
 
-   wm.wm5.max_threads = brw->wm_max_threads - 1;
-   wm.wm5.thread_dispatch_enable = 1;  /* AKA: color_write */
-   wm.wm5.legacy_line_rast = 0;
-   wm.wm5.legacy_global_depth_bias = 0;
-   wm.wm5.early_depth_test = 1;                /* never need to disable */
-   wm.wm5.line_aa_region_width = 0;
-   wm.wm5.line_endcap_aa_region_width = 1;
+   wm->wm5.legacy_line_rast = 0;
+   wm->wm5.legacy_global_depth_bias = 0;
+   wm->wm5.early_depth_test = 1;               /* never need to disable */
+   wm->wm5.line_aa_region_width = 0;
+   wm->wm5.line_endcap_aa_region_width = 1;
 
-   wm.wm5.polygon_stipple = key->polygon_stipple;
+   /* _NEW_POLYGONSTIPPLE */
+   wm->wm5.polygon_stipple = ctx->Polygon.StippleFlag;
 
-   if (key->offset_enable) {
-      wm.wm5.depth_offset = 1;
+   /* _NEW_POLYGON */
+   if (ctx->Polygon.OffsetFill) {
+      wm->wm5.depth_offset = 1;
       /* Something weird going on with legacy_global_depth_bias,
        * offset_constant, scaling and MRD.  This value passes glean
        * but gives some odd results elsewhere (e.g. the
        * quad-offset-units test).
        */
-      wm.global_depth_offset_constant = key->offset_units * 2;
+      wm->global_depth_offset_constant = ctx->Polygon.OffsetUnits * 2;
 
       /* This is the only value that passes glean:
        */
-      wm.global_depth_offset_scale = key->offset_factor;
+      wm->global_depth_offset_scale = ctx->Polygon.OffsetFactor;
    }
 
-   wm.wm5.line_stipple = key->line_stipple;
-
-   if (INTEL_DEBUG & DEBUG_STATS || key->stats_wm)
-      wm.wm4.stats_enable = 1;
-
-   bo = brw_upload_cache(&brw->cache, BRW_WM_UNIT,
-                        key, sizeof(*key),
-                        reloc_bufs, 3,
-                        &wm, sizeof(wm),
-                        NULL, NULL);
+   /* _NEW_LINE */
+   wm->wm5.line_stipple = ctx->Line.StippleFlag;
 
-   /* Emit WM program relocation */
-   dri_bo_emit_reloc(bo,
-                    I915_GEM_DOMAIN_INSTRUCTION, 0,
-                    wm.thread0.grf_reg_count << 1,
-                    offsetof(struct brw_wm_unit_state, thread0),
-                    brw->wm.prog_bo);
+   /* _NEW_DEPTH */
+   if (unlikely(INTEL_DEBUG & DEBUG_STATS) || intel->stats_wm)
+      wm->wm4.stats_enable = 1;
 
    /* Emit scratch space relocation */
-   if (key->total_scratch != 0) {
-      dri_bo_emit_reloc(bo,
-                       0, 0,
-                       wm.thread2.per_thread_scratch_space,
-                       offsetof(struct brw_wm_unit_state, thread2),
-                       brw->wm.scratch_bo);
+   if (brw->wm.prog_data->total_scratch != 0) {
+      drm_intel_bo_emit_reloc(intel->batch.bo,
+                             brw->wm.state_offset +
+                             offsetof(struct brw_wm_unit_state, thread2),
+                             brw->wm.scratch_bo,
+                             wm->thread2.per_thread_scratch_space,
+                             I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
    }
 
    /* Emit sampler state relocation */
-   if (key->sampler_count != 0) {
-      dri_bo_emit_reloc(bo,
-                       I915_GEM_DOMAIN_INSTRUCTION, 0,
-                       wm.wm4.stats_enable | (wm.wm4.sampler_count << 2),
-                       offsetof(struct brw_wm_unit_state, wm4),
-                       brw->wm.sampler_bo);
+   if (brw->wm.sampler_count != 0) {
+      drm_intel_bo_emit_reloc(intel->batch.bo,
+                             brw->wm.state_offset +
+                             offsetof(struct brw_wm_unit_state, wm4),
+                             intel->batch.bo, (brw->wm.sampler_offset |
+                                               wm->wm4.stats_enable |
+                                               (wm->wm4.sampler_count << 2)),
+                             I915_GEM_DOMAIN_INSTRUCTION, 0);
    }
 
-   return bo;
-}
-
-
-static void upload_wm_unit( struct brw_context *brw )
-{
-   struct intel_context *intel = &brw->intel;
-   struct brw_wm_unit_key key;
-   dri_bo *reloc_bufs[3];
-   wm_unit_populate_key(brw, &key);
-
-   /* Allocate the necessary scratch space if we haven't already.  Don't
-    * bother reducing the allocation later, since we use scratch so
-    * rarely.
-    */
-   assert(key.total_scratch <= 12 * 1024);
-   if (key.total_scratch) {
-      GLuint total = key.total_scratch * brw->wm_max_threads;
-
-      if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) {
-        dri_bo_unreference(brw->wm.scratch_bo);
-        brw->wm.scratch_bo = NULL;
-      }
-      if (brw->wm.scratch_bo == NULL) {
-        brw->wm.scratch_bo = dri_bo_alloc(intel->bufmgr,
-                                           "wm scratch",
-                                           total,
-                                           4096);
-      }
-   }
-
-   reloc_bufs[0] = brw->wm.prog_bo;
-   reloc_bufs[1] = brw->wm.scratch_bo;
-   reloc_bufs[2] = brw->wm.sampler_bo;
-
-   dri_bo_unreference(brw->wm.state_bo);
-   brw->wm.state_bo = brw_search_cache(&brw->cache, BRW_WM_UNIT,
-                                      &key, sizeof(key),
-                                      reloc_bufs, 3,
-                                      NULL);
-   if (brw->wm.state_bo == NULL) {
-      brw->wm.state_bo = wm_unit_create_from_key(brw, &key, reloc_bufs);
-   }
+   brw->state.dirty.cache |= CACHE_NEW_WM_UNIT;
 }
 
 const struct brw_tracked_state brw_wm_unit = {
@@ -289,16 +256,18 @@ const struct brw_tracked_state brw_wm_unit = {
               _NEW_POLYGONSTIPPLE | 
               _NEW_LINE | 
               _NEW_COLOR |
-              _NEW_DEPTH),
+              _NEW_DEPTH |
+              _NEW_BUFFERS),
 
-      .brw = (BRW_NEW_FRAGMENT_PROGRAM | 
+      .brw = (BRW_NEW_BATCH |
+             BRW_NEW_PROGRAM_CACHE |
+             BRW_NEW_FRAGMENT_PROGRAM |
              BRW_NEW_CURBE_OFFSETS |
-             BRW_NEW_DEPTH_BUFFER |
              BRW_NEW_NR_WM_SURFACES),
 
       .cache = (CACHE_NEW_WM_PROG |
                CACHE_NEW_SAMPLER)
    },
-   .prepare = upload_wm_unit,
+   .prepare = brw_prepare_wm_unit,
 };