i965: Remove never used RSR and RSL opcodes.

[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_state.c
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c

index 1eee5b7e5de8e593f88149a7df826c8859215a72..cdffac3b385759aaab98d66bbb315c253c42a1ef 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -36,71 +36,73 @@
  #include "brw_defines.h"
  #include "main/macros.h"
  
-struct brw_vs_unit_key {
-   unsigned int total_grf;
-   unsigned int urb_entry_read_length;
-   unsigned int curb_entry_read_length;
-
-   unsigned int curbe_offset;
-
-   unsigned int nr_urb_entries, urb_size;
-
-   unsigned int nr_surfaces;
-};
-
  static void
-brw_prepare_vs_unit(struct brw_context *brw)
+brw_upload_vs_unit(struct brw_context *brw)
  {
-   struct intel_context *intel = &brw->intel;
-   struct gl_context *ctx = &intel->ctx;
+   struct brw_stage_state *stage_state = &brw->vs.base;
+
     struct brw_vs_unit_state *vs;
  
-   vs = brw_state_batch(brw, sizeof(*vs), 32, &brw->vs.state_offset);
+   vs = brw_state_batch(brw, AUB_TRACE_VS_STATE,
+                       sizeof(*vs), 32, &stage_state->state_offset);
     memset(vs, 0, sizeof(*vs));
  
-   /* CACHE_NEW_VS_PROG */
-   vs->thread0.kernel_start_pointer = brw->vs.prog_bo->offset >> 6; /* reloc */
-   vs->thread0.grf_reg_count = ALIGN(brw->vs.prog_data->total_grf, 16) / 16 - 1;
-   vs->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+   /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_VS_PROG */
+   vs->thread0.grf_reg_count =
+      ALIGN(brw->vs.prog_data->base.total_grf, 16) / 16 - 1;
+   vs->thread0.kernel_start_pointer =
+      brw_program_reloc(brw,
+                       stage_state->state_offset +
+                       offsetof(struct brw_vs_unit_state, thread0),
+                       stage_state->prog_offset +
+                       (vs->thread0.grf_reg_count << 1)) >> 6;
+
+   /* Use ALT floating point mode for ARB vertex programs, because they
+    * require 0^0 == 1.
+    */
+   if (brw->ctx.Shader.CurrentVertexProgram == NULL)
+      vs->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+   else
+      vs->thread1.floating_point_mode = BRW_FLOATING_POINT_IEEE_754;
+
     /* Choosing multiple program flow means that we may get 2-vertex threads,
      * which will have the channel mask for dwords 4-7 enabled in the thread,
      * and those dwords will be written to the second URB handle when we
      * brw_urb_WRITE() results.
      */
-   /* Disable single program flow on Ironlake.  We cannot reliably get
+   /* Force single program flow on Ironlake.  We cannot reliably get
      * all applications working without it.  See:
      * https://bugs.freedesktop.org/show_bug.cgi?id=29172
      *
      * The most notable and reliably failing application is the Humus
      * demo "CelShading"
     */
-   vs->thread1.single_program_flow = (intel->gen == 5);
-
-   /* BRW_NEW_NR_VS_SURFACES */
-   if (intel->gen == 5)
-      vs->thread1.binding_table_entry_count = 0; /* hardware requirement */
-   else
-      vs->thread1.binding_table_entry_count = brw->vs.nr_surfaces;
+   vs->thread1.single_program_flow = (brw->gen == 5);
  
-   vs->thread3.urb_entry_read_length = brw->vs.prog_data->urb_read_length;
-   vs->thread3.const_urb_entry_read_length = brw->vs.prog_data->curb_read_length;
-   vs->thread3.dispatch_grf_start_reg = 1;
-   vs->thread3.urb_entry_read_offset = 0;
+   vs->thread1.binding_table_entry_count = 0;
  
-   /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */
-   if (ctx->Transform.ClipPlanesEnabled) {
-      /* Note that we read in the userclip planes as well, hence
-       * clip_start:
-       */
-      vs->thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2;
-   }
-   else {
-      vs->thread3.const_urb_entry_read_offset = brw->curbe.vs_start * 2;
+   if (brw->vs.prog_data->base.total_scratch != 0) {
+      vs->thread2.scratch_space_base_pointer =
+        stage_state->scratch_bo->offset >> 10; /* reloc */
+      vs->thread2.per_thread_scratch_space =
+        ffs(brw->vs.prog_data->base.total_scratch) - 11;
+   } else {
+      vs->thread2.scratch_space_base_pointer = 0;
+      vs->thread2.per_thread_scratch_space = 0;
     }
  
+   vs->thread3.urb_entry_read_length = brw->vs.prog_data->base.urb_read_length;
+   vs->thread3.const_urb_entry_read_length
+      = brw->vs.prog_data->base.curb_read_length;
+   vs->thread3.dispatch_grf_start_reg =
+      brw->vs.prog_data->base.dispatch_grf_start_reg;
+   vs->thread3.urb_entry_read_offset = 0;
+
+   /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM, BRW_NEW_VERTEX_PROGRAM */
+   vs->thread3.const_urb_entry_read_offset = brw->curbe.vs_start * 2;
  
     /* BRW_NEW_URB_FENCE */
-   if (intel->gen == 5) {
+   if (brw->gen == 5) {
        switch (brw->urb.nr_vs_entries) {
        case 8:
        case 12:
@@ -126,7 +128,7 @@ brw_prepare_vs_unit(struct brw_context *brw)
        case 32:
          break;
        case 64:
-        assert(intel->is_g4x);
+        assert(brw->is_g4x);
          break;
        default:
          assert(0);
@@ -137,13 +139,15 @@ brw_prepare_vs_unit(struct brw_context *brw)
     vs->thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
  
     vs->thread4.max_threads = CLAMP(brw->urb.nr_vs_entries / 2,
-                                  1, brw->vs_max_threads) - 1;
+                                  1, brw->max_vs_threads) - 1;
+
+   if (brw->gen == 5)
+      vs->vs5.sampler_count = 0; /* hardware requirement */
+   else {
+      /* CACHE_NEW_SAMPLER */
+      vs->vs5.sampler_count = (stage_state->sampler_count + 3) / 4;
+   }
  
-   /* No samplers for ARB_vp programs:
-    */
-   /* It has to be set to 0 for Ironlake
-    */
-   vs->vs5.sampler_count = 0;
  
     if (unlikely(INTEL_DEBUG & DEBUG_STATS))
        vs->thread4.stats_enable = 1;
@@ -152,12 +156,29 @@ brw_prepare_vs_unit(struct brw_context *brw)
      */
     vs->vs6.vs_enable = 1;
  
-   /* Emit VS program relocation */
-   drm_intel_bo_emit_reloc(intel->batch.bo, (brw->vs.state_offset +
-                                            offsetof(struct brw_vs_unit_state,
-                                                     thread0)),
-                          brw->vs.prog_bo, vs->thread0.grf_reg_count << 1,
-                          I915_GEM_DOMAIN_INSTRUCTION, 0);
+   /* Set the sampler state pointer, and its reloc
+    */
+   if (stage_state->sampler_count) {
+      vs->vs5.sampler_state_pointer =
+         (brw->batch.bo->offset + stage_state->sampler_offset) >> 5;
+      drm_intel_bo_emit_reloc(brw->batch.bo,
+                              stage_state->state_offset +
+                              offsetof(struct brw_vs_unit_state, vs5),
+                              brw->batch.bo,
+                              (stage_state->sampler_offset |
+                               vs->vs5.sampler_count),
+                              I915_GEM_DOMAIN_INSTRUCTION, 0);
+   }
+
+   /* Emit scratch space relocation */
+   if (brw->vs.prog_data->base.total_scratch != 0) {
+      drm_intel_bo_emit_reloc(brw->batch.bo,
+                             stage_state->state_offset +
+                             offsetof(struct brw_vs_unit_state, thread2),
+                             stage_state->scratch_bo,
+                             vs->thread2.per_thread_scratch_space,
+                             I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
+   }
  
     brw->state.dirty.cache |= CACHE_NEW_VS_UNIT;
  }
@@ -166,10 +187,11 @@ const struct brw_tracked_state brw_vs_unit = {
     .dirty = {
        .mesa  = _NEW_TRANSFORM,
        .brw   = (BRW_NEW_BATCH |
+               BRW_NEW_PROGRAM_CACHE |
                 BRW_NEW_CURBE_OFFSETS |
-                BRW_NEW_NR_VS_SURFACES |
-               BRW_NEW_URB_FENCE),
-      .cache = CACHE_NEW_VS_PROG
+               BRW_NEW_URB_FENCE |
+                BRW_NEW_VERTEX_PROGRAM),
+      .cache = CACHE_NEW_VS_PROG | CACHE_NEW_SAMPLER
     },
-   .prepare = brw_prepare_vs_unit,
+   .emit = brw_upload_vs_unit,
  };