i965/nir: Sort uniforms direct-first and use two different uniform registers
[mesa.git] / src / mesa / drivers / dri / i965 / brw_state_upload.c
index 4ca7cb48ed67c51eea53944164b015ec3e882b83..1b848593de654ba9fb22788abcf7ee165a7e1418 100644 (file)
 #include "drivers/common/meta.h"
 #include "intel_batchbuffer.h"
 #include "intel_buffers.h"
+#include "brw_vs.h"
+#include "brw_ff_gs.h"
+#include "brw_gs.h"
+#include "brw_wm.h"
 
 static const struct brw_tracked_state *gen4_atoms[] =
 {
-   &brw_vs_prog, /* must do before GS prog, state base address. */
-   &brw_ff_gs_prog, /* must do before state base address */
-
    &brw_interpolation_map,
 
    &brw_clip_prog, /* must do before state base address */
    &brw_sf_prog, /* must do before state base address */
-   &brw_wm_prog, /* must do before state base address */
 
    /* Once all the programs are done, we know how large urb entry
     * sizes need to be and can decide if we need to change the urb
@@ -107,10 +107,6 @@ static const struct brw_tracked_state *gen4_atoms[] =
 
 static const struct brw_tracked_state *gen6_atoms[] =
 {
-   &brw_vs_prog, /* must do before state base address */
-   &brw_gs_prog, /* must do before state base address */
-   &brw_wm_prog, /* must do before state base address */
-
    &gen6_clip_vp,
    &gen6_sf_vp,
 
@@ -180,17 +176,12 @@ static const struct brw_tracked_state *gen6_atoms[] =
 
 static const struct brw_tracked_state *gen7_atoms[] =
 {
-   &brw_vs_prog,
-   &brw_gs_prog,
-   &brw_wm_prog,
-
    /* Command packets: */
 
    /* must do before binding table pointers, cc state ptrs */
    &brw_state_base_address,
 
    &brw_cc_vp,
-   &gen7_cc_viewport_state_pointer, /* must do after brw_cc_vp */
    &gen7_sf_clip_viewport,
 
    &gen7_push_constant_space,
@@ -257,15 +248,10 @@ static const struct brw_tracked_state *gen7_atoms[] =
 
 static const struct brw_tracked_state *gen8_atoms[] =
 {
-   &brw_vs_prog,
-   &brw_gs_prog,
-   &brw_wm_prog,
-
    /* Command packets: */
    &gen8_state_base_address,
 
    &brw_cc_vp,
-   &gen7_cc_viewport_state_pointer, /* must do after brw_cc_vp */
    &gen8_sf_clip_viewport,
 
    &gen7_push_constant_space,
@@ -333,6 +319,7 @@ static const struct brw_tracked_state *gen8_atoms[] =
    &gen8_vertices,
 
    &haswell_cut_index,
+   &gen8_pma_fix,
 };
 
 static void
@@ -345,6 +332,9 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
    if (!brw->hw_ctx)
       return;
 
+   if (brw->gen == 6)
+      intel_emit_post_sync_nonzero_flush(brw);
+
    brw_upload_invariant_state(brw);
 
    if (brw->gen >= 8) {
@@ -358,6 +348,11 @@ void brw_init_state( struct brw_context *brw )
    const struct brw_tracked_state **atoms;
    int num_atoms;
 
+   STATIC_ASSERT(ARRAY_SIZE(gen4_atoms) <= ARRAY_SIZE(brw->atoms));
+   STATIC_ASSERT(ARRAY_SIZE(gen6_atoms) <= ARRAY_SIZE(brw->atoms));
+   STATIC_ASSERT(ARRAY_SIZE(gen7_atoms) <= ARRAY_SIZE(brw->atoms));
+   STATIC_ASSERT(ARRAY_SIZE(gen8_atoms) <= ARRAY_SIZE(brw->atoms));
+
    brw_init_caches(brw);
 
    if (brw->gen >= 8) {
@@ -374,13 +369,19 @@ void brw_init_state( struct brw_context *brw )
       num_atoms = ARRAY_SIZE(gen4_atoms);
    }
 
-   brw->atoms = atoms;
    brw->num_atoms = num_atoms;
 
+   /* brw_context::atoms is declared const, which is what we want, but it
+    * still has to be filled in once: cast the qualifier away to copy them in.
+    */
+   struct brw_tracked_state *context_atoms =
+      (struct brw_tracked_state *) &brw->atoms[0];
+
+   for (int i = 0; i < num_atoms; i++)
+      context_atoms[i] = *atoms[i];
+
    while (num_atoms--) {
-      assert((*atoms)->dirty.mesa |
-            (*atoms)->dirty.brw |
-            (*atoms)->dirty.cache);
+      assert((*atoms)->dirty.mesa | (*atoms)->dirty.brw);
       assert((*atoms)->emit);
       atoms++;
    }
@@ -390,6 +391,11 @@ void brw_init_state( struct brw_context *brw )
    brw->state.dirty.mesa = ~0;
    brw->state.dirty.brw = ~0ull;
 
+   /* ~0 is a nonsensical value which won't match anything we program, so
+    * the programming will take effect the first time around.
+    */
+   brw->pma_stall_bits = ~0;
+
    /* Make sure that brw->state.dirty.brw has enough bits to hold all possible
     * dirty flags.
     */
@@ -399,6 +405,7 @@ void brw_init_state( struct brw_context *brw )
    ctx->DriverFlags.NewTransformFeedbackProg = BRW_NEW_TRANSFORM_FEEDBACK;
    ctx->DriverFlags.NewRasterizerDiscard = BRW_NEW_RASTERIZER_DISCARD;
    ctx->DriverFlags.NewUniformBuffer = BRW_NEW_UNIFORM_BUFFER;
+   ctx->DriverFlags.NewTextureBuffer = BRW_NEW_TEXTURE_BUFFER;
    ctx->DriverFlags.NewAtomicBuffer = BRW_NEW_ATOMIC_BUFFER;
 }
 
@@ -414,9 +421,7 @@ void brw_destroy_state( struct brw_context *brw )
 static bool
 check_state(const struct brw_state_flags *a, const struct brw_state_flags *b)
 {
-   return ((a->mesa & b->mesa) |
-          (a->brw & b->brw) |
-          (a->cache & b->cache)) != 0;
+   return ((a->mesa & b->mesa) | (a->brw & b->brw)) != 0;
 }
 
 static void accumulate_state( struct brw_state_flags *a,
@@ -424,7 +429,6 @@ static void accumulate_state( struct brw_state_flags *a,
 {
    a->mesa |= b->mesa;
    a->brw |= b->brw;
-   a->cache |= b->cache;
 }
 
 
@@ -434,7 +438,6 @@ static void xor_states( struct brw_state_flags *result,
 {
    result->mesa = a->mesa ^ b->mesa;
    result->brw = a->brw ^ b->brw;
-   result->cache = a->cache ^ b->cache;
 }
 
 struct dirty_bit_map {
@@ -481,6 +484,13 @@ static struct dirty_bit_map mesa_bits[] = {
 };
 
 static struct dirty_bit_map brw_bits[] = {
+   DEFINE_BIT(BRW_NEW_FS_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_BLORP_BLIT_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_SF_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_VS_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_FF_GS_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_GS_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_CLIP_PROG_DATA),
    DEFINE_BIT(BRW_NEW_URB_FENCE),
    DEFINE_BIT(BRW_NEW_FRAGMENT_PROGRAM),
    DEFINE_BIT(BRW_NEW_GEOMETRY_PROGRAM),
@@ -514,31 +524,15 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_PUSH_CONSTANT_ALLOCATION),
    DEFINE_BIT(BRW_NEW_NUM_SAMPLES),
    DEFINE_BIT(BRW_NEW_TEXTURE_BUFFER),
+   DEFINE_BIT(BRW_NEW_GEN4_UNIT_STATE),
+   DEFINE_BIT(BRW_NEW_CC_VP),
+   DEFINE_BIT(BRW_NEW_SF_VP),
+   DEFINE_BIT(BRW_NEW_CLIP_VP),
+   DEFINE_BIT(BRW_NEW_SAMPLER_STATE_TABLE),
+   DEFINE_BIT(BRW_NEW_VS_ATTRIB_WORKAROUNDS),
    {0, 0, 0}
 };
 
-static struct dirty_bit_map cache_bits[] = {
-   DEFINE_BIT(CACHE_NEW_CC_VP),
-   DEFINE_BIT(CACHE_NEW_CC_UNIT),
-   DEFINE_BIT(CACHE_NEW_WM_PROG),
-   DEFINE_BIT(CACHE_NEW_BLORP_BLIT_PROG),
-   DEFINE_BIT(CACHE_NEW_SAMPLER),
-   DEFINE_BIT(CACHE_NEW_WM_UNIT),
-   DEFINE_BIT(CACHE_NEW_SF_PROG),
-   DEFINE_BIT(CACHE_NEW_SF_VP),
-   DEFINE_BIT(CACHE_NEW_SF_UNIT),
-   DEFINE_BIT(CACHE_NEW_VS_UNIT),
-   DEFINE_BIT(CACHE_NEW_VS_PROG),
-   DEFINE_BIT(CACHE_NEW_FF_GS_UNIT),
-   DEFINE_BIT(CACHE_NEW_FF_GS_PROG),
-   DEFINE_BIT(CACHE_NEW_GS_PROG),
-   DEFINE_BIT(CACHE_NEW_CLIP_VP),
-   DEFINE_BIT(CACHE_NEW_CLIP_UNIT),
-   DEFINE_BIT(CACHE_NEW_CLIP_PROG),
-   {0, 0, 0}
-};
-
-
 static void
 brw_update_dirty_count(struct dirty_bit_map *bit_map, uint64_t bits)
 {
@@ -552,11 +546,26 @@ static void
 brw_print_dirty_count(struct dirty_bit_map *bit_map)
 {
    for (int i = 0; bit_map[i].bit != 0; i++) {
-      fprintf(stderr, "0x%016lx: %12d (%s)\n",
-             bit_map[i].bit, bit_map[i].count, bit_map[i].name);
+      if (bit_map[i].count > 1) {
+         fprintf(stderr, "0x%016lx: %12d (%s)\n",
+                 bit_map[i].bit, bit_map[i].count, bit_map[i].name);
+      }
    }
 }
 
+static void
+brw_upload_programs(struct brw_context *brw)
+{
+   brw_upload_vs_prog(brw);
+
+   if (brw->gen < 6)
+      brw_upload_ff_gs_prog(brw);
+   else
+      brw_upload_gs_prog(brw);
+
+   brw_upload_wm_prog(brw);
+}
+
 /***********************************************************************
  * Emit all state:
  */
@@ -577,7 +586,6 @@ void brw_upload_state(struct brw_context *brw)
       /* Always re-emit all state. */
       state->mesa |= ~0;
       state->brw |= ~0ull;
-      state->cache |= ~0;
    }
 
    if (brw->fragment_program != ctx->FragmentProgram._Current) {
@@ -605,9 +613,15 @@ void brw_upload_state(struct brw_context *brw)
       brw->state.dirty.brw |= BRW_NEW_NUM_SAMPLES;
    }
 
-   if ((state->mesa | state->cache | state->brw) == 0)
+   if ((state->mesa | state->brw) == 0)
       return;
 
+   /* Emit Sandybridge workaround flushes on every primitive, for safety. */
+   if (brw->gen == 6)
+      intel_emit_post_sync_nonzero_flush(brw);
+
+   brw_upload_programs(brw);
+
    if (unlikely(INTEL_DEBUG)) {
       /* Debug version which enforces various sanity checks on the
        * state flags which are generated and checked to help ensure
@@ -618,7 +632,7 @@ void brw_upload_state(struct brw_context *brw)
       prev = *state;
 
       for (i = 0; i < brw->num_atoms; i++) {
-        const struct brw_tracked_state *atom = brw->atoms[i];
+        const struct brw_tracked_state *atom = &brw->atoms[i];
         struct brw_state_flags generated;
 
         if (check_state(state, &atom->dirty)) {
@@ -638,7 +652,7 @@ void brw_upload_state(struct brw_context *brw)
    }
    else {
       for (i = 0; i < brw->num_atoms; i++) {
-        const struct brw_tracked_state *atom = brw->atoms[i];
+        const struct brw_tracked_state *atom = &brw->atoms[i];
 
         if (check_state(state, &atom->dirty)) {
            atom->emit(brw);
@@ -648,15 +662,12 @@ void brw_upload_state(struct brw_context *brw)
 
    if (unlikely(INTEL_DEBUG & DEBUG_STATE)) {
       STATIC_ASSERT(ARRAY_SIZE(brw_bits) == BRW_NUM_STATE_BITS + 1);
-      STATIC_ASSERT(ARRAY_SIZE(cache_bits) == BRW_MAX_CACHE + 1);
 
       brw_update_dirty_count(mesa_bits, state->mesa);
       brw_update_dirty_count(brw_bits, state->brw);
-      brw_update_dirty_count(cache_bits, state->cache);
       if (dirty_count++ % 1000 == 0) {
         brw_print_dirty_count(mesa_bits);
         brw_print_dirty_count(brw_bits);
-        brw_print_dirty_count(cache_bits);
         fprintf(stderr, "\n");
       }
    }