i965/nir: Sort uniforms direct-first and use two different uniform registers
[mesa.git] / src / mesa / drivers / dri / i965 / brw_state_upload.c
index 5ecebf5c015de82d5ea6c9cb23a9a6ed83d4b8b1..1b848593de654ba9fb22788abcf7ee165a7e1418 100644 (file)
 #include "drivers/common/meta.h"
 #include "intel_batchbuffer.h"
 #include "intel_buffers.h"
+#include "brw_vs.h"
+#include "brw_ff_gs.h"
+#include "brw_gs.h"
+#include "brw_wm.h"
 
 static const struct brw_tracked_state *gen4_atoms[] =
 {
-   &brw_vs_prog, /* must do before GS prog, state base address. */
-   &brw_ff_gs_prog, /* must do before state base address */
-
    &brw_interpolation_map,
 
    &brw_clip_prog, /* must do before state base address */
    &brw_sf_prog, /* must do before state base address */
-   &brw_wm_prog, /* must do before state base address */
 
    /* Once all the programs are done, we know how large urb entry
     * sizes need to be and can decide if we need to change the urb
@@ -98,7 +98,7 @@ static const struct brw_tracked_state *gen4_atoms[] =
    &brw_psp_urb_cbs,
 
    &brw_drawing_rect,
-   &brw_indices,
+   &brw_indices, /* must come before brw_vertices */
    &brw_index_buffer,
    &brw_vertices,
 
@@ -107,10 +107,6 @@ static const struct brw_tracked_state *gen4_atoms[] =
 
 static const struct brw_tracked_state *gen6_atoms[] =
 {
-   &brw_vs_prog, /* must do before state base address */
-   &brw_ff_gs_prog, /* must do before state base address */
-   &brw_wm_prog, /* must do before state base address */
-
    &gen6_clip_vp,
    &gen6_sf_vp,
 
@@ -128,6 +124,7 @@ static const struct brw_tracked_state *gen6_atoms[] =
    &gen6_depth_stencil_state,  /* must do before cc unit */
 
    &gen6_vs_push_constants, /* Before vs_state */
+   &gen6_gs_push_constants, /* Before gs_state */
    &gen6_wm_push_constants, /* Before wm_state */
 
    /* Surface state setup.  Must come before the VS/WM unit.  The binding
@@ -135,6 +132,8 @@ static const struct brw_tracked_state *gen6_atoms[] =
     */
    &brw_vs_pull_constants,
    &brw_vs_ubo_surfaces,
+   &brw_gs_pull_constants,
+   &brw_gs_ubo_surfaces,
    &brw_wm_pull_constants,
    &brw_wm_ubo_surfaces,
    &gen6_renderbuffer_surfaces,
@@ -146,6 +145,7 @@ static const struct brw_tracked_state *gen6_atoms[] =
 
    &brw_fs_samplers,
    &brw_vs_samplers,
+   &brw_gs_samplers,
    &gen6_sampler_state,
    &gen6_multisample_state,
 
@@ -169,24 +169,19 @@ static const struct brw_tracked_state *gen6_atoms[] =
 
    &brw_drawing_rect,
 
-   &brw_indices,
+   &brw_indices, /* must come before brw_vertices */
    &brw_index_buffer,
    &brw_vertices,
 };
 
 static const struct brw_tracked_state *gen7_atoms[] =
 {
-   &brw_vs_prog,
-   &brw_gs_prog,
-   &brw_wm_prog,
-
    /* Command packets: */
 
    /* must do before binding table pointers, cc state ptrs */
    &brw_state_base_address,
 
    &brw_cc_vp,
-   &gen7_cc_viewport_state_pointer, /* must do after brw_cc_vp */
    &gen7_sf_clip_viewport,
 
    &gen7_push_constant_space,
@@ -196,7 +191,7 @@ static const struct brw_tracked_state *gen7_atoms[] =
    &gen6_depth_stencil_state,  /* must do before cc unit */
 
    &gen6_vs_push_constants, /* Before vs_state */
-   &gen7_gs_push_constants, /* Before gs_state */
+   &gen6_gs_push_constants, /* Before gs_state */
    &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
 
    /* Surface state setup.  Must come before the VS/WM unit.  The binding
@@ -244,7 +239,7 @@ static const struct brw_tracked_state *gen7_atoms[] =
 
    &brw_drawing_rect,
 
-   &brw_indices,
+   &brw_indices, /* must come before brw_vertices */
    &brw_index_buffer,
    &brw_vertices,
 
@@ -253,15 +248,10 @@ static const struct brw_tracked_state *gen7_atoms[] =
 
 static const struct brw_tracked_state *gen8_atoms[] =
 {
-   &brw_vs_prog,
-   &brw_gs_prog,
-   &brw_wm_prog,
-
    /* Command packets: */
    &gen8_state_base_address,
 
    &brw_cc_vp,
-   &gen7_cc_viewport_state_pointer, /* must do after brw_cc_vp */
    &gen8_sf_clip_viewport,
 
    &gen7_push_constant_space,
@@ -270,7 +260,7 @@ static const struct brw_tracked_state *gen8_atoms[] =
    &gen6_color_calc_state,
 
    &gen6_vs_push_constants, /* Before vs_state */
-   &gen7_gs_push_constants, /* Before gs_state */
+   &gen6_gs_push_constants, /* Before gs_state */
    &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
 
    /* Surface state setup.  Must come before the VS/WM unit.  The binding
@@ -298,7 +288,7 @@ static const struct brw_tracked_state *gen8_atoms[] =
 
    &gen8_disable_stages,
    &gen8_vs_state,
-   &gen7_gs_state,
+   &gen8_gs_state,
    &gen8_sol_state,
    &gen6_clip_state,
    &gen8_raster_state,
@@ -329,6 +319,7 @@ static const struct brw_tracked_state *gen8_atoms[] =
    &gen8_vertices,
 
    &haswell_cut_index,
+   &gen8_pma_fix,
 };
 
 static void
@@ -341,6 +332,9 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
    if (!brw->hw_ctx)
       return;
 
+   if (brw->gen == 6)
+      intel_emit_post_sync_nonzero_flush(brw);
+
    brw_upload_invariant_state(brw);
 
    if (brw->gen >= 8) {
@@ -354,6 +348,11 @@ void brw_init_state( struct brw_context *brw )
    const struct brw_tracked_state **atoms;
    int num_atoms;
 
+   STATIC_ASSERT(ARRAY_SIZE(gen4_atoms) <= ARRAY_SIZE(brw->atoms));
+   STATIC_ASSERT(ARRAY_SIZE(gen6_atoms) <= ARRAY_SIZE(brw->atoms));
+   STATIC_ASSERT(ARRAY_SIZE(gen7_atoms) <= ARRAY_SIZE(brw->atoms));
+   STATIC_ASSERT(ARRAY_SIZE(gen8_atoms) <= ARRAY_SIZE(brw->atoms));
+
    brw_init_caches(brw);
 
    if (brw->gen >= 8) {
@@ -370,13 +369,19 @@ void brw_init_state( struct brw_context *brw )
       num_atoms = ARRAY_SIZE(gen4_atoms);
    }
 
-   brw->atoms = atoms;
    brw->num_atoms = num_atoms;
 
+   /* This is to work around brw_context::atoms being declared const.  We want
+    * it to be const, but it needs to be initialized somehow!
+    */
+   struct brw_tracked_state *context_atoms =
+      (struct brw_tracked_state *) &brw->atoms[0];
+
+   for (int i = 0; i < num_atoms; i++)
+      context_atoms[i] = *atoms[i];
+
    while (num_atoms--) {
-      assert((*atoms)->dirty.mesa |
-            (*atoms)->dirty.brw |
-            (*atoms)->dirty.cache);
+      assert((*atoms)->dirty.mesa | (*atoms)->dirty.brw);
       assert((*atoms)->emit);
       atoms++;
    }
@@ -384,7 +389,12 @@ void brw_init_state( struct brw_context *brw )
    brw_upload_initial_gpu_state(brw);
 
    brw->state.dirty.mesa = ~0;
-   brw->state.dirty.brw = ~0;
+   brw->state.dirty.brw = ~0ull;
+
+   /* ~0 is a nonsensical value which won't match anything we program, so
+    * the programming will take effect on the first time around.
+    */
+   brw->pma_stall_bits = ~0;
 
    /* Make sure that brw->state.dirty.brw has enough bits to hold all possible
     * dirty flags.
@@ -395,6 +405,7 @@ void brw_init_state( struct brw_context *brw )
    ctx->DriverFlags.NewTransformFeedbackProg = BRW_NEW_TRANSFORM_FEEDBACK;
    ctx->DriverFlags.NewRasterizerDiscard = BRW_NEW_RASTERIZER_DISCARD;
    ctx->DriverFlags.NewUniformBuffer = BRW_NEW_UNIFORM_BUFFER;
+   ctx->DriverFlags.NewTextureBuffer = BRW_NEW_TEXTURE_BUFFER;
    ctx->DriverFlags.NewAtomicBuffer = BRW_NEW_ATOMIC_BUFFER;
 }
 
@@ -410,9 +421,7 @@ void brw_destroy_state( struct brw_context *brw )
+/**
+ * Report whether two sets of state flags overlap.
+ *
+ * Returns true when a and b share at least one set bit in either their
+ * mesa or their brw flag word, and false otherwise.
+ */
 static bool
 check_state(const struct brw_state_flags *a, const struct brw_state_flags *b)
 {
-   return ((a->mesa & b->mesa) |
-          (a->brw & b->brw) |
-          (a->cache & b->cache)) != 0;
+   return ((a->mesa & b->mesa) | (a->brw & b->brw)) != 0;
 }
 
 static void accumulate_state( struct brw_state_flags *a,
@@ -420,7 +429,6 @@ static void accumulate_state( struct brw_state_flags *a,
 {
    a->mesa |= b->mesa;
    a->brw |= b->brw;
-   a->cache |= b->cache;
 }
 
 
@@ -430,11 +438,10 @@ static void xor_states( struct brw_state_flags *result,
 {
    result->mesa = a->mesa ^ b->mesa;
    result->brw = a->brw ^ b->brw;
-   result->cache = a->cache ^ b->cache;
 }
 
+/**
+ * One row of a dirty-bit statistics table: a flag value, its printable
+ * name, and a running count that brw_update_dirty_count() increments each
+ * time the flag is found set.  Tables end with an all-zero entry, which is
+ * what the table walkers test for (bit == 0).
+ *
+ * bit is 64 bits wide; brw_update_dirty_count() is handed the brw dirty
+ * word as a uint64_t, so a narrower field could not represent every flag.
+ */
 struct dirty_bit_map {
-   uint32_t bit;
+   uint64_t bit;
    char *name;
    uint32_t count;
 };
@@ -471,11 +478,19 @@ static struct dirty_bit_map mesa_bits[] = {
    DEFINE_BIT(_NEW_PROGRAM_CONSTANTS),
    DEFINE_BIT(_NEW_BUFFER_OBJECT),
    DEFINE_BIT(_NEW_FRAG_CLAMP),
-   DEFINE_BIT(_NEW_VARYING_VP_INPUTS),
+   /* Avoid sign extension problems. */
+   {(unsigned) _NEW_VARYING_VP_INPUTS, "_NEW_VARYING_VP_INPUTS", 0},
    {0, 0, 0}
 };
 
 static struct dirty_bit_map brw_bits[] = {
+   DEFINE_BIT(BRW_NEW_FS_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_BLORP_BLIT_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_SF_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_VS_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_FF_GS_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_GS_PROG_DATA),
+   DEFINE_BIT(BRW_NEW_CLIP_PROG_DATA),
    DEFINE_BIT(BRW_NEW_URB_FENCE),
    DEFINE_BIT(BRW_NEW_FRAGMENT_PROGRAM),
    DEFINE_BIT(BRW_NEW_GEOMETRY_PROGRAM),
@@ -507,41 +522,21 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_META_IN_PROGRESS),
    DEFINE_BIT(BRW_NEW_INTERPOLATION_MAP),
    DEFINE_BIT(BRW_NEW_PUSH_CONSTANT_ALLOCATION),
+   DEFINE_BIT(BRW_NEW_NUM_SAMPLES),
+   DEFINE_BIT(BRW_NEW_TEXTURE_BUFFER),
+   DEFINE_BIT(BRW_NEW_GEN4_UNIT_STATE),
+   DEFINE_BIT(BRW_NEW_CC_VP),
+   DEFINE_BIT(BRW_NEW_SF_VP),
+   DEFINE_BIT(BRW_NEW_CLIP_VP),
+   DEFINE_BIT(BRW_NEW_SAMPLER_STATE_TABLE),
+   DEFINE_BIT(BRW_NEW_VS_ATTRIB_WORKAROUNDS),
    {0, 0, 0}
 };
 
-static struct dirty_bit_map cache_bits[] = {
-   DEFINE_BIT(CACHE_NEW_CC_VP),
-   DEFINE_BIT(CACHE_NEW_CC_UNIT),
-   DEFINE_BIT(CACHE_NEW_WM_PROG),
-   DEFINE_BIT(CACHE_NEW_BLORP_BLIT_PROG),
-   DEFINE_BIT(CACHE_NEW_BLORP_CONST_COLOR_PROG),
-   DEFINE_BIT(CACHE_NEW_SAMPLER),
-   DEFINE_BIT(CACHE_NEW_WM_UNIT),
-   DEFINE_BIT(CACHE_NEW_SF_PROG),
-   DEFINE_BIT(CACHE_NEW_SF_VP),
-   DEFINE_BIT(CACHE_NEW_SF_UNIT),
-   DEFINE_BIT(CACHE_NEW_VS_UNIT),
-   DEFINE_BIT(CACHE_NEW_VS_PROG),
-   DEFINE_BIT(CACHE_NEW_FF_GS_UNIT),
-   DEFINE_BIT(CACHE_NEW_FF_GS_PROG),
-   DEFINE_BIT(CACHE_NEW_GS_PROG),
-   DEFINE_BIT(CACHE_NEW_CLIP_VP),
-   DEFINE_BIT(CACHE_NEW_CLIP_UNIT),
-   DEFINE_BIT(CACHE_NEW_CLIP_PROG),
-   {0, 0, 0}
-};
-
-
 static void
-brw_update_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
+brw_update_dirty_count(struct dirty_bit_map *bit_map, uint64_t bits)
 {
-   int i;
-
-   for (i = 0; i < 32; i++) {
-      if (bit_map[i].bit == 0)
-        return;
-
+   for (int i = 0; bit_map[i].bit != 0; i++) {
       if (bit_map[i].bit & bits)
         bit_map[i].count++;
    }
@@ -550,15 +545,25 @@ brw_update_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
+/**
+ * Dump the accumulated hit counts of a dirty-bit table to stderr.
+ *
+ * \param bit_map  table to print; the walk stops at the first entry whose
+ *                 bit field is zero.
+ *
+ * Entries whose count is 0 or 1 are skipped.
+ */
 static void
 brw_print_dirty_count(struct dirty_bit_map *bit_map)
 {
-   int i;
+   for (int i = 0; bit_map[i].bit != 0; i++) {
+      if (bit_map[i].count > 1) {
+         /* uint64_t is not "unsigned long" on every ABI (e.g. 32-bit
+          * builds), so widen the flag explicitly to match %llx.  The count
+          * is unsigned, hence %u rather than %d.
+          */
+         fprintf(stderr, "0x%016llx: %12u (%s)\n",
+                 (unsigned long long) bit_map[i].bit,
+                 bit_map[i].count, bit_map[i].name);
+      }
+   }
+}
 
-   for (i = 0; i < 32; i++) {
-      if (bit_map[i].bit == 0)
-        return;
+/**
+ * Run the per-stage program upload hooks in VS, GS, WM order.
+ *
+ * When brw->gen is below 6 the GS step calls brw_upload_ff_gs_prog();
+ * otherwise it calls brw_upload_gs_prog().  brw_upload_state() invokes
+ * this once, before it walks the state atoms.
+ *
+ * NOTE(review): presumably this takes over from the brw_vs_prog /
+ * brw_ff_gs_prog / brw_gs_prog / brw_wm_prog atoms that this patch drops
+ * from the per-generation atom lists -- confirm against those atoms' old
+ * emit callbacks.
+ */
+static void
+brw_upload_programs(struct brw_context *brw)
+{
+   brw_upload_vs_prog(brw);
 
-      fprintf(stderr, "0x%08x: %12d (%s)\n",
-             bit_map[i].bit, bit_map[i].count, bit_map[i].name);
-   }
+   if (brw->gen < 6)
+      brw_upload_ff_gs_prog(brw);
+   else
+      brw_upload_gs_prog(brw);
+
+   brw_upload_wm_prog(brw);
 }
 
 /***********************************************************************
@@ -580,8 +585,7 @@ void brw_upload_state(struct brw_context *brw)
    if (0) {
       /* Always re-emit all state. */
       state->mesa |= ~0;
-      state->brw |= ~0;
-      state->cache |= ~0;
+      state->brw |= ~0ull;
    }
 
    if (brw->fragment_program != ctx->FragmentProgram._Current) {
@@ -604,10 +608,19 @@ void brw_upload_state(struct brw_context *brw)
       brw->state.dirty.brw |= BRW_NEW_META_IN_PROGRESS;
    }
 
-   if ((state->mesa | state->cache | state->brw) == 0)
+   if (brw->num_samples != ctx->DrawBuffer->Visual.samples) {
+      brw->num_samples = ctx->DrawBuffer->Visual.samples;
+      brw->state.dirty.brw |= BRW_NEW_NUM_SAMPLES;
+   }
+
+   if ((state->mesa | state->brw) == 0)
       return;
 
-   intel_check_front_buffer_rendering(brw);
+   /* Emit Sandybridge workaround flushes on every primitive, for safety. */
+   if (brw->gen == 6)
+      intel_emit_post_sync_nonzero_flush(brw);
+
+   brw_upload_programs(brw);
 
    if (unlikely(INTEL_DEBUG)) {
       /* Debug version which enforces various sanity checks on the
@@ -619,7 +632,7 @@ void brw_upload_state(struct brw_context *brw)
       prev = *state;
 
       for (i = 0; i < brw->num_atoms; i++) {
-        const struct brw_tracked_state *atom = brw->atoms[i];
+        const struct brw_tracked_state *atom = &brw->atoms[i];
         struct brw_state_flags generated;
 
         if (check_state(state, &atom->dirty)) {
@@ -639,7 +652,7 @@ void brw_upload_state(struct brw_context *brw)
    }
    else {
       for (i = 0; i < brw->num_atoms; i++) {
-        const struct brw_tracked_state *atom = brw->atoms[i];
+        const struct brw_tracked_state *atom = &brw->atoms[i];
 
         if (check_state(state, &atom->dirty)) {
            atom->emit(brw);
@@ -649,15 +662,12 @@ void brw_upload_state(struct brw_context *brw)
 
    if (unlikely(INTEL_DEBUG & DEBUG_STATE)) {
       STATIC_ASSERT(ARRAY_SIZE(brw_bits) == BRW_NUM_STATE_BITS + 1);
-      STATIC_ASSERT(ARRAY_SIZE(cache_bits) == BRW_MAX_CACHE + 1);
 
       brw_update_dirty_count(mesa_bits, state->mesa);
       brw_update_dirty_count(brw_bits, state->brw);
-      brw_update_dirty_count(cache_bits, state->cache);
       if (dirty_count++ % 1000 == 0) {
         brw_print_dirty_count(mesa_bits);
         brw_print_dirty_count(brw_bits);
-        brw_print_dirty_count(cache_bits);
         fprintf(stderr, "\n");
       }
    }