i965: Free serialized nir after deserializing

[mesa.git] / src / mesa / drivers / dri / i965 / brw_program.c
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c

index c11ac871e58e7c43b60a5f92f26ea06a319005c1..30cc14e88a25543d1d02d4b1cd4c8f3b192b6aeb 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -40,6 +40,7 @@
  #include "util/ralloc.h"
  #include "compiler/glsl/ir.h"
  #include "compiler/glsl/glsl_to_nir.h"
+#include "compiler/nir/nir_serialize.h"
  
  #include "brw_program.h"
  #include "brw_context.h"
@@ -71,7 +72,6 @@ brw_create_nir(struct brw_context *brw,
     struct gl_context *ctx = &brw->ctx;
     const nir_shader_compiler_options *options =
        ctx->Const.ShaderCompilerOptions[stage].NirOptions;
-   bool progress;
     nir_shader *nir;
  
     /* First, lower the GLSL IR or Mesa IR to NIR */
@@ -88,8 +88,6 @@ brw_create_nir(struct brw_context *brw,
     }
     nir_validate_shader(nir);
  
-   (void)progress;
-
     nir = brw_preprocess_nir(brw->screen->compiler, nir);
  
     if (stage == MESA_SHADER_FRAGMENT) {
@@ -98,13 +96,15 @@ brw_create_nir(struct brw_context *brw,
           .fs_coord_pixel_center_integer = 1,
           .fs_coord_origin_upper_left = 1,
        };
-      _mesa_add_state_reference(prog->Parameters,
-                                (gl_state_index *) wpos_options.state_tokens);
  
+      bool progress = false;
        NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
+      if (progress) {
+         _mesa_add_state_reference(prog->Parameters,
+                                   (gl_state_index *) wpos_options.state_tokens);
+      }
     }
  
-   NIR_PASS(progress, nir, nir_lower_system_values);
     NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);
  
     return nir;
@@ -329,19 +329,77 @@ brw_get_scratch_bo(struct brw_context *brw,
  void
  brw_alloc_stage_scratch(struct brw_context *brw,
                          struct brw_stage_state *stage_state,
-                        unsigned per_thread_size,
-                        unsigned thread_count)
+                        unsigned per_thread_size)
  {
-   if (stage_state->per_thread_scratch < per_thread_size) {
-      stage_state->per_thread_scratch = per_thread_size;
+   /* Ensure the stage's scratch buffer provides at least per_thread_size
+    * for every thread.  The buffer only ever grows: if the recorded
+    * per-thread size already covers the request there is nothing to do.
+    */
+   if (stage_state->per_thread_scratch >= per_thread_size)
+      return;
+
+   stage_state->per_thread_scratch = per_thread_size;
  
-      if (stage_state->scratch_bo)
-         brw_bo_unreference(stage_state->scratch_bo);
+   /* Drop our reference to the previous buffer; a replacement is allocated
+    * at the end of this function.
+    */
+   if (stage_state->scratch_bo)
+      brw_bo_unreference(stage_state->scratch_bo);
+
+   /* The thread count is no longer supplied by the caller; it is looked up
+    * from the device info according to stage_state->stage.
+    */
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   unsigned thread_count;
+   switch(stage_state->stage) {
+   case MESA_SHADER_VERTEX:
+      thread_count = devinfo->max_vs_threads;
+      break;
+   case MESA_SHADER_TESS_CTRL:
+      thread_count = devinfo->max_tcs_threads;
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      thread_count = devinfo->max_tes_threads;
+      break;
+   case MESA_SHADER_GEOMETRY:
+      thread_count = devinfo->max_gs_threads;
+      break;
+   case MESA_SHADER_FRAGMENT:
+      thread_count = devinfo->max_wm_threads;
+      break;
+   case MESA_SHADER_COMPUTE: {
+      unsigned subslices = MAX2(brw->screen->subslice_total, 1);
+
+      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
+       *
+       * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
+       *  allocate scratch space enough so that each slice has 4 slices
+       *  allowed."
+       *
+       * According to the other driver team, this applies to compute shaders
+       * as well.  This is not currently documented at all.
+       */
+      if (devinfo->gen >= 9)
+         subslices = 4;
+
+      /* WaCSScratchSize:hsw
+       *
+       * Haswell's scratch space address calculation appears to be sparse
+       * rather than tightly packed.  The Thread ID has bits indicating
+       * which subslice, EU within a subslice, and thread within an EU
+       * it is.  There's a maximum of two slices and two subslices, so these
+       * can be stored with a single bit.  Even though there are only 10 EUs
+       * per subslice, this is stored in 4 bits, so there's an effective
+       * maximum value of 16 EUs.  Similarly, although there are only 7
+       * threads per EU, this is stored in a 3 bit number, giving an effective
+       * maximum value of 8 threads per EU.
+       *
+       * This means that we need to use 16 * 8 instead of 10 * 7 for the
+       * number of threads per subslice.
+       */
+      const unsigned scratch_ids_per_subslice =
+         devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads;
  
-      stage_state->scratch_bo =
-         brw_bo_alloc(brw->bufmgr, "shader scratch space",
-                      per_thread_size * thread_count, 4096);
+      thread_count = scratch_ids_per_subslice * subslices;
+      break;
     }
+   default:
+      unreachable("Unsupported stage!");
+   }
+
+   /* Request room for one per_thread_size region per counted thread. */
+   stage_state->scratch_bo =
+      brw_bo_alloc(brw->bufmgr, "shader scratch space",
+                   per_thread_size * thread_count, 4096);
  }
  
  void brwInitFragProgFuncs( struct dd_function_table *functions )
@@ -618,7 +676,6 @@ brw_stage_prog_data_free(const void *p)
  
     ralloc_free(prog_data->param);
     ralloc_free(prog_data->pull_param);
-   ralloc_free(prog_data->image_param);
  }
  
  void
@@ -676,10 +733,11 @@ brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
        stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
     }
  
-   if (prog->info.num_ssbos) {
+   if (prog->info.num_ssbos || prog->info.num_abos) {
+      assert(prog->info.num_abos <= BRW_MAX_ABO);
        assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
        stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
-      next_binding_table_offset += prog->info.num_ssbos;
+      next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
     } else {
        stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
     }
@@ -691,7 +749,7 @@ brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
        stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
     }
  
-   if (prog->nir->info.uses_texture_gather) {
+   if (prog->info.uses_texture_gather) {
        if (devinfo->gen >= 8) {
           stage_prog_data->binding_table.gather_texture_start =
              stage_prog_data->binding_table.texture_start;
@@ -703,13 +761,6 @@ brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
        stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
     }
  
-   if (prog->info.num_abos) {
-      stage_prog_data->binding_table.abo_start = next_binding_table_offset;
-      next_binding_table_offset += prog->info.num_abos;
-   } else {
-      stage_prog_data->binding_table.abo_start = 0xd0d0d0d0;
-   }
-
     if (prog->info.num_images) {
        stage_prog_data->binding_table.image_start = next_binding_table_offset;
        next_binding_table_offset += prog->info.num_images;
@@ -735,3 +786,24 @@ brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
     assert(next_binding_table_offset <= BRW_MAX_SURFACES);
     return next_binding_table_offset;
  }
+
+/**
+ * Make sure prog->nir is populated and release the serialized copy.
+ *
+ * When prog->nir is NULL, the shader is rebuilt with nir_deserialize() from
+ * prog->driver_cache_blob, using the NIR compiler options the context holds
+ * for \p stage.  Afterwards the blob is freed and its pointer and size are
+ * cleared, whether or not a deserialization took place on this call.
+ *
+ * \param ctx    GL context supplying Const.ShaderCompilerOptions[stage].
+ * \param prog   Program whose nir and driver_cache_blob fields are updated.
+ * \param stage  Shader stage used to index the compiler options.
+ */
+void
+brw_program_deserialize_nir(struct gl_context *ctx, struct gl_program *prog,
+                            gl_shader_stage stage)
+{
+   if (!prog->nir) {
+      /* Without NIR the blob is the only source to rebuild it from, so it
+       * has to be present and non-empty here.
+       */
+      assert(prog->driver_cache_blob && prog->driver_cache_blob_size > 0);
+      const struct nir_shader_compiler_options *options =
+         ctx->Const.ShaderCompilerOptions[stage].NirOptions;
+      struct blob_reader reader;
+      blob_reader_init(&reader, prog->driver_cache_blob,
+                       prog->driver_cache_blob_size);
+      /* NOTE(review): the NULL first argument is presumably the ralloc
+       * memory context for the new shader -- confirm in nir_serialize.h.
+       */
+      prog->nir = nir_deserialize(NULL, options, &reader);
+   }
+
+   /* Free the serialized bytes and reset the pointer and size so nothing
+    * is left referring to the released memory.
+    */
+   if (prog->driver_cache_blob) {
+      ralloc_free(prog->driver_cache_blob);
+      prog->driver_cache_blob = NULL;
+      prog->driver_cache_blob_size = 0;
+   }
+}