mesa: Rename MESA_shader_framebuffer_fetch gl_extensions bits to EXT.
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index cb73f424f33479b20f1331ae1f71c879c10860e8..527f003977b0590e1963e900239bc23e7c00bf7a 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
 #include "util/ralloc.h"
 #include "compiler/glsl/ir.h"
 #include "compiler/glsl/glsl_to_nir.h"
+#include "compiler/nir/nir_serialize.h"
 
 #include "brw_program.h"
 #include "brw_context.h"
-#include "brw_shader.h"
-#include "brw_nir.h"
+#include "compiler/brw_nir.h"
+#include "brw_defines.h"
 #include "intel_batchbuffer.h"
 
-static void
+static bool
 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
 {
    if (is_scalar) {
-      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, 0,
+      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                                type_size_scalar_bytes);
-      nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
+      return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
    } else {
-      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, 0,
+      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                                type_size_vec4_bytes);
-      nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
+      return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
    }
 }
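
For reference, the type_size callbacks passed above are what let nir_lower_io
assign byte offsets: each one reports how much uniform space a variable of a
given glsl_type consumes, so offsets can be laid out back to back. A minimal
sketch of the vec4 flavor's contract (hypothetical helper; the real
implementations ship with the brw compiler headers):

   /* Sketch only: a vec4 layout charges 16 bytes per occupied vec4 slot,
    * one slot per matrix column or array element. */
   static int
   sketch_type_size_vec4_bytes(const struct glsl_type *type)
   {
      return 16 * glsl_count_attribute_slots(type, false);
   }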
 
 nir_shader *
 brw_create_nir(struct brw_context *brw,
                const struct gl_shader_program *shader_prog,
-               const struct gl_program *prog,
+               struct gl_program *prog,
                gl_shader_stage stage,
                bool is_scalar)
 {
    struct gl_context *ctx = &brw->ctx;
    const nir_shader_compiler_options *options =
       ctx->Const.ShaderCompilerOptions[stage].NirOptions;
-   bool progress;
    nir_shader *nir;
 
    /* First, lower the GLSL IR or Mesa IR to NIR */
    if (shader_prog) {
       nir = glsl_to_nir(shader_prog, stage, options);
       nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
+      nir_lower_returns(nir);
+      nir_validate_shader(nir);
       NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                  nir_shader_get_entrypoint(nir), true, false);
    } else {
       nir = prog_to_nir(prog, options);
-      NIR_PASS_V(nir, nir_convert_to_ssa); /* turn registers into SSA */
+      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
    }
    nir_validate_shader(nir);
 
-   (void)progress;
+   /* Lower PatchVerticesIn from system value to uniform. This needs to
+    * happen before brw_preprocess_nir, since that will lower system values
+    * to intrinsics.
+    *
+    * We only do this for TES if no TCS is present, since otherwise we know
+    * the number of vertices in the patch at link time and we can lower it
+    * directly to a constant. We do this in nir_lower_tes_patch_vertices,
+    * which needs to run after brw_preprocess_nir has turned the system
+    * values into intrinsics.
+    */
+   const bool lower_patch_vertices_in_to_uniform =
+      (stage == MESA_SHADER_TESS_CTRL && brw->screen->devinfo.gen >= 8) ||
+      (stage == MESA_SHADER_TESS_EVAL &&
+       !shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);
+
+   if (lower_patch_vertices_in_to_uniform)
+      brw_nir_lower_patch_vertices_in_to_uniform(nir);
 
    nir = brw_preprocess_nir(brw->screen->compiler, nir);
 
+   if (stage == MESA_SHADER_TESS_EVAL && !lower_patch_vertices_in_to_uniform) {
+      assert(shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);
+      struct gl_linked_shader *linked_tcs =
+         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
+      uint32_t patch_vertices = linked_tcs->Program->info.tess.tcs_vertices_out;
+      nir_lower_tes_patch_vertices(nir, patch_vertices);
+   }
+
    if (stage == MESA_SHADER_FRAGMENT) {
       static const struct nir_lower_wpos_ytransform_options wpos_options = {
          .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
          .fs_coord_pixel_center_integer = 1,
          .fs_coord_origin_upper_left = 1,
       };
-      _mesa_add_state_reference(prog->Parameters,
-                                (gl_state_index *) wpos_options.state_tokens);
 
+      bool progress = false;
       NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
+      if (progress) {
+         _mesa_add_state_reference(prog->Parameters,
+                                   wpos_options.state_tokens);
+      }
    }
 
-   NIR_PASS(progress, nir, nir_lower_system_values);
    NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);
 
-   if (shader_prog) {
-      NIR_PASS_V(nir, nir_lower_samplers, shader_prog);
-      NIR_PASS_V(nir, nir_lower_atomics, shader_prog);
-   }
-
    return nir;
 }
 
+void
+brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
+{
+   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
+
+   /* Copy the info we just generated back into the gl_program */
+   const char *prog_name = prog->info.name;
+   const char *prog_label = prog->info.label;
+   prog->info = nir->info;
+   prog->info.name = prog_name;
+   prog->info.label = prog_label;
+}
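
Callers are expected to invoke this right after brw_create_nir and before
precompiling, exactly as the ARB-program paths later in this patch do:

   prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                              compiler->scalar_stage[MESA_SHADER_VERTEX]);
   brw_shader_gather_info(prog->nir, prog);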
+
 static unsigned
 get_new_program_id(struct intel_screen *screen)
 {
-   static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
-   pthread_mutex_lock(&m);
-   unsigned id = screen->program_id++;
-   pthread_mutex_unlock(&m);
-   return id;
+   return p_atomic_inc_return(&screen->program_id);
 }
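
p_atomic_inc_return() from util/u_atomic.h performs the increment and the
read as one atomic operation, which is why the mutex could go. On GCC-style
compilers the macro reduces to roughly the following (a sketch of the
equivalent builtin, not the actual expansion):

   unsigned id = __sync_add_and_fetch(&screen->program_id, 1);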
 
-static struct gl_program *brwNewProgram( struct gl_context *ctx,
-                                     GLenum target,
-                                     GLuint id )
+static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
+                                        GLuint id, bool is_arb_asm)
 {
    struct brw_context *brw = brw_context(ctx);
+   struct brw_program *prog = rzalloc(NULL, struct brw_program);
 
-   switch (target) {
-   case GL_VERTEX_PROGRAM_ARB:
-   case GL_GEOMETRY_PROGRAM_NV: {
-      struct brw_program *prog = CALLOC_STRUCT(brw_program);
-      if (prog) {
-        prog->id = get_new_program_id(brw->screen);
-
-        return _mesa_init_gl_program(&prog->program, target, id);
-      }
-      else
-        return NULL;
-   }
-
-   case GL_FRAGMENT_PROGRAM_ARB: {
-      struct brw_fragment_program *prog;
-      if (brw->gen < 6) {
-         struct gen4_fragment_program *g4_prog =
-            CALLOC_STRUCT(gen4_fragment_program);
-         prog = &g4_prog->base;
-      } else {
-         prog = CALLOC_STRUCT(brw_fragment_program);
-      }
-
-      if (prog) {
-        prog->id = get_new_program_id(brw->screen);
-
-        return _mesa_init_gl_program(&prog->program, target, id);
-      }
-      else
-        return NULL;
-   }
-
-   case GL_TESS_CONTROL_PROGRAM_NV: {
-      struct brw_tess_ctrl_program *prog = CALLOC_STRUCT(brw_tess_ctrl_program);
-      if (prog) {
-         prog->id = get_new_program_id(brw->screen);
-
-         return _mesa_init_gl_program(&prog->program, target, id);
-      } else {
-         return NULL;
-      }
-   }
-
-   case GL_TESS_EVALUATION_PROGRAM_NV: {
-      struct brw_tess_eval_program *prog = CALLOC_STRUCT(brw_tess_eval_program);
-      if (prog) {
-         prog->id = get_new_program_id(brw->screen);
+   if (prog) {
+      prog->id = get_new_program_id(brw->screen);
 
-         return _mesa_init_gl_program(&prog->program, target, id);
-      } else {
-         return NULL;
-      }
-   }
-
-   case GL_COMPUTE_PROGRAM_NV: {
-      struct brw_compute_program *prog = CALLOC_STRUCT(brw_compute_program);
-      if (prog) {
-         prog->id = get_new_program_id(brw->screen);
-
-         return _mesa_init_gl_program(&prog->program, target, id);
-      } else {
-         return NULL;
-      }
+      return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
    }
 
-   default:
-      unreachable("Unsupported target in brwNewProgram()");
-   }
+   return NULL;
 }
 
 static void brwDeleteProgram( struct gl_context *ctx,
                              struct gl_program *prog )
 {
+   struct brw_context *brw = brw_context(ctx);
+
+   /* Beware!  prog's refcount has reached zero, and it's about to be freed.
+    *
+    * In brw_upload_pipeline_state(), we compare brw->programs[i] to
+    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
+    * pointer has changed.
+    *
+    * We cannot leave brw->programs[i] as a dangling pointer to the dead
+    * program.  malloc() may allocate the same memory for a new gl_program,
+    * causing us to see matching pointers...but totally different programs.
+    *
+    * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
+    * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
+    * would cause us to see matching pointers (NULL == NULL), and fail to
+    * detect that a program has changed since our last draw.
+    *
+    * So, set it to a bogus gl_program pointer that will never match,
+    * causing us to properly reevaluate the state on our next draw.
+    *
+    * Getting this wrong causes heisenbugs which are very hard to catch,
+    * as you need a very specific allocation pattern to hit the problem.
+    */
+   static const struct gl_program deleted_program;
+
+   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+      if (brw->programs[i] == prog)
+         brw->programs[i] = (struct gl_program *) &deleted_program;
+   }
+
    _mesa_delete_program( ctx, prog );
 }
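
The sentinel works because the comparison described in the comment is purely
pointer identity; nothing dereferences brw->programs[i] before revalidation.
A hedged sketch of that check (illustrative only; the real logic lives in the
brw_upload_pipeline_state() path):

   /* Illustrative only: &deleted_program never matches a live program
    * and is never NULL, so a draw after deletion always reflags state. */
   struct gl_program *new_fp = ctx->FragmentProgram._Current;
   if (brw->programs[MESA_SHADER_FRAGMENT] != new_fp) {
      brw->programs[MESA_SHADER_FRAGMENT] = new_fp;
      brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
   }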
 
@@ -211,36 +212,36 @@ brwProgramStringNotify(struct gl_context *ctx,
                       GLenum target,
                       struct gl_program *prog)
 {
-   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->IsPositionInvariant);
+   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);
 
    struct brw_context *brw = brw_context(ctx);
    const struct brw_compiler *compiler = brw->screen->compiler;
 
    switch (target) {
    case GL_FRAGMENT_PROGRAM_ARB: {
-      struct brw_fragment_program *newFP = brw_fragment_program(prog);
-      const struct brw_fragment_program *curFP =
-         brw_fragment_program_const(brw->fragment_program);
+      struct brw_program *newFP = brw_program(prog);
+      const struct brw_program *curFP =
+         brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);
 
       if (newFP == curFP)
         brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
       newFP->id = get_new_program_id(brw->screen);
 
-      brw_add_texrect_params(prog);
-
       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
 
-      brw_fs_precompile(ctx, NULL, prog);
+      brw_shader_gather_info(prog->nir, prog);
+
+      brw_fs_precompile(ctx, prog);
       break;
    }
    case GL_VERTEX_PROGRAM_ARB: {
       struct brw_program *newVP = brw_program(prog);
       const struct brw_program *curVP =
-         brw_program_const(brw->vertex_program);
+         brw_program_const(brw->programs[MESA_SHADER_VERTEX]);
 
       if (newVP == curVP)
         brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
-      if (newVP->program.IsPositionInvariant) {
+      if (newVP->program.arb.IsPositionInvariant) {
         _mesa_insert_mvp_code(ctx, &newVP->program);
       }
       newVP->id = get_new_program_id(brw->screen);
@@ -249,12 +250,12 @@ brwProgramStringNotify(struct gl_context *ctx,
        */
       _tnl_program_string(ctx, target, prog);
 
-      brw_add_texrect_params(prog);
-
       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                  compiler->scalar_stage[MESA_SHADER_VERTEX]);
 
-      brw_vs_precompile(ctx, NULL, prog);
+      brw_shader_gather_info(prog->nir, prog);
+
+      brw_vs_precompile(ctx, prog);
       break;
    }
    default:
@@ -275,10 +276,11 @@ static void
 brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
 {
    struct brw_context *brw = brw_context(ctx);
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    unsigned bits = (PIPE_CONTROL_DATA_CACHE_FLUSH |
                     PIPE_CONTROL_NO_WRITE |
                     PIPE_CONTROL_CS_STALL);
-   assert(brw->gen >= 7 && brw->gen <= 9);
+   assert(devinfo->gen >= 7 && devinfo->gen <= 11);
 
    if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                    GL_ELEMENT_ARRAY_BARRIER_BIT |
@@ -292,29 +294,32 @@ brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
    if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
       bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 
-   if (barriers & GL_TEXTURE_UPDATE_BARRIER_BIT)
-      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
+   if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
+                   GL_PIXEL_BUFFER_BARRIER_BIT))
+      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+               PIPE_CONTROL_RENDER_TARGET_FLUSH);
 
    if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
-      bits |= (PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                PIPE_CONTROL_RENDER_TARGET_FLUSH);
 
    /* Typed surface messages are handled by the render cache on IVB, so we
     * need to flush it too.
     */
-   if (brw->gen == 7 && !brw->is_haswell)
+   if (devinfo->gen == 7 && !devinfo->is_haswell)
       bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
 
    brw_emit_pipe_control_flush(brw, bits);
 }
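
For context, this hook is wired to dd_function_table::MemoryBarrier (see
brwInitFragProgFuncs below), so an application call like the following lands
here with the corresponding GL bits set:

   /* Make shader image stores visible to subsequent texture fetches;
    * this hits the GL_TEXTURE_FETCH_BARRIER_BIT case above. */
   glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT |
                   GL_TEXTURE_FETCH_BARRIER_BIT);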
 
 static void
-brw_blend_barrier(struct gl_context *ctx)
+brw_framebuffer_fetch_barrier(struct gl_context *ctx)
 {
    struct brw_context *brw = brw_context(ctx);
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
-   if (!ctx->Extensions.MESA_shader_framebuffer_fetch) {
-      if (brw->gen >= 6) {
+   if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
+      if (devinfo->gen >= 6) {
          brw_emit_pipe_control_flush(brw,
                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                      PIPE_CONTROL_CS_STALL);
@@ -327,38 +332,19 @@ brw_blend_barrier(struct gl_context *ctx)
    }
 }
 
-void
-brw_add_texrect_params(struct gl_program *prog)
-{
-   for (int texunit = 0; texunit < BRW_MAX_TEX_UNIT; texunit++) {
-      if (!(prog->TexturesUsed[texunit] & (1 << TEXTURE_RECT_INDEX)))
-         continue;
-
-      int tokens[STATE_LENGTH] = {
-         STATE_INTERNAL,
-         STATE_TEXRECT_SCALE,
-         texunit,
-         0,
-         0
-      };
-
-      _mesa_add_state_reference(prog->Parameters, (gl_state_index *)tokens);
-   }
-}
-
 void
 brw_get_scratch_bo(struct brw_context *brw,
-                  drm_intel_bo **scratch_bo, int size)
+                  struct brw_bo **scratch_bo, int size)
 {
-   drm_intel_bo *old_bo = *scratch_bo;
+   struct brw_bo *old_bo = *scratch_bo;
 
    if (old_bo && old_bo->size < size) {
-      drm_intel_bo_unreference(old_bo);
+      brw_bo_unreference(old_bo);
       old_bo = NULL;
    }
 
    if (!old_bo) {
-      *scratch_bo = drm_intel_bo_alloc(brw->bufmgr, "scratch bo", size, 4096);
+      *scratch_bo = brw_bo_alloc(brw->bufmgr, "scratch bo", size, 4096);
    }
 }
 
@@ -369,19 +355,81 @@ brw_get_scratch_bo(struct brw_context *brw,
 void
 brw_alloc_stage_scratch(struct brw_context *brw,
                         struct brw_stage_state *stage_state,
-                        unsigned per_thread_size,
-                        unsigned thread_count)
+                        unsigned per_thread_size)
 {
-   if (stage_state->per_thread_scratch < per_thread_size) {
-      stage_state->per_thread_scratch = per_thread_size;
+   if (stage_state->per_thread_scratch >= per_thread_size)
+      return;
+
+   stage_state->per_thread_scratch = per_thread_size;
+
+   if (stage_state->scratch_bo)
+      brw_bo_unreference(stage_state->scratch_bo);
 
-      if (stage_state->scratch_bo)
-         drm_intel_bo_unreference(stage_state->scratch_bo);
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   unsigned thread_count;
+   switch (stage_state->stage) {
+   case MESA_SHADER_VERTEX:
+      thread_count = devinfo->max_vs_threads;
+      break;
+   case MESA_SHADER_TESS_CTRL:
+      thread_count = devinfo->max_tcs_threads;
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      thread_count = devinfo->max_tes_threads;
+      break;
+   case MESA_SHADER_GEOMETRY:
+      thread_count = devinfo->max_gs_threads;
+      break;
+   case MESA_SHADER_FRAGMENT:
+      thread_count = devinfo->max_wm_threads;
+      break;
+   case MESA_SHADER_COMPUTE: {
+      unsigned subslices = MAX2(brw->screen->subslice_total, 1);
+
+      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
+       *
+       * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
+       *  allocate scratch space enough so that each slice has 4 slices
+       *  allowed."
+       *
+       * According to the other driver team, this applies to compute shaders
+       * as well.  This is not currently documented at all.
+       *
+       * brw->screen->subslice_total is the TOTAL number of subslices;
+       * here we pretend every slice has 4 subslices, regardless of the
+       * actual number of subslices per slice.
+       */
+      if (devinfo->gen >= 9)
+         subslices = 4 * devinfo->num_slices;
+
+      /* WaCSScratchSize:hsw
+       *
+       * Haswell's scratch space address calculation appears to be sparse
+       * rather than tightly packed.  The Thread ID has bits indicating
+       * which subslice, EU within a subslice, and thread within an EU
+       * it is.  There's a maximum of two slices and two subslices, so these
+       * can be stored with a single bit.  Even though there are only 10 EUs
+       * per subslice, this is stored in 4 bits, so there's an effective
+       * maximum value of 16 EUs.  Similarly, although there are only 7
+       * threads per EU, this is stored in a 3 bit number, giving an effective
+       * maximum value of 8 threads per EU.
+       *
+       * This means that we need to use 16 * 8 instead of 10 * 7 for the
+       * number of threads per subslice.
+       */
+      const unsigned scratch_ids_per_subslice =
+         devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads;
 
-      stage_state->scratch_bo =
-         drm_intel_bo_alloc(brw->bufmgr, "shader scratch space",
-                            per_thread_size * thread_count, 4096);
+      thread_count = scratch_ids_per_subslice * subslices;
+      break;
    }
+   default:
+      unreachable("Unsupported stage!");
+   }
+
+   stage_state->scratch_bo =
+      brw_bo_alloc(brw->bufmgr, "shader scratch space",
+                   per_thread_size * thread_count, 4096);
 }
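
Worked numbers for the Haswell workaround: the sparse thread-ID space is
16 EUs x 8 threads = 128 IDs per subslice rather than the physical
10 x 7 = 70, so the BO must cover IDs that can never be live. A sketch with
assumed figures (2 subslices, 64 KB of scratch per thread):

   unsigned ids     = 16 * 8;           /* 128 sparse IDs per subslice */
   unsigned threads = ids * 2;          /* 256 for two subslices       */
   unsigned size    = 65536 * threads;  /* 16 MB scratch BO            */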
 
 void brwInitFragProgFuncs( struct dd_function_table *functions )
@@ -392,11 +440,10 @@ void brwInitFragProgFuncs( struct dd_function_table *functions )
    functions->DeleteProgram = brwDeleteProgram;
    functions->ProgramStringNotify = brwProgramStringNotify;
 
-   functions->NewShader = brw_new_shader;
    functions->LinkShader = brw_link_shader;
 
    functions->MemoryBarrier = brw_memory_barrier;
-   functions->BlendBarrier = brw_blend_barrier;
+   functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
 }
 
 struct shader_times {
@@ -410,8 +457,8 @@ brw_init_shader_time(struct brw_context *brw)
 {
    const int max_entries = 2048;
    brw->shader_time.bo =
-      drm_intel_bo_alloc(brw->bufmgr, "shader time",
-                         max_entries * SHADER_TIME_STRIDE * 3, 4096);
+      brw_bo_alloc(brw->bufmgr, "shader time",
+                   max_entries * BRW_SHADER_TIME_STRIDE * 3, 4096);
    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
@@ -586,21 +633,20 @@ brw_collect_shader_time(struct brw_context *brw)
     * delaying reading the reports, but it doesn't look like it's a big
     * overhead compared to the cost of tracking the time in the first place.
     */
-   drm_intel_bo_map(brw->shader_time.bo, true);
-   void *bo_map = brw->shader_time.bo->virtual;
+   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);
 
    for (int i = 0; i < brw->shader_time.num_entries; i++) {
-      uint32_t *times = bo_map + i * 3 * SHADER_TIME_STRIDE;
+      uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;
 
-      brw->shader_time.cumulative[i].time += times[SHADER_TIME_STRIDE * 0 / 4];
-      brw->shader_time.cumulative[i].written += times[SHADER_TIME_STRIDE * 1 / 4];
-      brw->shader_time.cumulative[i].reset += times[SHADER_TIME_STRIDE * 2 / 4];
+      brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
+      brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
+      brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
    }
 
    /* Zero the BO out to clear it out for our next collection.
     */
    memset(bo_map, 0, brw->shader_time.bo->size);
-   drm_intel_bo_unmap(brw->shader_time.bo);
+   brw_bo_unmap(brw->shader_time.bo);
 }
 
 void
@@ -623,29 +669,25 @@ brw_collect_and_report_shader_time(struct brw_context *brw)
  * change their lifetimes compared to normal operation.
  */
 int
-brw_get_shader_time_index(struct brw_context *brw,
-                          struct gl_shader_program *shader_prog,
-                          struct gl_program *prog,
-                          enum shader_time_shader_type type)
+brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
+                          enum shader_time_shader_type type, bool is_glsl_sh)
 {
    int shader_time_index = brw->shader_time.num_entries++;
    assert(shader_time_index < brw->shader_time.max_entries);
    brw->shader_time.types[shader_time_index] = type;
 
-   int id = shader_prog ? shader_prog->Name : prog->Id;
    const char *name;
-   if (id == 0) {
+   if (prog->Id == 0) {
       name = "ff";
-   } else if (!shader_prog) {
-      name = "prog";
-   } else if (shader_prog->Label) {
-      name = ralloc_strdup(brw->shader_time.names, shader_prog->Label);
+   } else if (is_glsl_sh) {
+      name = prog->info.label ?
+         ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
    } else {
-      name = "glsl";
+      name = "prog";
    }
 
    brw->shader_time.names[shader_time_index] = name;
-   brw->shader_time.ids[shader_time_index] = id;
+   brw->shader_time.ids[shader_time_index] = prog->Id;
 
    return shader_time_index;
 }
@@ -653,7 +695,7 @@ brw_get_shader_time_index(struct brw_context *brw,
 void
 brw_destroy_shader_time(struct brw_context *brw)
 {
-   drm_intel_bo_unreference(brw->shader_time.bo);
+   brw_bo_unreference(brw->shader_time.bo);
    brw->shader_time.bo = NULL;
 }
 
@@ -664,26 +706,14 @@ brw_stage_prog_data_free(const void *p)
 
    ralloc_free(prog_data->param);
    ralloc_free(prog_data->pull_param);
-   ralloc_free(prog_data->image_param);
 }
 
 void
-brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog,
-            struct gl_linked_shader *shader, struct gl_program *prog)
+brw_dump_arb_asm(const char *stage, struct gl_program *prog)
 {
-   if (shader_prog) {
-      if (shader->ir) {
-         fprintf(stderr,
-                 "GLSL IR for native %s shader %d:\n",
-                 stage, shader_prog->Name);
-         _mesa_print_ir(stderr, shader->ir, NULL);
-         fprintf(stderr, "\n\n");
-      }
-   } else {
-      fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
-              stage, prog->Id, stage);
-      _mesa_print_program(prog);
-   }
+   fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
+           stage, prog->Id, stage);
+   _mesa_print_program(prog);
 }
 
 void
@@ -691,7 +721,8 @@ brw_setup_tex_for_precompile(struct brw_context *brw,
                              struct brw_sampler_prog_key_data *tex,
                              struct gl_program *prog)
 {
-   const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   const bool has_shader_channel_select =
+      devinfo->is_haswell || devinfo->gen >= 8;
    unsigned sampler_count = util_last_bit(prog->SamplersUsed);
    for (unsigned i = 0; i < sampler_count; i++) {
       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
@@ -704,3 +735,117 @@ brw_setup_tex_for_precompile(struct brw_context *brw,
       }
    }
 }
+
+/**
+ * Sets up the starting offsets for the groups of binding table entries
+ * common to all pipeline stages.
+ *
+ * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
+ * unused, and also so that adding small offsets to them will still trip our
+ * asserts that surface indices are < BRW_MAX_SURFACES.
+ */
+uint32_t
+brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
+                                        const struct gl_program *prog,
+                                        struct brw_stage_prog_data *stage_prog_data,
+                                        uint32_t next_binding_table_offset)
+{
+   int num_textures = util_last_bit(prog->SamplersUsed);
+
+   stage_prog_data->binding_table.texture_start = next_binding_table_offset;
+   next_binding_table_offset += num_textures;
+
+   if (prog->info.num_ubos) {
+      assert(prog->info.num_ubos <= BRW_MAX_UBO);
+      stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
+      next_binding_table_offset += prog->info.num_ubos;
+   } else {
+      stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
+   }
+
+   if (prog->info.num_ssbos || prog->info.num_abos) {
+      assert(prog->info.num_abos <= BRW_MAX_ABO);
+      assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
+      stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
+      next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
+   } else {
+      stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
+   }
+
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+      stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
+      next_binding_table_offset++;
+   } else {
+      stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
+   }
+
+   if (prog->info.uses_texture_gather) {
+      if (devinfo->gen >= 8) {
+         stage_prog_data->binding_table.gather_texture_start =
+            stage_prog_data->binding_table.texture_start;
+      } else {
+         stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
+         next_binding_table_offset += num_textures;
+      }
+   } else {
+      stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
+   }
+
+   if (prog->info.num_images) {
+      stage_prog_data->binding_table.image_start = next_binding_table_offset;
+      next_binding_table_offset += prog->info.num_images;
+   } else {
+      stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
+   }
+
+   /* This may or may not be used depending on how the compile goes. */
+   stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
+   next_binding_table_offset++;
+
+   /* Plane 0 is just the regular texture section */
+   stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;
+
+   stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
+   next_binding_table_offset += num_textures;
+
+   stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
+   next_binding_table_offset += num_textures;
+
+   /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */
+
+   assert(next_binding_table_offset <= BRW_MAX_SURFACES);
+   return next_binding_table_offset;
+}
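
A hedged sketch of a call site (names illustrative; real callers reserve any
stage-specific surfaces, such as the FS render targets, before handing in the
next free slot):

   uint32_t next = 0;  /* or the slot after stage-specific entries */
   next = brw_assign_common_binding_table_offsets(devinfo, prog,
                                                  &prog_data->base,
                                                  next);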
+
+void
+brw_program_serialize_nir(struct gl_context *ctx, struct gl_program *prog)
+{
+   struct blob writer;
+   blob_init(&writer);
+   nir_serialize(&writer, prog->nir);
+   prog->driver_cache_blob = ralloc_size(NULL, writer.size);
+   memcpy(prog->driver_cache_blob, writer.data, writer.size);
+   prog->driver_cache_blob_size = writer.size;
+   blob_finish(&writer);
+}
+
+void
+brw_program_deserialize_nir(struct gl_context *ctx, struct gl_program *prog,
+                            gl_shader_stage stage)
+{
+   if (!prog->nir) {
+      assert(prog->driver_cache_blob && prog->driver_cache_blob_size > 0);
+      const struct nir_shader_compiler_options *options =
+         ctx->Const.ShaderCompilerOptions[stage].NirOptions;
+      struct blob_reader reader;
+      blob_reader_init(&reader, prog->driver_cache_blob,
+                       prog->driver_cache_blob_size);
+      prog->nir = nir_deserialize(NULL, options, &reader);
+   }
+
+   if (prog->driver_cache_blob) {
+      ralloc_free(prog->driver_cache_blob);
+      prog->driver_cache_blob = NULL;
+      prog->driver_cache_blob_size = 0;
+   }
+}
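
Round-trip usage of the two helpers above, as the shader cache would drive
them (sketch; the cache glue itself lives elsewhere):

   /* Store side: snapshot the NIR into prog->driver_cache_blob. */
   brw_program_serialize_nir(ctx, prog);

   /* Hit side: rebuild prog->nir from the blob; the helper also frees
    * the blob once it has been consumed. */
   brw_program_deserialize_nir(ctx, prog, MESA_SHADER_FRAGMENT);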