i965: call brw_nir_lower_uniforms() after uniform linking is complete
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index a51349951613aa81ac29ef44bb10d13e64995292..6bb6f84182d8cadff57cb4bf086f6f9cef7d322b 100644
@@ -30,7 +30,7 @@
   */
 
 #include <pthread.h>
-#include "main/imports.h"
+#include "main/glspirv.h"
 #include "program/prog_parameter.h"
 #include "program/prog_print.h"
 #include "program/prog_to_nir.h"
@@ -39,8 +39,9 @@
 #include "tnl/tnl.h"
 #include "util/ralloc.h"
 #include "compiler/glsl/ir.h"
+#include "compiler/glsl/program.h"
+#include "compiler/glsl/gl_nir.h"
 #include "compiler/glsl/glsl_to_nir.h"
-#include "compiler/nir/nir_serialize.h"
 
 #include "brw_program.h"
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
 
+#include "brw_cs.h"
+#include "brw_gs.h"
+#include "brw_vs.h"
+#include "brw_wm.h"
+#include "brw_state.h"
+
+#include "main/shaderapi.h"
+#include "main/shaderobj.h"
+
 static bool
 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
 {
@@ -62,6 +72,10 @@ brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
    }
 }
 
+static struct gl_program *brwNewProgram(struct gl_context *ctx,
+                                        gl_shader_stage stage,
+                                        GLuint id, bool is_arb_asm);
+
 nir_shader *
 brw_create_nir(struct brw_context *brw,
                const struct gl_shader_program *shader_prog,
@@ -69,51 +83,67 @@ brw_create_nir(struct brw_context *brw,
                gl_shader_stage stage,
                bool is_scalar)
 {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    struct gl_context *ctx = &brw->ctx;
    const nir_shader_compiler_options *options =
       ctx->Const.ShaderCompilerOptions[stage].NirOptions;
    nir_shader *nir;
 
-   /* First, lower the GLSL IR or Mesa IR to NIR */
+   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
    if (shader_prog) {
-      nir = glsl_to_nir(shader_prog, stage, options);
-      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
-      nir_lower_returns(nir);
-      nir_validate_shader(nir);
+      if (shader_prog->data->spirv) {
+         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
+      } else {
+         nir = glsl_to_nir(ctx, shader_prog, stage, options);
+
+         /* Remap attribute locations to slots so that attributes needing
+          * two slots occupy two locations. For instance, given a dvec3
+          * attr0 in location 0 and a vec4 attr1 in location 1 in the IR,
+          * in NIR attr0 will use slots 0 and 1, and attr1 will use slot 2.
+          */
+         if (nir->info.stage == MESA_SHADER_VERTEX)
+            nir_remap_dual_slot_attributes(nir, &prog->DualSlotInputs);
+      }
+      assert(nir);
+
+      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out,
+                                NULL);
+      nir_validate_shader(nir, "after glsl_to_nir or spirv_to_nir");
       NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                  nir_shader_get_entrypoint(nir), true, false);
    } else {
       nir = prog_to_nir(prog, options);
       NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
    }
-   nir_validate_shader(nir);
+   nir_validate_shader(nir, "before brw_preprocess_nir");
 
-   /* Lower PatchVerticesIn from system value to uniform. This needs to
-    * happen before brw_preprocess_nir, since that will lower system values
-    * to intrinsics.
-    *
-    * We only do this for TES if no TCS is present, since otherwise we know
-    * the number of vertices in the patch at link time and we can lower it
-    * directly to a constant. We do this in nir_lower_patch_vertices, which
-    * needs to run after brw_nir_preprocess has turned the system values
-    * into intrinsics.
-    */
-   const bool lower_patch_vertices_in_to_uniform =
-      (stage == MESA_SHADER_TESS_CTRL && brw->screen->devinfo.gen >= 8) ||
-      (stage == MESA_SHADER_TESS_EVAL &&
-       !shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);
+   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 
-   if (lower_patch_vertices_in_to_uniform)
-      brw_nir_lower_patch_vertices_in_to_uniform(nir);
+   if (!ctx->SoftFP64 && nir->info.uses_64bit &&
+       (options->lower_doubles_options & nir_lower_fp64_full_software)) {
+      ctx->SoftFP64 = glsl_float64_funcs_to_nir(ctx, options);
+   }
+
+   brw_preprocess_nir(brw->screen->compiler, nir, ctx->SoftFP64);
 
-   nir = brw_preprocess_nir(brw->screen->compiler, nir);
+   if (stage == MESA_SHADER_TESS_CTRL) {
+      /* Lower gl_PatchVerticesIn from a system value to a uniform on Gen8+. */
+      static const gl_state_index16 tokens[STATE_LENGTH] =
+         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
+      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
+   }
 
-   if (stage == MESA_SHADER_TESS_EVAL && !lower_patch_vertices_in_to_uniform) {
-      assert(shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);
-      struct gl_linked_shader *linked_tcs =
+   if (stage == MESA_SHADER_TESS_EVAL) {
+      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
+       * a uniform if we don't.
+       */
+      struct gl_linked_shader *tcs =
          shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
-      uint32_t patch_vertices = linked_tcs->Program->info.tess.tcs_vertices_out;
-      nir_lower_tes_patch_vertices(nir, patch_vertices);
+      uint32_t static_patch_vertices =
+         tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
+      static const gl_state_index16 tokens[STATE_LENGTH] =
+         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
+      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
    }
 
    if (stage == MESA_SHADER_FRAGMENT) {
@@ -131,11 +161,46 @@ brw_create_nir(struct brw_context *brw,
       }
    }
 
-   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);
-
    return nir;
 }
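The two gl_PatchVerticesIn branches above share one contract with nir_lower_patch_vertices: a nonzero static count folds the value to a compile-time constant, while a zero count plus a STATE_INTERNAL token array replaces it with a driver-uploaded built-in uniform. Below is a standalone sketch of that decision with hypothetical names; it models the contract, it is not the NIR API.

#include <stdio.h>

/* Standalone sketch (not part of the patch): model of the
 * nir_lower_patch_vertices contract.  A nonzero static count means
 * "fold to a constant"; zero means "read a driver uniform instead".
 */
static unsigned
pick_static_patch_vertices(const unsigned *tcs_vertices_out /* NULL: no TCS */)
{
   return tcs_vertices_out ? *tcs_vertices_out : 0;
}

int
main(void)
{
   unsigned four = 4;
   /* TES linked with a TCS declaring layout(vertices = 4): constant 4. */
   printf("with TCS: %u\n", pick_static_patch_vertices(&four));
   /* TES with no TCS: 0, i.e. lower to a uniform. */
   printf("no TCS:   %u\n", pick_static_patch_vertices(NULL));
   return 0;
}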
 
+static void
+shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
+{
+   assert(glsl_type_is_vector_or_scalar(type));
+
+   uint32_t comp_size = glsl_type_is_boolean(type)
+      ? 4 : glsl_get_bit_size(type) / 8;
+   unsigned length = glsl_get_vector_elements(type);
+   *size = comp_size * length;
+   *align = comp_size * (length == 3 ? 4 : length);
+}
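shared_type_info() above encodes the usual rule for vector layout in shared memory: size is component size times component count, booleans take 4 bytes, and a 3-component vector aligns as if it had 4 components. A standalone sketch of the same rule with plain integers standing in for glsl_type:

#include <assert.h>
#include <stdio.h>

/* Standalone sketch (not part of the patch): size/align for a vector of
 * `components` elements of `bit_size` bits each.  Booleans would pass
 * bit_size = 32 here, since they are stored as 32-bit values.
 */
static void
vector_size_align(unsigned bit_size, unsigned components,
                  unsigned *size, unsigned *align)
{
   assert(components >= 1 && components <= 4);
   unsigned comp_size = bit_size / 8;
   *size = comp_size * components;
   /* vec3 aligns like vec4; everything else aligns to its own size. */
   *align = comp_size * (components == 3 ? 4 : components);
}

int
main(void)
{
   unsigned size, align;
   vector_size_align(32, 3, &size, &align);
   printf("vec3: size=%u align=%u\n", size, align);   /* 12 and 16 */
   return 0;
}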
+
+void
+brw_nir_lower_resources(nir_shader *nir, struct gl_shader_program *shader_prog,
+                        struct gl_program *prog,
+                        const struct gen_device_info *devinfo)
+{
+   NIR_PASS_V(nir, brw_nir_lower_uniforms, nir->options->lower_to_scalar);
+   NIR_PASS_V(prog->nir, gl_nir_lower_samplers, shader_prog);
+   prog->info.textures_used = prog->nir->info.textures_used;
+   prog->info.textures_used_by_txf = prog->nir->info.textures_used_by_txf;
+
+   NIR_PASS_V(prog->nir, brw_nir_lower_image_load_store, devinfo, NULL);
+
+   if (prog->nir->info.stage == MESA_SHADER_COMPUTE &&
+       shader_prog->data->spirv) {
+      NIR_PASS_V(prog->nir, nir_lower_vars_to_explicit_types,
+                 nir_var_mem_shared, shared_type_info);
+      NIR_PASS_V(prog->nir, nir_lower_explicit_io,
+                 nir_var_mem_shared, nir_address_format_32bit_offset);
+   }
+
+   NIR_PASS_V(prog->nir, gl_nir_lower_buffers, shader_prog);
+   /* Do a round of constant folding to clean up address calculations */
+   NIR_PASS_V(prog->nir, nir_opt_constant_folding);
+}
+
 void
 brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
 {
@@ -155,7 +220,8 @@ get_new_program_id(struct intel_screen *screen)
    return p_atomic_inc_return(&screen->program_id);
 }
 
-static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
+static struct gl_program *brwNewProgram(struct gl_context *ctx,
+                                        gl_shader_stage stage,
                                         GLuint id, bool is_arb_asm)
 {
    struct brw_context *brw = brw_context(ctx);
@@ -164,7 +230,7 @@ static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
    if (prog) {
       prog->id = get_new_program_id(brw->screen);
 
-      return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
+      return _mesa_init_gl_program(&prog->program, stage, id, is_arb_asm);
    }
 
    return NULL;
@@ -225,10 +291,13 @@ brwProgramStringNotify(struct gl_context *ctx,
 
       if (newFP == curFP)
         brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
+      _mesa_program_fragment_position_to_sysval(&newFP->program);
       newFP->id = get_new_program_id(brw->screen);
 
       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
 
+      brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);
+
       brw_shader_gather_info(prog->nir, prog);
 
       brw_fs_precompile(ctx, prog);
@@ -253,6 +322,8 @@ brwProgramStringNotify(struct gl_context *ctx,
       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                  compiler->scalar_stage[MESA_SHADER_VERTEX]);
 
+      brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);
+
       brw_shader_gather_info(prog->nir, prog);
 
       brw_vs_precompile(ctx, prog);
@@ -277,9 +348,7 @@ brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
 {
    struct brw_context *brw = brw_context(ctx);
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   unsigned bits = (PIPE_CONTROL_DATA_CACHE_FLUSH |
-                    PIPE_CONTROL_NO_WRITE |
-                    PIPE_CONTROL_CS_STALL);
+   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
    assert(devinfo->gen >= 7 && devinfo->gen <= 11);
 
    if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
@@ -313,12 +382,12 @@ brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
 }
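The elided body of brw_memory_barrier() follows a single pattern: start from the data-cache flush and CS stall above, then OR in further PIPE_CONTROL bits for each requested GL barrier class before emitting one flush. A generic sketch of that accumulation, with made-up flag values rather than the real GL/PIPE_CONTROL constants:

#include <stdint.h>
#include <stdio.h>

/* Standalone sketch (not part of the patch); flag values are invented. */
enum { BARRIER_VERTEX_ATTRIB = 1u << 0, BARRIER_TEXTURE_FETCH = 1u << 1 };
enum { FLUSH_DATA_CACHE = 1u << 0, CS_STALL = 1u << 1,
       VF_CACHE_INVALIDATE = 1u << 2, TEX_CACHE_INVALIDATE = 1u << 3 };

static uint32_t
barriers_to_pipe_control_bits(uint32_t barriers)
{
   uint32_t bits = FLUSH_DATA_CACHE | CS_STALL;
   if (barriers & BARRIER_VERTEX_ATTRIB)
      bits |= VF_CACHE_INVALIDATE;
   if (barriers & BARRIER_TEXTURE_FETCH)
      bits |= TEX_CACHE_INVALIDATE;
   return bits;
}

int
main(void)
{
   printf("bits = 0x%x\n",
          (unsigned) barriers_to_pipe_control_bits(BARRIER_VERTEX_ATTRIB));
   return 0;
}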
 
 static void
-brw_blend_barrier(struct gl_context *ctx)
+brw_framebuffer_fetch_barrier(struct gl_context *ctx)
 {
    struct brw_context *brw = brw_context(ctx);
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
-   if (!ctx->Extensions.MESA_shader_framebuffer_fetch) {
+   if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
       if (devinfo->gen >= 6) {
          brw_emit_pipe_control_flush(brw,
                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
@@ -344,7 +413,8 @@ brw_get_scratch_bo(struct brw_context *brw,
    }
 
    if (!old_bo) {
-      *scratch_bo = brw_bo_alloc(brw->bufmgr, "scratch bo", size, 4096);
+      *scratch_bo =
+         brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
    }
 }
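brw_get_scratch_bo() (only partially shown here) uses a grow-only strategy: keep the current buffer when it is already large enough, otherwise release it and allocate a bigger one, now placed in the dedicated scratch memzone. A generic sketch of the pattern, with a plain malloc-backed buffer standing in for brw_bo:

#include <stdlib.h>

/* Standalone sketch (not part of the patch): grow-only buffer reuse. */
struct buffer { size_t size; void *data; };

static void
get_scratch_buffer(struct buffer **slot, size_t size)
{
   if (*slot && (*slot)->size < size) {   /* too small: release it */
      free((*slot)->data);
      free(*slot);
      *slot = NULL;
   }
   if (!*slot) {                          /* (re)allocate at the new size */
      struct buffer *b = malloc(sizeof(*b));
      b->size = size;
      b->data = malloc(size);
      *slot = b;
   }
}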
 
@@ -398,27 +468,54 @@ brw_alloc_stage_scratch(struct brw_context *brw,
       * brw->screen->subslice_total is the TOTAL number of subslices,
       * but for scratch sizing we pretend there are 4 subslices per
       * slice rather than the actual number.
+       *
+       * For ICL, scratch space allocation is based on the number of threads
+       * in the base configuration.
        */
-      if (devinfo->gen >= 9)
+      if (devinfo->gen == 11)
+         subslices = 8;
+      else if (devinfo->gen >= 9 && devinfo->gen < 11)
          subslices = 4 * brw->screen->devinfo.num_slices;
 
-      /* WaCSScratchSize:hsw
-       *
-       * Haswell's scratch space address calculation appears to be sparse
-       * rather than tightly packed.  The Thread ID has bits indicating
-       * which subslice, EU within a subslice, and thread within an EU
-       * it is.  There's a maximum of two slices and two subslices, so these
-       * can be stored with a single bit.  Even though there are only 10 EUs
-       * per subslice, this is stored in 4 bits, so there's an effective
-       * maximum value of 16 EUs.  Similarly, although there are only 7
-       * threads per EU, this is stored in a 3 bit number, giving an effective
-       * maximum value of 8 threads per EU.
-       *
-       * This means that we need to use 16 * 8 instead of 10 * 7 for the
-       * number of threads per subslice.
-       */
-      const unsigned scratch_ids_per_subslice =
-         devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads;
+      unsigned scratch_ids_per_subslice;
+      if (devinfo->gen >= 11) {
+         /* The MEDIA_VFE_STATE docs say:
+          *
+          *    "Starting with this configuration, the Maximum Number of
+          *     Threads must be set to (#EU * 8) for GPGPU dispatches.
+          *
+          *     Although there are only 7 threads per EU in the configuration,
+          *     the FFTID is calculated as if there are 8 threads per EU,
+          *     which in turn requires a larger amount of Scratch Space to be
+          *     allocated by the driver."
+          */
+         scratch_ids_per_subslice = 8 * 8;
+      } else if (devinfo->is_haswell) {
+         /* WaCSScratchSize:hsw
+          *
+          * Haswell's scratch space address calculation appears to be sparse
+          * rather than tightly packed. The Thread ID has bits indicating
+          * which subslice, EU within a subslice, and thread within an EU it
+          * is. There's a maximum of two slices and two subslices, so these
+          * can be stored with a single bit. Even though there are only 10 EUs
+          * per subslice, this is stored in 4 bits, so there's an effective
+          * maximum value of 16 EUs. Similarly, although there are only 7
+          * threads per EU, this is stored in a 3 bit number, giving an
+          * effective maximum value of 8 threads per EU.
+          *
+          * This means that we need to use 16 * 8 instead of 10 * 7 for the
+          * number of threads per subslice.
+          */
+         scratch_ids_per_subslice = 16 * 8;
+      } else if (devinfo->is_cherryview) {
+         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
+          * EU has 7 threads. The 6 EU devices appear to calculate thread
+          * IDs as if they had 8 EUs.
+          */
+         scratch_ids_per_subslice = 8 * 7;
+      } else {
+         scratch_ids_per_subslice = devinfo->max_cs_threads;
+      }
 
       thread_count = scratch_ids_per_subslice * subslices;
       break;
@@ -429,7 +526,7 @@ brw_alloc_stage_scratch(struct brw_context *brw,
 
    stage_state->scratch_bo =
       brw_bo_alloc(brw->bufmgr, "shader scratch space",
-                   per_thread_size * thread_count, 4096);
+                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
 }
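Putting the compute-shader cases above together: the scratch allocation is per_thread_size times thread_count, where thread_count is scratch_ids_per_subslice * subslices and both factors depend on the generation. A self-contained sketch of that arithmetic, using an illustrative struct rather than the real gen_device_info:

#include <stdbool.h>
#include <stdio.h>

/* Standalone sketch (not part of the patch): illustrative device info. */
struct dev_info {
   int gen;
   bool is_haswell;
   bool is_cherryview;
   unsigned num_slices;
   unsigned subslice_total;
   unsigned max_cs_threads;
};

static unsigned
cs_scratch_thread_count(const struct dev_info *d)
{
   unsigned subslices = d->subslice_total;
   if (d->gen == 11)
      subslices = 8;                     /* ICL base configuration */
   else if (d->gen >= 9)
      subslices = 4 * d->num_slices;     /* pretend 4 subslices per slice */

   unsigned ids_per_subslice;
   if (d->gen >= 11)
      ids_per_subslice = 8 * 8;          /* FFTID assumes 8 threads per EU */
   else if (d->is_haswell)
      ids_per_subslice = 16 * 8;         /* WaCSScratchSize:hsw sparse IDs */
   else if (d->is_cherryview)
      ids_per_subslice = 8 * 7;          /* 6-EU parts address as if 8 EUs */
   else
      ids_per_subslice = d->max_cs_threads;

   return ids_per_subslice * subslices;
}

int
main(void)
{
   /* e.g. a hypothetical 1-slice gen9 part with 56 threads per subslice */
   struct dev_info skl = { .gen = 9, .num_slices = 1, .subslice_total = 3,
                           .max_cs_threads = 56 };
   printf("threads = %u\n", cs_scratch_thread_count(&skl)); /* 56 * 4 = 224 */
   return 0;
}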
 
 void brwInitFragProgFuncs( struct dd_function_table *functions )
@@ -443,7 +540,7 @@ void brwInitFragProgFuncs( struct dd_function_table *functions )
    functions->LinkShader = brw_link_shader;
 
    functions->MemoryBarrier = brw_memory_barrier;
-   functions->BlendBarrier = brw_blend_barrier;
+   functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
 }
 
 struct shader_times {
@@ -458,7 +555,8 @@ brw_init_shader_time(struct brw_context *brw)
    const int max_entries = 2048;
    brw->shader_time.bo =
       brw_bo_alloc(brw->bufmgr, "shader time",
-                   max_entries * BRW_SHADER_TIME_STRIDE * 3, 4096);
+                   max_entries * BRW_SHADER_TIME_STRIDE * 3,
+                   BRW_MEMZONE_OTHER);
    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
@@ -524,6 +622,7 @@ brw_report_shader_time(struct brw_context *brw)
       case ST_GS:
       case ST_FS8:
       case ST_FS16:
+      case ST_FS32:
       case ST_CS:
          written = brw->shader_time.cumulative[i].written;
          reset = brw->shader_time.cumulative[i].reset;
@@ -552,6 +651,7 @@ brw_report_shader_time(struct brw_context *brw)
       case ST_GS:
       case ST_FS8:
       case ST_FS16:
+      case ST_FS32:
       case ST_CS:
          total_by_type[type] += scaled[i];
          break;
@@ -601,6 +701,9 @@ brw_report_shader_time(struct brw_context *brw)
       case ST_FS16:
          stage = "fs16";
          break;
+      case ST_FS32:
+         stage = "fs32";
+         break;
       case ST_CS:
          stage = "cs";
          break;
@@ -620,6 +723,7 @@ brw_report_shader_time(struct brw_context *brw)
    print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
    print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
    print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
+   print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
    print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
 }
 
@@ -717,11 +821,10 @@ brw_dump_arb_asm(const char *stage, struct gl_program *prog)
 }
 
 void
-brw_setup_tex_for_precompile(struct brw_context *brw,
+brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
                              struct brw_sampler_prog_key_data *tex,
-                             struct gl_program *prog)
+                             const struct gl_program *prog)
 {
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
    unsigned sampler_count = util_last_bit(prog->SamplersUsed);
    for (unsigned i = 0; i < sampler_count; i++) {
@@ -811,41 +914,59 @@ brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
    stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
    next_binding_table_offset += num_textures;
 
-   /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */
+   /* Set the binding table size.  Some callers may append new entries
+    * and increase this accordingly.
+    */
+   stage_prog_data->binding_table.size_bytes = next_binding_table_offset * 4;
 
    assert(next_binding_table_offset <= BRW_MAX_SURFACES);
    return next_binding_table_offset;
 }
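The binding-table layout above is a simple bump allocator: each surface class claims a contiguous run of slots starting at next_binding_table_offset, and the final size in bytes is the slot count times 4 (one 32-bit entry per surface). A reduced sketch of the pattern, with illustrative fields rather than the real brw_stage_prog_data:

#include <assert.h>
#include <stdio.h>

#define MAX_SURFACES 128

/* Standalone sketch (not part of the patch): bump allocation of slots. */
struct binding_table {
   unsigned texture_start;
   unsigned ubo_start;
   unsigned size_bytes;
};

static unsigned
assign_offsets(struct binding_table *bt,
               unsigned num_textures, unsigned num_ubos)
{
   unsigned next = 0;
   bt->texture_start = next;  next += num_textures;
   bt->ubo_start = next;      next += num_ubos;
   /* One 32-bit entry per surface; callers may append more later. */
   bt->size_bytes = next * 4;
   assert(next <= MAX_SURFACES);
   return next;
}

int
main(void)
{
   struct binding_table bt;
   unsigned used = assign_offsets(&bt, 16, 4);
   printf("slots=%u ubo_start=%u size=%u bytes\n",
          used, bt.ubo_start, bt.size_bytes);   /* 20, 16, 80 */
   return 0;
}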
 
 void
-brw_program_serialize_nir(struct gl_context *ctx, struct gl_program *prog)
+brw_populate_default_key(const struct brw_compiler *compiler,
+                         union brw_any_prog_key *prog_key,
+                         struct gl_shader_program *sh_prog,
+                         struct gl_program *prog)
 {
-   struct blob writer;
-   blob_init(&writer);
-   nir_serialize(&writer, prog->nir);
-   prog->driver_cache_blob = ralloc_size(NULL, writer.size);
-   memcpy(prog->driver_cache_blob, writer.data, writer.size);
-   prog->driver_cache_blob_size = writer.size;
-   blob_finish(&writer);
+   switch (prog->info.stage) {
+   case MESA_SHADER_VERTEX:
+      brw_vs_populate_default_key(compiler, &prog_key->vs, prog);
+      break;
+   case MESA_SHADER_TESS_CTRL:
+      brw_tcs_populate_default_key(compiler, &prog_key->tcs, sh_prog, prog);
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      brw_tes_populate_default_key(compiler, &prog_key->tes, sh_prog, prog);
+      break;
+   case MESA_SHADER_GEOMETRY:
+      brw_gs_populate_default_key(compiler, &prog_key->gs, prog);
+      break;
+   case MESA_SHADER_FRAGMENT:
+      brw_wm_populate_default_key(compiler, &prog_key->wm, prog);
+      break;
+   case MESA_SHADER_COMPUTE:
+      brw_cs_populate_default_key(compiler, &prog_key->cs, prog);
+      break;
+   default:
+      unreachable("Unsupported stage!");
+   }
 }
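A caller would typically zero a union brw_any_prog_key and let this dispatcher fill in the member for the program's stage; roughly as follows (a hedged usage fragment, not code from the driver, assuming brw, sh_prog, and prog are in scope):

/* Hypothetical usage fragment. */
union brw_any_prog_key key;
memset(&key, 0, sizeof(key));
brw_populate_default_key(brw->screen->compiler, &key, sh_prog, prog);
/* The stage-specific member (key.vs, key.wm, ...) now holds the
 * default, state-independent key used for precompiles.
 */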
 
 void
-brw_program_deserialize_nir(struct gl_context *ctx, struct gl_program *prog,
-                            gl_shader_stage stage)
+brw_debug_recompile(struct brw_context *brw,
+                    gl_shader_stage stage,
+                    unsigned api_id,
+                    struct brw_base_prog_key *key)
 {
-   if (!prog->nir) {
-      assert(prog->driver_cache_blob && prog->driver_cache_blob_size > 0);
-      const struct nir_shader_compiler_options *options =
-         ctx->Const.ShaderCompilerOptions[stage].NirOptions;
-      struct blob_reader reader;
-      blob_reader_init(&reader, prog->driver_cache_blob,
-                       prog->driver_cache_blob_size);
-      prog->nir = nir_deserialize(NULL, options, &reader);
-   }
+   const struct brw_compiler *compiler = brw->screen->compiler;
+   enum brw_cache_id cache_id = brw_stage_cache_id(stage);
 
-   if (prog->driver_cache_blob) {
-      ralloc_free(prog->driver_cache_blob);
-      prog->driver_cache_blob = NULL;
-      prog->driver_cache_blob_size = 0;
-   }
+   compiler->shader_perf_log(brw, "Recompiling %s shader for program %d\n",
+                             _mesa_shader_stage_to_string(stage), api_id);
+
+   const void *old_key =
+      brw_find_previous_compile(&brw->cache, cache_id, key->program_string_id);
+
+   brw_debug_key_recompile(compiler, brw, stage, old_key, key);
 }