nir: remove dependency on glsl

[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index 4df3f7e0c841de9199a02f9aa8432bc2c9a277a5..a2fd4411d382769c5a25012af075d1e9e349acb9 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -47,7 +47,7 @@
  #include "brw_dead_control_flow.h"
  #include "main/uniforms.h"
  #include "brw_fs_live_variables.h"
-#include "glsl/glsl_types.h"
+#include "glsl/nir/glsl_types.h"
  #include "program/sampler.h"
  
  using namespace brw;
@@ -210,7 +210,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
     inst->regs_written = regs_written;
  
     if (devinfo->gen < 7) {
-      inst->base_mrf = 13;
+      inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen);
        inst->header_size = 1;
        if (devinfo->gen == 4)
           inst->mlen = 3;
@@ -1048,11 +1048,11 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
     unsigned int array_elements;
  
     if (type->is_array()) {
-      array_elements = type->length;
+      array_elements = type->arrays_of_arrays_size();
        if (array_elements == 0) {
           fail("dereferenced array '%s' has length 0\n", name);
        }
-      type = type->fields.array;
+      type = type->without_array();
     } else {
        array_elements = 1;
     }
@@ -1401,7 +1401,7 @@ fs_visitor::calculate_urb_setup()
     int urb_next = 0;
     /* Figure out where each of the incoming setup attributes lands. */
     if (devinfo->gen >= 6) {
-      if (_mesa_bitcount_64(prog->InputsRead &
+      if (_mesa_bitcount_64(nir->info.inputs_read &
                              BRW_FS_VARYING_INPUT_MASK) <= 16) {
           /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
            * first 16 varying inputs, so we can put them wherever we want.
@@ -1413,7 +1413,7 @@ fs_visitor::calculate_urb_setup()
            * a different vertex (or geometry) shader.
            */
           for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
-            if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
+            if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                  BITFIELD64_BIT(i)) {
                 prog_data->urb_setup[i] = urb_next++;
              }
@@ -1427,7 +1427,7 @@ fs_visitor::calculate_urb_setup()
           struct brw_vue_map prev_stage_vue_map;
           brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                               key->input_slots_valid,
-                             shader_prog->SeparateShader);
+                             nir->info.separate_shader);
           int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
           assert(prev_stage_vue_map.num_slots <= first_slot + 32);
           for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
@@ -1437,7 +1437,7 @@ fs_visitor::calculate_urb_setup()
               * unused.
               */
              if (varying != BRW_VARYING_SLOT_COUNT &&
-                (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
+                (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                   BITFIELD64_BIT(varying))) {
                 prog_data->urb_setup[varying] = slot - first_slot;
              }
@@ -1470,7 +1470,7 @@ fs_visitor::calculate_urb_setup()
         *
         * See compile_sf_prog() for more info.
         */
-      if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
+      if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
           prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
     }
  
@@ -1508,21 +1508,14 @@ void
  fs_visitor::assign_vs_urb_setup()
  {
     brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
-   int grf, count, slot, channel, attr;
  
     assert(stage == MESA_SHADER_VERTEX);
-   count = _mesa_bitcount_64(vs_prog_data->inputs_read);
+   int count = _mesa_bitcount_64(vs_prog_data->inputs_read);
     if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
        count++;
  
     /* Each attribute is 4 regs. */
-   this->first_non_payload_grf += count * 4;
-
-   unsigned vue_entries =
-      MAX2(count, vs_prog_data->base.vue_map.num_slots);
-
-   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
-   vs_prog_data->base.urb_read_length = (count + 1) / 2;
+   this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
  
     assert(vs_prog_data->base.urb_read_length <= 15);
  
@@ -1530,25 +1523,10 @@ fs_visitor::assign_vs_urb_setup()
     foreach_block_and_inst(block, fs_inst, inst, cfg) {
        for (int i = 0; i < inst->sources; i++) {
           if (inst->src[i].file == ATTR) {
-
-            if (inst->src[i].reg == VERT_ATTRIB_MAX) {
-               slot = count - 1;
-            } else {
-               /* Attributes come in in a contiguous block, ordered by their
-                * gl_vert_attrib value.  That means we can compute the slot
-                * number for an attribute by masking out the enabled
-                * attributes before it and counting the bits.
-                */
-               attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
-               slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
-                                        BITFIELD64_MASK(attr));
-            }
-
-            channel = inst->src[i].reg_offset & 3;
-
-            grf = payload.num_regs +
-               prog_data->curb_read_length +
-               slot * 4 + channel;
+            int grf = payload.num_regs +
+                      prog_data->curb_read_length +
+                      inst->src[i].reg +
+                      inst->src[i].reg_offset;
  
              inst->src[i].file = HW_REG;
              inst->src[i].fixed_hw_reg =
@@ -2678,7 +2656,7 @@ fs_visitor::emit_repclear_shader()
  bool
  fs_visitor::remove_duplicate_mrf_writes()
  {
-   fs_inst *last_mrf_move[16];
+   fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
     bool progress = false;
  
     /* Need to update the MRF tracking for compressed instructions. */
@@ -2999,7 +2977,7 @@ fs_visitor::lower_uniform_pull_constant_loads()
            * else does except for register spill/unspill, which generates and
            * uses its MRF within a single IR instruction.
            */
-         inst->base_mrf = 14;
+         inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
           inst->mlen = 1;
        }
     }
@@ -4634,7 +4612,7 @@ void
  fs_visitor::setup_payload_gen6()
  {
     bool uses_depth =
-      (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
+      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
     unsigned barycentric_interp_modes =
        (stage == MESA_SHADER_FRAGMENT) ?
        ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
@@ -4693,7 +4671,7 @@ fs_visitor::setup_payload_gen6()
     }
  
     /* R32: MSAA input coverage mask */
-   if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
+   if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
        assert(devinfo->gen >= 7);
        payload.sample_mask_in_reg = payload.num_regs;
        payload.num_regs++;
@@ -4706,7 +4684,7 @@ fs_visitor::setup_payload_gen6()
     /* R34-: bary for 32-pixel. */
     /* R58-59: interp W for 32-pixel. */
  
-   if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
        source_depth_to_render_target = true;
     }
  }
@@ -4718,54 +4696,46 @@ fs_visitor::setup_vs_payload()
     payload.num_regs = 2;
  }
  
+/**
+ * We are building the local ID push constant data using the simplest possible
+ * method. We simply push the local IDs directly as they should appear in the
+ * registers for the uvec3 gl_LocalInvocationID variable.
+ *
+ * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
+ * registers worth of push constant space.
+ *
+ * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
+ * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
+ * to coordinated.
+ *
+ * FINISHME: There are a few easy optimizations to consider.
+ *
+ * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
+ *    no need for using push constant space for that dimension.
+ *
+ * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
+ *    easily use 16-bit words rather than 32-bit dwords in the push constant
+ *    data.
+ *
+ * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
+ *    conveying the data, and thereby reduce push constant usage.
+ *
+ */
  void
  fs_visitor::setup_cs_payload()
  {
     assert(devinfo->gen >= 7);
+   brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
  
     payload.num_regs = 1;
  
-   if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
-      const unsigned local_id_dwords =
-         brw_cs_prog_local_id_payload_dwords(prog, dispatch_width);
-      assert((local_id_dwords & 0x7) == 0);
-      const unsigned local_id_regs = local_id_dwords / 8;
+   if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
+      prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
        payload.local_invocation_id_reg = payload.num_regs;
-      payload.num_regs += local_id_regs;
+      payload.num_regs += prog_data->local_invocation_id_regs;
     }
  }
  
-void
-fs_visitor::assign_fs_binding_table_offsets()
-{
-   assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   uint32_t next_binding_table_offset = 0;
-
-   /* If there are no color regions, we still perform an FB write to a null
-    * renderbuffer, which we place at surface index 0.
-    */
-   prog_data->binding_table.render_target_start = next_binding_table_offset;
-   next_binding_table_offset += MAX2(key->nr_color_regions, 1);
-
-   assign_common_binding_table_offsets(next_binding_table_offset);
-}
-
-void
-fs_visitor::assign_cs_binding_table_offsets()
-{
-   assert(stage == MESA_SHADER_COMPUTE);
-   brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
-   uint32_t next_binding_table_offset = 0;
-
-   /* May not be used if the gl_NumWorkGroups variable is not accessed. */
-   prog_data->binding_table.work_groups_start = next_binding_table_offset;
-   next_binding_table_offset++;
-
-   assign_common_binding_table_offsets(next_binding_table_offset);
-}
-
  void
  fs_visitor::calculate_register_pressure()
  {
@@ -4817,8 +4787,8 @@ fs_visitor::optimize()
                                                                          \
        if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
           char filename[64];                                             \
-         snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
-                  stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
+         snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass,              \
+                  stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
                                                                          \
           backend_shader::dump_instructions(filename);                   \
        }                                                                 \
@@ -4831,9 +4801,8 @@ fs_visitor::optimize()
  
     if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
        char filename[64];
-      snprintf(filename, 64, "%s%d-%04d-00-start",
-               stage_abbrev, dispatch_width,
-               shader_prog ? shader_prog->Name : 0);
+      snprintf(filename, 64, "%s%d-%s-00-start",
+               stage_abbrev, dispatch_width, nir->info.name);
  
        backend_shader::dump_instructions(filename);
     }
@@ -4855,7 +4824,7 @@ fs_visitor::optimize()
        OPT(opt_algebraic);
        OPT(opt_cse);
        OPT(opt_copy_propagate);
-      OPT(opt_peephole_predicated_break);
+      OPT(opt_predicated_break, this);
        OPT(opt_cmod_propagation);
        OPT(dead_code_eliminate);
        OPT(opt_peephole_sel);
@@ -4979,7 +4948,6 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
  {
     assert(stage == MESA_SHADER_VERTEX);
  
-   assign_common_binding_table_offsets(0);
     setup_vs_payload();
  
     if (shader_time_index >= 0)
@@ -5018,10 +4986,6 @@ fs_visitor::run_fs(bool do_rep_send)
  
     assert(stage == MESA_SHADER_FRAGMENT);
  
-   sanity_param_count = prog->Parameters->NumParameters;
-
-   assign_fs_binding_table_offsets();
-
     if (devinfo->gen >= 6)
        setup_payload_gen6();
     else
@@ -5037,7 +5001,7 @@ fs_visitor::run_fs(bool do_rep_send)
           emit_shader_time_begin();
  
        calculate_urb_setup();
-      if (prog->InputsRead > 0) {
+      if (nir->info.inputs_read > 0) {
           if (devinfo->gen < 6)
              emit_interpolation_setup_gen4();
           else
@@ -5090,13 +5054,6 @@ fs_visitor::run_fs(bool do_rep_send)
     else
        wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
  
-   /* If any state parameters were appended, then ParameterValues could have
-    * been realloced, in which case the driver uniform storage set up by
-    * _mesa_associate_uniform_storage() would point to freed memory.  Make
-    * sure that didn't happen.
-    */
-   assert(sanity_param_count == prog->Parameters->NumParameters);
-
     return !failed;
  }
  
@@ -5104,11 +5061,6 @@ bool
  fs_visitor::run_cs()
  {
     assert(stage == MESA_SHADER_COMPUTE);
-   assert(shader);
-
-   sanity_param_count = prog->Parameters->NumParameters;
-
-   assign_cs_binding_table_offsets();
  
     setup_cs_payload();
  
@@ -5137,13 +5089,6 @@ fs_visitor::run_cs()
     if (failed)
        return false;
  
-   /* If any state parameters were appended, then ParameterValues could have
-    * been realloced, in which case the driver uniform storage set up by
-    * _mesa_associate_uniform_storage() would point to freed memory.  Make
-    * sure that didn't happen.
-    */
-   assert(sanity_param_count == prog->Parameters->NumParameters);
-
     return !failed;
  }
  
@@ -5154,26 +5099,13 @@ brw_wm_fs_emit(struct brw_context *brw,
                 struct brw_wm_prog_data *prog_data,
                 struct gl_fragment_program *fp,
                 struct gl_shader_program *prog,
+               int shader_time_index8, int shader_time_index16,
                 unsigned *final_assembly_size)
  {
-   struct brw_shader *shader = NULL;
-   if (prog)
-      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
-
-   if (unlikely(INTEL_DEBUG & DEBUG_WM))
-      brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
-
-   int st_index8 = -1, st_index16 = -1;
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
-      st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
-      st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
-   }
-
     /* Now the main event: Visit the shader IR and generate our FS IR for it.
      */
-   fs_visitor v(brw->intelScreen->compiler, brw,
-                mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
-                prog, &fp->Base, 8, st_index8);
+   fs_visitor v(brw->intelScreen->compiler, brw, mem_ctx, key,
+                &prog_data->base, &fp->Base, fp->Base.nir, 8, shader_time_index8);
     if (!v.run_fs(false /* do_rep_send */)) {
        if (prog) {
           prog->LinkStatus = false;
@@ -5187,9 +5119,8 @@ brw_wm_fs_emit(struct brw_context *brw,
     }
  
     cfg_t *simd16_cfg = NULL;
-   fs_visitor v2(brw->intelScreen->compiler, brw,
-                 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
-                 prog, &fp->Base, 16, st_index16);
+   fs_visitor v2(brw->intelScreen->compiler, brw, mem_ctx, key,
+                 &prog_data->base, &fp->Base, fp->Base.nir, 16, shader_time_index16);
     if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
        if (!v.simd16_unsupported) {
           /* Try a SIMD16 compile */
@@ -5236,6 +5167,42 @@ brw_wm_fs_emit(struct brw_context *brw,
     return g.get_assembly(final_assembly_size);
  }
  
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
+                             void *buffer, uint32_t threads, uint32_t stride)
+{
+   if (prog_data->local_invocation_id_regs == 0)
+      return;
+
+   /* 'stride' should be an integer number of registers, that is, a multiple
+    * of 32 bytes.
+    */
+   assert(stride % 32 == 0);
+
+   unsigned x = 0, y = 0, z = 0;
+   for (unsigned t = 0; t < threads; t++) {
+      uint32_t *param = (uint32_t *) buffer + stride * t / 4;
+
+      for (unsigned i = 0; i < prog_data->simd_size; i++) {
+         param[0 * prog_data->simd_size + i] = x;
+         param[1 * prog_data->simd_size + i] = y;
+         param[2 * prog_data->simd_size + i] = z;
+
+         x++;
+         if (x == prog_data->local_size[0]) {
+            x = 0;
+            y++;
+            if (y == prog_data->local_size[1]) {
+               y = 0;
+               z++;
+               if (z == prog_data->local_size[2])
+                  z = 0;
+            }
+         }
+      }
+   }
+}
+
  fs_reg *
  fs_visitor::emit_cs_local_invocation_id_setup()
  {
@@ -5280,45 +5247,35 @@ brw_cs_emit(struct brw_context *brw,
              struct brw_cs_prog_data *prog_data,
              struct gl_compute_program *cp,
              struct gl_shader_program *prog,
+            int shader_time_index,
              unsigned *final_assembly_size)
  {
-   struct brw_shader *shader =
-      (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE];
-
-   if (unlikely(INTEL_DEBUG & DEBUG_CS))
-      brw_dump_ir("compute", prog, &shader->base, &cp->Base);
-
     prog_data->local_size[0] = cp->LocalSize[0];
     prog_data->local_size[1] = cp->LocalSize[1];
     prog_data->local_size[2] = cp->LocalSize[2];
     unsigned local_workgroup_size =
        cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2];
+   unsigned max_cs_threads = brw->intelScreen->compiler->devinfo->max_cs_threads;
  
     cfg_t *cfg = NULL;
     const char *fail_msg = NULL;
  
-   int st_index = -1;
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
-      st_index = brw_get_shader_time_index(brw, prog, &cp->Base, ST_CS);
-
     /* Now the main event: Visit the shader IR and generate our CS IR for it.
      */
-   fs_visitor v8(brw->intelScreen->compiler, brw,
-                 mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
-                 &cp->Base, 8, st_index);
+   fs_visitor v8(brw->intelScreen->compiler, brw, mem_ctx, key,
+                 &prog_data->base, &cp->Base, cp->Base.nir, 8, shader_time_index);
     if (!v8.run_cs()) {
        fail_msg = v8.fail_msg;
-   } else if (local_workgroup_size <= 8 * brw->max_cs_threads) {
+   } else if (local_workgroup_size <= 8 * max_cs_threads) {
        cfg = v8.cfg;
        prog_data->simd_size = 8;
     }
  
-   fs_visitor v16(brw->intelScreen->compiler, brw,
-                  mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
-                  &cp->Base, 16, st_index);
+   fs_visitor v16(brw->intelScreen->compiler, brw, mem_ctx, key,
+                  &prog_data->base, &cp->Base, cp->Base.nir, 16, shader_time_index);
     if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
         !fail_msg && !v8.simd16_unsupported &&
-       local_workgroup_size <= 16 * brw->max_cs_threads) {
+       local_workgroup_size <= 16 * max_cs_threads) {
        /* Try a SIMD16 compile */
        v16.import_uniforms(&v8);
        if (!v16.run_cs()) {