Merge remote-tracking branch 'public/master' into vulkan
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
index 6c9ba36a696ce4fb27285387ed547dbf0b1e2488..3f307f4ef70cb38b70b44ac6d4c80a1f5a4cf3f7 100644 (file)
@@ -174,7 +174,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
     * CSE can later notice that those loads are all the same and eliminate
     * the redundant ones.
     */
-   fs_reg vec4_offset = vgrf(glsl_type::int_type);
+   fs_reg vec4_offset = vgrf(glsl_type::uint_type);
    bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
 
    int scale = 1;
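
A minimal standalone sketch of the alignment trick above (hypothetical helper, not driver code): the constant offset is split into a 16-byte-aligned vec4 base, which neighboring loads share so CSE can merge them, plus a component index within that vec4.

    #include <cstdint>
    #include <utility>

    /* Split a byte offset into a vec4-aligned base and a component index. */
    static std::pair<uint32_t, uint32_t>
    split_vec4_offset(uint32_t const_offset)
    {
       uint32_t base = const_offset & ~0xfu;      /* shared, 16-byte aligned */
       uint32_t comp = (const_offset & 0xfu) / 4; /* float slot within vec4  */
       return {base, comp};
    }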
@@ -433,7 +433,6 @@ fs_reg::fs_reg(struct ::brw_reg reg) :
 {
    this->reg_offset = 0;
    this->subreg_offset = 0;
-   this->reladdr = NULL;
    this->stride = 1;
    if (this->file == IMM &&
        (this->type != BRW_REGISTER_TYPE_V &&
@@ -448,7 +447,6 @@ fs_reg::equals(const fs_reg &r) const
 {
    return (this->backend_reg::equals(r) &&
            subreg_offset == r.subreg_offset &&
-           !reladdr && !r.reladdr &&
            stride == r.stride);
 }
 
@@ -853,7 +851,10 @@ fs_inst::regs_read(int arg) const
          assert(src[2].file == IMM);
          unsigned region_length = src[2].ud;
 
-         if (src[0].file == FIXED_GRF) {
+         if (src[0].file == UNIFORM) {
+            assert(region_length % 4 == 0);
+            return region_length / 4;
+         } else if (src[0].file == FIXED_GRF) {
             /* If the start of the region is not register aligned, then
              * there's some portion of the register that's technically
              * unread at the beginning.
@@ -867,7 +868,7 @@ fs_inst::regs_read(int arg) const
              * unread portion at the beginning.
              */
             if (src[0].subnr)
-               region_length += src[0].subnr * type_sz(src[0].type);
+               region_length += src[0].subnr;
 
             return DIV_ROUND_UP(region_length, REG_SIZE);
          } else {
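
The regs_read() arithmetic above reduces to the following hedged, self-contained model (hypothetical names): UNIFORM sources are counted in 4-byte slots, while fixed GRFs are counted in whole 32-byte registers, with any unaligned start padded in.

    #include <cassert>

    #define REG_SIZE 32
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    static unsigned
    indirect_regs_read(bool is_uniform, unsigned region_length, unsigned subnr)
    {
       if (is_uniform) {
          assert(region_length % 4 == 0);
          return region_length / 4;        /* one slot per 4-byte component */
       }
       /* The technically-unread bytes before an unaligned start still count. */
       return DIV_ROUND_UP(region_length + subnr, REG_SIZE);
    }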
@@ -1023,7 +1024,6 @@ fs_visitor::import_uniforms(fs_visitor *v)
    this->push_constant_loc = v->push_constant_loc;
    this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
-   this->param_size = v->param_size;
 }
 
 fs_reg *
@@ -1926,31 +1926,30 @@ fs_visitor::compact_virtual_grfs()
  * maximum number of fragment shader uniform components (64).  If
  * there are too many of these, they'd fill up all of register space.
  * So, this will push some of them out to the pull constant buffer and
- * update the program to load them.  We also use pull constants for all
- * indirect constant loads because we don't support indirect accesses in
- * registers yet.
+ * update the program to load them.
  */
 void
 fs_visitor::assign_constant_locations()
 {
-   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
-   if (dispatch_width != 8)
+   /* Only the first compile gets to decide on locations. */
+   if (dispatch_width != min_dispatch_width)
       return;
 
-   unsigned int num_pull_constants = 0;
-
-   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-   memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
-
    bool is_live[uniforms];
    memset(is_live, 0, sizeof(is_live));
 
+   /* For each uniform slot, a value of true indicates that the given slot and
+    * the next slot must remain contiguous.  This is used to keep us from
+    * splitting arrays apart.
+    */
+   bool contiguous[uniforms];
+   memset(contiguous, 0, sizeof(contiguous));
+
    /* First, we walk through the instructions and do two things:
     *
     *  1) Figure out which uniforms are live.
     *
-    *  2) Find all indirect access of uniform arrays and flag them as needing
-    *     to go into the pull constant buffer.
+    *  2) Mark any indirectly used ranges of registers as contiguous.
     *
     * Note that we don't move constant-indexed accesses to arrays.  No
     * testing has been done of the performance impact of this choice.
@@ -1960,20 +1959,19 @@ fs_visitor::assign_constant_locations()
          if (inst->src[i].file != UNIFORM)
             continue;
 
-         if (inst->src[i].reladdr) {
-            int uniform = inst->src[i].nr;
+         int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
 
-            /* If this array isn't already present in the pull constant buffer,
-             * add it.
-             */
-            if (pull_constant_loc[uniform] == -1) {
-               assert(param_size[uniform]);
-               for (int j = 0; j < param_size[uniform]; j++)
-                  pull_constant_loc[uniform + j] = num_pull_constants++;
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+            assert(inst->src[2].ud % 4 == 0);
+            unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
+            assert(last < uniforms);
+
+            for (unsigned j = constant_nr; j < last; j++) {
+               is_live[j] = true;
+               contiguous[j] = true;
             }
+            is_live[last] = true;
          } else {
-            /* Mark the the one accessed uniform as live */
-            int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
             if (constant_nr >= 0 && constant_nr < (int) uniforms)
                is_live[constant_nr] = true;
          }
@@ -1988,29 +1986,48 @@ fs_visitor::assign_constant_locations()
     * If changing this value, note the limitation about total_regs in
     * brw_curbe.c.
     */
-   unsigned int max_push_components = 16 * 8;
+   const unsigned int max_push_components = 16 * 8;
+
+   /* For Vulkan we cap max_chunk_size at 32 floats = 128 bytes, the maximum
+    * Vulkan push constant size.
+    */
+   const unsigned int max_chunk_size = 32;
+
    unsigned int num_push_constants = 0;
+   unsigned int num_pull_constants = 0;
 
    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
 
-   for (unsigned int i = 0; i < uniforms; i++) {
-      if (!is_live[i] || pull_constant_loc[i] != -1) {
-         /* This UNIFORM register is either dead, or has already been demoted
-          * to a pull const.  Mark it as no longer living in the param[] array.
-          */
-         push_constant_loc[i] = -1;
+   int chunk_start = -1;
+   for (unsigned u = 0; u < uniforms; u++) {
+      push_constant_loc[u] = -1;
+      pull_constant_loc[u] = -1;
+
+      if (!is_live[u])
          continue;
-      }
 
-      if (num_push_constants < max_push_components) {
-         /* Retain as a push constant.  Record the location in the params[]
-          * array.
-          */
-         push_constant_loc[i] = num_push_constants++;
-      } else {
-         /* Demote to a pull constant. */
-         push_constant_loc[i] = -1;
-         pull_constant_loc[i] = num_pull_constants++;
+      /* This is the first live uniform in the chunk */
+      if (chunk_start < 0)
+         chunk_start = u;
+
+      /* If this element does not need to be contiguous with the next, we
+       * split at this point and everything between chunk_start and u forms a
+       * single chunk.
+       */
+      if (!contiguous[u]) {
+         unsigned chunk_size = u - chunk_start + 1;
+
+         if (num_push_constants + chunk_size <= max_push_components &&
+             chunk_size <= max_chunk_size) {
+            for (unsigned j = chunk_start; j <= u; j++)
+               push_constant_loc[j] = num_push_constants++;
+         } else {
+            for (unsigned j = chunk_start; j <= u; j++)
+               pull_constant_loc[j] = num_pull_constants++;
+         }
+
+         chunk_start = -1;
       }
    }
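
Read in isolation, the chunking loop above amounts to this self-contained sketch (hypothetical types, no driver dependencies): live uniforms accumulate into a chunk until a slot that need not stay contiguous with the next closes it, and each whole chunk is then pushed if it fits or pulled otherwise.

    #include <vector>

    struct locations {
       std::vector<int> push, pull;        /* -1 = not assigned */
    };

    static locations
    assign_chunks(const std::vector<bool> &is_live,
                  const std::vector<bool> &contiguous,
                  unsigned max_push_components, unsigned max_chunk_size)
    {
       const unsigned n = is_live.size();
       locations loc = { std::vector<int>(n, -1), std::vector<int>(n, -1) };
       unsigned num_push = 0, num_pull = 0;
       int chunk_start = -1;

       for (unsigned u = 0; u < n; u++) {
          if (!is_live[u])
             continue;

          if (chunk_start < 0)
             chunk_start = u;              /* first live slot of this chunk */

          if (!contiguous[u]) {            /* chunk is [chunk_start, u] */
             unsigned chunk_size = u - chunk_start + 1;
             bool push = num_push + chunk_size <= max_push_components &&
                         chunk_size <= max_chunk_size;

             for (unsigned j = chunk_start; j <= u; j++) {
                if (push)
                   loc.push[j] = num_push++;
                else
                   loc.pull[j] = num_pull++;
             }
             chunk_start = -1;
          }
       }
       return loc;
    }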
 
@@ -2041,51 +2058,67 @@ fs_visitor::assign_constant_locations()
  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
  */
 void
-fs_visitor::demote_pull_constants()
+fs_visitor::lower_constant_loads()
 {
-   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+   const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      /* Set up the annotation tracking for newly generated instructions. */
+      const fs_builder ibld(this, block, inst);
+
       for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;
 
-         int pull_index;
+         /* We'll handle this case later */
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
+            continue;
+
          unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
-         if (location >= uniforms) /* Out of bounds access */
-            pull_index = -1;
-         else
-            pull_index = pull_constant_loc[location];
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
 
          if (pull_index == -1)
            continue;
 
-         /* Set up the annotation tracking for new generated instructions. */
-         const fs_builder ibld(this, block, inst);
-         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
-         fs_reg dst = vgrf(glsl_type::float_type);
-
          assert(inst->src[i].stride == 0);
 
-         /* Generate a pull load into dst. */
-         if (inst->src[i].reladdr) {
-            VARYING_PULL_CONSTANT_LOAD(ibld, dst,
-                                       brw_imm_ud(index),
-                                       *inst->src[i].reladdr,
-                                       pull_index * 4);
-            inst->src[i].reladdr = NULL;
-            inst->src[i].stride = 1;
-         } else {
-            const fs_builder ubld = ibld.exec_all().group(8, 0);
-            struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
-            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                      dst, brw_imm_ud(index), offset);
-            inst->src[i].set_smear(pull_index & 3);
-         }
-         brw_mark_surface_used(prog_data, index);
+         fs_reg dst = vgrf(glsl_type::float_type);
+         const fs_builder ubld = ibld.exec_all().group(8, 0);
+         struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                   dst, brw_imm_ud(index), offset);
 
          /* Rewrite the instruction to use the temporary VGRF. */
          inst->src[i].file = VGRF;
          inst->src[i].nr = dst.nr;
          inst->src[i].reg_offset = 0;
+         inst->src[i].set_smear(pull_index & 3);
+
+         brw_mark_surface_used(prog_data, index);
+      }
+
+      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+          inst->src[0].file == UNIFORM) {
+
+         unsigned location = inst->src[0].nr + inst->src[0].reg_offset;
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
+
+         if (pull_index == -1)
+            continue;
+
+         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
+                                    brw_imm_ud(index),
+                                    inst->src[1],
+                                    pull_index * 4);
+         inst->remove(block);
+
+         brw_mark_surface_used(prog_data, index);
       }
    }
    invalidate_live_intervals();
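
The offset math in the uniform case above can be modeled in isolation (hypothetical helper): a pull index names one 4-byte slot, the message fetches the enclosing 16-byte block, and set_smear() then selects that slot's component within it.

    #include <cstdint>

    struct pull_addr {
       uint32_t block_offset;   /* 16-byte-aligned byte offset of the load */
       uint32_t component;      /* slot to smear within the loaded block   */
    };

    static pull_addr
    pull_constant_address(int pull_index)
    {
       return { (uint32_t)(pull_index * 4) & ~15u, (uint32_t)(pull_index & 3) };
    }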
@@ -2274,17 +2307,6 @@ fs_visitor::opt_algebraic()
             progress = true;
          }
          break;
-      case SHADER_OPCODE_RCP: {
-         fs_inst *prev = (fs_inst *)inst->prev;
-         if (prev->opcode == SHADER_OPCODE_SQRT) {
-            if (inst->src[0].equals(prev->dst)) {
-               inst->opcode = SHADER_OPCODE_RSQ;
-               inst->src[0] = prev->src[0];
-               progress = true;
-            }
-         }
-         break;
-      }
       case SHADER_OPCODE_BROADCAST:
          if (is_uniform(inst->src[0])) {
             inst->opcode = BRW_OPCODE_MOV;
@@ -2441,8 +2463,10 @@ fs_visitor::opt_sampler_eot()
     * we have enough space, but it will make sure the dead code eliminator kills
     * the instruction that this will replace.
     */
-   if (tex_inst->header_size != 0)
+   if (tex_inst->header_size != 0) {
+      invalidate_live_intervals();
       return true;
+   }
 
    fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
                                   load_payload->sources + 1);
@@ -2473,6 +2497,7 @@ fs_visitor::opt_sampler_eot()
    tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
    tex_inst->src[0] = send_header;
 
+   invalidate_live_intervals();
    return true;
 }
 
@@ -2795,10 +2820,21 @@ fs_visitor::emit_repclear_shader()
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
    int base_mrf = 1;
    int color_mrf = base_mrf + 2;
+   fs_inst *mov;
+
+   if (uniforms > 0) {
+      mov = bld.exec_all().group(4, 0)
+               .MOV(brw_message_reg(color_mrf),
+                    fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   } else {
+      struct brw_reg reg =
+         brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+                 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
+                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 
-   fs_inst *mov = bld.exec_all().group(4, 0)
-                     .MOV(brw_message_reg(color_mrf),
-                          fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+      mov = bld.exec_all().group(4, 0)
+               .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
+   }
 
    fs_inst *write;
    if (key->nr_color_regions == 1) {
@@ -2827,8 +2863,10 @@ fs_visitor::emit_repclear_shader()
    assign_curb_setup();
 
    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
-   assert(mov->src[0].file == FIXED_GRF);
-   mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   if (uniforms > 0) {
+      assert(mov->src[0].file == FIXED_GRF);
+      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   }
 }
 
 /**
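
As a reminder of how the <8;2,4> region constructed in the first repclear hunk reads its data, Gen regions address element i of the source as subnr + (i / width) * vstride + (i % width) * hstride, in units of the register type. A hedged model:

    /* Which element (in type-sized units) does channel i of a region read? */
    static unsigned
    region_element(unsigned subnr, unsigned vstride, unsigned width,
                   unsigned hstride, unsigned i)
    {
       return subnr + (i / width) * vstride + (i % width) * hstride;
    }

Under this model, the 4-wide read of the <8;2,4> region at g2.3 touches float elements 3, 7, 11 and 15.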
@@ -4475,6 +4513,10 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
       return 8;
 
+   case SHADER_OPCODE_MOV_INDIRECT:
+      /* Prior to Broadwell, we only have 8 address subregisters */
+      return devinfo->gen < 8 ? 8 : inst->exec_size;
+
    default:
       return inst->exec_size;
    }
@@ -4757,9 +4799,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
          break;
       case UNIFORM:
          fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
-         if (inst->src[i].reladdr) {
-            fprintf(file, "+reladdr");
-         } else if (inst->src[i].subreg_offset) {
+         if (inst->src[i].subreg_offset) {
             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                     inst->src[i].subreg_offset);
          }
@@ -4870,7 +4910,6 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start,
 {
    if (end == start ||
        end->is_partial_write() ||
-       reg.reladdr ||
        !reg.equals(end->dst)) {
       return NULL;
    } else {
@@ -5088,7 +5127,7 @@ fs_visitor::optimize()
    bld = fs_builder(this, 64);
 
    assign_constant_locations();
-   demote_pull_constants();
+   lower_constant_loads();
 
    validate();
 
@@ -5187,12 +5226,18 @@ fs_visitor::optimize()
 void
 fs_visitor::fixup_3src_null_dest()
 {
+   bool progress = false;
+
    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
       if (inst->is_3src() && inst->dst.is_null()) {
          inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                             inst->dst.type);
+         progress = true;
       }
    }
+
+   if (progress)
+      invalidate_live_intervals();
 }
 
 void
@@ -5228,7 +5273,7 @@ fs_visitor::allocate_registers()
        * SIMD8.  There's probably actually some intermediate point where
        * SIMD16 with a couple of spills is still better.
        */
-      if (dispatch_width == 16) {
+      if (dispatch_width == 16 && min_dispatch_width <= 8) {
          fail("Failure to register allocate.  Reduce number of "
               "live scalar values to avoid this.");
       } else {
@@ -5470,6 +5515,13 @@ fs_visitor::run_cs()
    if (shader_time_index >= 0)
       emit_shader_time_begin();
 
+   if (devinfo->is_haswell && prog_data->total_shared > 0) {
+      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
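+      /* Both operands end up retyped to UW: word 1 of g0.0 covers bits
+       * 31:16, so bits 27:24 land in bits 11:8 of the low word of sr0.1.
+       */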
+      const fs_builder abld = bld.exec_all().group(1, 0);
+      abld.MOV(retype(suboffset(brw_sr0_reg(), 1), BRW_REGISTER_TYPE_UW),
+               suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
+   }
+
    emit_nir_code();
 
    if (failed)
@@ -5560,6 +5612,31 @@ brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
    return barycentric_interp_modes;
 }
 
+static void
+brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
+                        bool shade_model_flat, const nir_shader *shader)
+{
+   prog_data->flat_inputs = 0;
+
+   nir_foreach_variable(var, &shader->inputs) {
+      enum glsl_interp_qualifier interp_qualifier =
+         (enum glsl_interp_qualifier)var->data.interpolation;
+      bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
+                         (var->data.location == VARYING_SLOT_COL1);
+
+      int input_index = prog_data->urb_setup[var->data.location];
+
+      if (input_index < 0)
+        continue;
+
+      /* flat shading */
+      if (interp_qualifier == INTERP_QUALIFIER_FLAT ||
+          (shade_model_flat && is_gl_Color &&
+           interp_qualifier == INTERP_QUALIFIER_NONE))
+         prog_data->flat_inputs |= (1 << input_index);
+   }
+}
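+/* (Worked example: a flat-qualified input at urb slot 5 plus flat-shaded
+ * gl_Color at slot 2 yields flat_inputs == (1 << 2) | (1 << 5) == 0x24.)
+ */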
+
 static uint8_t
 computed_depth_mode(const nir_shader *shader)
 {
@@ -5594,7 +5671,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
    shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                       true);
-   shader = brw_nir_lower_io(shader, compiler->devinfo, true, false, NULL);
+   brw_nir_lower_fs_inputs(shader);
+   brw_nir_lower_fs_outputs(shader);
    shader = brw_postprocess_nir(shader, compiler->devinfo, true);
 
    /* key->alpha_test_func means simulating alpha testing via discards,
@@ -5643,6 +5721,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
       }
    }
 
+   /* We have to compute the flat inputs after the visitor is finished running
+    * because it relies on prog_data->urb_setup, which is computed in
+    * fs_visitor::calculate_urb_setup().
+    */
+   brw_compute_flat_inputs(prog_data, key->flat_shade, shader);
+
    cfg_t *simd8_cfg;
    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
    if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
@@ -5722,6 +5806,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
    shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                       true);
+   brw_nir_lower_cs_shared(shader);
+   prog_data->base.total_shared += shader->num_shared;
    shader = brw_postprocess_nir(shader, compiler->devinfo, true);
 
    prog_data->local_size[0] = shader->info.cs.local_size[0];
@@ -5732,6 +5818,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
       shader->info.cs.local_size[2];
 
    unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
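+   /* Each hardware thread runs simd_size invocations of the workgroup, so
+    * the smallest usable dispatch width is a ceiling division: e.g. a
+    * 1024-invocation workgroup with 64 threads needs
+    * DIV_ROUND_UP(1024, 64) = 16-wide dispatch, and the SIMD8 compile below
+    * is skipped.
+    */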
+   unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
 
    cfg_t *cfg = NULL;
    const char *fail_msg = NULL;
@@ -5741,11 +5828,13 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
    fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
                  NULL, /* Never used in core profile */
                  shader, 8, shader_time_index);
-   if (!v8.run_cs()) {
-      fail_msg = v8.fail_msg;
-   } else if (local_workgroup_size <= 8 * max_cs_threads) {
-      cfg = v8.cfg;
-      prog_data->simd_size = 8;
+   if (simd_required <= 8) {
+      if (!v8.run_cs()) {
+         fail_msg = v8.fail_msg;
+      } else {
+         cfg = v8.cfg;
+         prog_data->simd_size = 8;
+      }
    }
 
    fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
@@ -5755,7 +5844,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
        !fail_msg && !v8.simd16_unsupported &&
        local_workgroup_size <= 16 * max_cs_threads) {
       /* Try a SIMD16 compile */
-      v16.import_uniforms(&v8);
+      if (simd_required <= 8)
+         v16.import_uniforms(&v8);
       if (!v16.run_cs()) {
          compiler->shader_perf_log(log_data,
                                    "SIMD16 shader failed to compile: %s",