Merge remote-tracking branch 'public/master' into vulkan

[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index b7ea1bf8d9367c5a2aff42963b9150b3a08d088f..3f307f4ef70cb38b70b44ac6d4c80a1f5a4cf3f7 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -738,18 +738,20 @@ fs_inst::components_read(unsigned i) const
     case SHADER_OPCODE_LOD_LOGICAL:
     case SHADER_OPCODE_TG4_LOGICAL:
     case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
-      assert(src[9].file == IMM && src[10].file == IMM);
+      assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
+             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
        /* Texture coordinates. */
-      if (i == 0)
-         return src[9].ud;
+      if (i == TEX_LOGICAL_SRC_COORDINATE)
+         return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
        /* Texture derivatives. */
-      else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
-         return src[10].ud;
+      else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
+               opcode == SHADER_OPCODE_TXD_LOGICAL)
+         return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
        /* Texture offset. */
-      else if (i == 8)
+      else if (i == TEX_LOGICAL_SRC_OFFSET_VALUE)
           return 2;
        /* MCS */
-      else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
+      else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
           return 2;
        else
           return 1;
@@ -1929,8 +1931,8 @@ fs_visitor::compact_virtual_grfs()
  void
  fs_visitor::assign_constant_locations()
  {
-   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
-   if (dispatch_width != 8)
+   /* Only the first compile gets to decide on locations. */
+   if (dispatch_width != min_dispatch_width)
        return;
  
     bool is_live[uniforms];
@@ -2305,17 +2307,6 @@ fs_visitor::opt_algebraic()
              progress = true;
           }
           break;
-      case SHADER_OPCODE_RCP: {
-         fs_inst *prev = (fs_inst *)inst->prev;
-         if (prev->opcode == SHADER_OPCODE_SQRT) {
-            if (inst->src[0].equals(prev->dst)) {
-               inst->opcode = SHADER_OPCODE_RSQ;
-               inst->src[0] = prev->src[0];
-               progress = true;
-            }
-         }
-         break;
-      }
        case SHADER_OPCODE_BROADCAST:
           if (is_uniform(inst->src[0])) {
              inst->opcode = BRW_OPCODE_MOV;
@@ -2472,8 +2463,10 @@ fs_visitor::opt_sampler_eot()
      * we have enough space, but it will make sure the dead code eliminator kills
      * the instruction that this will replace.
      */
-   if (tex_inst->header_size != 0)
+   if (tex_inst->header_size != 0) {
+      invalidate_live_intervals();
        return true;
+   }
  
     fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
                                    load_payload->sources + 1);
@@ -2504,6 +2497,7 @@ fs_visitor::opt_sampler_eot()
     tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
     tex_inst->src[0] = send_header;
  
+   invalidate_live_intervals();
     return true;
  }
  
@@ -2828,17 +2822,15 @@ fs_visitor::emit_repclear_shader()
     int color_mrf = base_mrf + 2;
     fs_inst *mov;
  
-   if (uniforms == 1) {
+   if (uniforms > 0) {
        mov = bld.exec_all().group(4, 0)
                 .MOV(brw_message_reg(color_mrf),
                      fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
     } else {
        struct brw_reg reg =
-         brw_reg(BRW_GENERAL_REGISTER_FILE,
-                 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
-                 BRW_VERTICAL_STRIDE_8,
-                 BRW_WIDTH_2,
-                 BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+         brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+                 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
+                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
  
        mov = bld.exec_all().group(4, 0)
                 .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
@@ -2871,7 +2863,7 @@ fs_visitor::emit_repclear_shader()
     assign_curb_setup();
  
     /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
-   if (uniforms == 1) {
+   if (uniforms > 0) {
        assert(mov->src[0].file == FIXED_GRF);
        mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
     }
@@ -3522,6 +3514,36 @@ fs_visitor::lower_integer_multiplication()
     return progress;
  }
  
+bool
+fs_visitor::lower_minmax()
+{
+   assert(devinfo->gen < 6);
+
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const fs_builder ibld(this, block, inst);
+
+      if (inst->opcode == BRW_OPCODE_SEL &&
+          inst->predicate == BRW_PREDICATE_NONE) {
+         /* Split into CMP + predicated SEL.  FIXME: Using CMP doesn't preserve
+          * the NaN propagation semantics of the original SEL.L/GE instruction.
+          */
+         ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
+                  inst->conditional_mod);
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+         progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
  static void
  setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                      fs_reg *dst, fs_reg color, unsigned components)
@@ -4136,18 +4158,19 @@ static void
  lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
  {
     const brw_device_info *devinfo = bld.shader->devinfo;
-   const fs_reg &coordinate = inst->src[0];
-   const fs_reg &shadow_c = inst->src[1];
-   const fs_reg &lod = inst->src[2];
-   const fs_reg &lod2 = inst->src[3];
-   const fs_reg &sample_index = inst->src[4];
-   const fs_reg &mcs = inst->src[5];
-   const fs_reg &surface = inst->src[6];
-   const fs_reg &sampler = inst->src[7];
-   const fs_reg &offset_value = inst->src[8];
-   assert(inst->src[9].file == IMM && inst->src[10].file == IMM);
-   const unsigned coord_components = inst->src[9].ud;
-   const unsigned grad_components = inst->src[10].ud;
+   const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
+   const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
+   const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
+   const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
+   const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
+   const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
+   const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
+   const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
+   const fs_reg &offset_value = inst->src[TEX_LOGICAL_SRC_OFFSET_VALUE];
+   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
+   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
+   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
+   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
  
     if (devinfo->gen >= 7) {
        lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
@@ -4442,7 +4465,7 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
  
     case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
        /* gather4_po_c is unsupported in SIMD16 mode. */
-      const fs_reg &shadow_c = inst->src[1];
+      const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
        return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
     }
     case SHADER_OPCODE_TXL_LOGICAL:
@@ -4451,7 +4474,7 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
         * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
         * mode because the message exceeds the maximum length of 11.
         */
-      const fs_reg &shadow_c = inst->src[1];
+      const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
        if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
           return 16;
        else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
@@ -4474,7 +4497,8 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
         * circumstances it can end up with a message that is too long in SIMD16
         * mode.
         */
-      const unsigned coord_components = inst->src[8].ud;
+      const unsigned coord_components =
+         inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
        /* First three arguments are the sample index and the two arguments for
         * the MCS data.
         */
@@ -4786,7 +4810,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
        case IMM:
           switch (inst->src[i].type) {
           case BRW_REGISTER_TYPE_F:
-            fprintf(file, "%ff", inst->src[i].f);
+            fprintf(file, "%-gf", inst->src[i].f);
              break;
           case BRW_REGISTER_TYPE_W:
           case BRW_REGISTER_TYPE_D:
@@ -4894,10 +4918,12 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start,
  }
  
  void
-fs_visitor::setup_payload_gen6()
+fs_visitor::setup_fs_payload_gen6()
  {
-   bool uses_depth =
-      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   assert(stage == MESA_SHADER_FRAGMENT);
+   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+
     unsigned barycentric_interp_modes =
        (stage == MESA_SHADER_FRAGMENT) ?
        ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
@@ -4926,7 +4952,9 @@ fs_visitor::setup_payload_gen6()
     }
  
     /* R27: interpolated depth if uses source depth */
-   if (uses_depth) {
+   prog_data->uses_src_depth =
+      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   if (prog_data->uses_src_depth) {
        payload.source_depth_reg = payload.num_regs;
        payload.num_regs++;
        if (dispatch_width == 16) {
@@ -4934,8 +4962,11 @@ fs_visitor::setup_payload_gen6()
           payload.num_regs++;
        }
     }
+
     /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
-   if (uses_depth) {
+   prog_data->uses_src_w =
+      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   if (prog_data->uses_src_w) {
        payload.source_w_reg = payload.num_regs;
        payload.num_regs++;
        if (dispatch_width == 16) {
@@ -4944,19 +4975,17 @@ fs_visitor::setup_payload_gen6()
        }
     }
  
-   if (stage == MESA_SHADER_FRAGMENT) {
-      brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-      brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-      prog_data->uses_pos_offset = key->compute_pos_offset;
-      /* R31: MSAA position offsets. */
-      if (prog_data->uses_pos_offset) {
-         payload.sample_pos_reg = payload.num_regs;
-         payload.num_regs++;
-      }
+   prog_data->uses_pos_offset = key->compute_pos_offset;
+   /* R31: MSAA position offsets. */
+   if (prog_data->uses_pos_offset) {
+      payload.sample_pos_reg = payload.num_regs;
+      payload.num_regs++;
     }
  
     /* R32: MSAA input coverage mask */
-   if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
+   prog_data->uses_sample_mask =
+      (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
+   if (prog_data->uses_sample_mask) {
        assert(devinfo->gen >= 7);
        payload.sample_mask_in_reg = payload.num_regs;
        payload.num_regs++;
@@ -5125,7 +5154,7 @@ fs_visitor::optimize()
  
     if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
        char filename[64];
-      snprintf(filename, 64, "%s%d-%s-00-start",
+      snprintf(filename, 64, "%s%d-%s-00-00-start",
                 stage_abbrev, dispatch_width, nir->info.name);
  
        backend_shader::dump_instructions(filename);
@@ -5178,6 +5207,13 @@ fs_visitor::optimize()
     OPT(opt_combine_constants);
     OPT(lower_integer_multiplication);
  
+   if (devinfo->gen <= 5 && OPT(lower_minmax)) {
+      OPT(opt_cmod_propagation);
+      OPT(opt_cse);
+      OPT(opt_copy_propagate);
+      OPT(dead_code_eliminate);
+   }
+
     lower_uniform_pull_constant_loads();
  
     validate();
@@ -5190,12 +5226,18 @@ fs_visitor::optimize()
  void
  fs_visitor::fixup_3src_null_dest()
  {
+   bool progress = false;
+
     foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
        if (inst->is_3src() && inst->dst.is_null()) {
           inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                              inst->dst.type);
+         progress = true;
        }
     }
+
+   if (progress)
+      invalidate_live_intervals();
  }
  
  void
@@ -5231,7 +5273,7 @@ fs_visitor::allocate_registers()
         * SIMD8.  There's probably actually some intermediate point where
         * SIMD16 with a couple of spills is still better.
         */
-      if (dispatch_width == 16) {
+      if (dispatch_width == 16 && min_dispatch_width <= 8) {
           fail("Failure to register allocate.  Reduce number of "
                "live scalar values to avoid this.");
        } else {
@@ -5393,9 +5435,9 @@ fs_visitor::run_fs(bool do_rep_send)
     assert(stage == MESA_SHADER_FRAGMENT);
  
     if (devinfo->gen >= 6)
-      setup_payload_gen6();
+      setup_fs_payload_gen6();
     else
-      setup_payload_gen4();
+      setup_fs_payload_gen4();
  
     if (0) {
        emit_dummy_fs();
@@ -5473,6 +5515,13 @@ fs_visitor::run_cs()
     if (shader_time_index >= 0)
        emit_shader_time_begin();
  
+   if (devinfo->is_haswell && prog_data->total_shared > 0) {
+      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
+      const fs_builder abld = bld.exec_all().group(1, 0);
+      abld.MOV(retype(suboffset(brw_sr0_reg(), 1), BRW_REGISTER_TYPE_UW),
+               suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
+   }
+
     emit_nir_code();
  
     if (failed)
@@ -5563,6 +5612,31 @@ brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
     return barycentric_interp_modes;
  }
  
+static void
+brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
+                        bool shade_model_flat, const nir_shader *shader)
+{
+   prog_data->flat_inputs = 0;
+
+   nir_foreach_variable(var, &shader->inputs) {
+      enum glsl_interp_qualifier interp_qualifier =
+         (enum glsl_interp_qualifier)var->data.interpolation;
+      bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
+                         (var->data.location == VARYING_SLOT_COL1);
+
+      int input_index = prog_data->urb_setup[var->data.location];
+
+      if (input_index < 0)
+        continue;
+
+      /* Flat if qualified flat, or an unqualified color under flat shading. */
+      if (interp_qualifier == INTERP_QUALIFIER_FLAT ||
+          (shade_model_flat && is_gl_Color &&
+           interp_qualifier == INTERP_QUALIFIER_NONE))
+         prog_data->flat_inputs |= (1 << input_index);
+   }
+}
+
  static uint8_t
  computed_depth_mode(const nir_shader *shader)
  {
@@ -5597,6 +5671,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
     nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
     shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                        true);
+   brw_nir_lower_fs_inputs(shader);
+   brw_nir_lower_fs_outputs(shader);
     shader = brw_postprocess_nir(shader, compiler->devinfo, true);
  
     /* key->alpha_test_func means simulating alpha testing via discards,
@@ -5645,6 +5721,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
        }
     }
  
+   /* We have to compute the flat inputs after the visitor is finished running
+    * because it relies on prog_data->urb_setup which is computed in
+    * fs_visitor::calculate_urb_setup().
+    */
+   brw_compute_flat_inputs(prog_data, key->flat_shade, shader);
+
     cfg_t *simd8_cfg;
     int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
     if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
@@ -5724,6 +5806,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
     nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
     shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                        true);
+   brw_nir_lower_cs_shared(shader);
+   prog_data->base.total_shared += shader->num_shared;
     shader = brw_postprocess_nir(shader, compiler->devinfo, true);
  
     prog_data->local_size[0] = shader->info.cs.local_size[0];
@@ -5734,6 +5818,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
        shader->info.cs.local_size[2];
  
     unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
+   unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
  
     cfg_t *cfg = NULL;
     const char *fail_msg = NULL;
@@ -5743,11 +5828,13 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
     fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
                   NULL, /* Never used in core profile */
                   shader, 8, shader_time_index);
-   if (!v8.run_cs()) {
-      fail_msg = v8.fail_msg;
-   } else if (local_workgroup_size <= 8 * max_cs_threads) {
-      cfg = v8.cfg;
-      prog_data->simd_size = 8;
+   if (simd_required <= 8) {
+      if (!v8.run_cs()) {
+         fail_msg = v8.fail_msg;
+      } else {
+         cfg = v8.cfg;
+         prog_data->simd_size = 8;
+      }
     }
  
     fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
@@ -5757,7 +5844,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
         !fail_msg && !v8.simd16_unsupported &&
         local_workgroup_size <= 16 * max_cs_threads) {
        /* Try a SIMD16 compile */
-      v16.import_uniforms(&v8);
+      if (simd_required <= 8)
+         v16.import_uniforms(&v8);
        if (!v16.run_cs()) {
           compiler->shader_perf_log(log_data,
                                     "SIMD16 shader failed to compile: %s",