intel/fs,vec4: Stuff the constant data from NIR in the end of the program
[mesa.git] / src / intel / compiler / brw_fs.cpp
index 63a724e07b7eefbcc754e668b2fd6c974521beaf..460a36587a1ba3d167cd8b50853a23cdf6acc9ac 100644 (file)
@@ -3930,7 +3930,20 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
       high.offset = inst->dst.offset % REG_SIZE;
 
       if (devinfo->gen >= 7) {
-         if (inst->src[1].abs)
+         /* From GEN:BUG:1604601757:
+          *
+          * "When multiplying a DW and any lower precision integer, source modifier
+          *  is not supported."
+          *
+          * An unsupported negate modifier on src[1] would ordinarily be
+          * lowered by the subsequent lower_regioning pass.  In this case that
+          * pass would spawn another dword multiply.  Instead, lower the
+          * modifier first.
+          */
+         const bool source_mods_unsupported = (devinfo->gen >= 12);
+
+         if (inst->src[1].abs || (inst->src[1].negate &&
+                                  source_mods_unsupported))
             lower_src_modifiers(this, block, inst, 1);
 
          if (inst->src[1].file == IMM) {
@@ -7220,24 +7233,6 @@ fs_visitor::setup_fs_payload_gen6()
    assert(dispatch_width % payload_width == 0);
    assert(devinfo->gen >= 6);
 
-   prog_data->uses_src_depth = prog_data->uses_src_w =
-      (nir->info.system_values_read & (1ull << SYSTEM_VALUE_FRAG_COORD)) != 0;
-
-   prog_data->uses_sample_mask =
-      (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
-
-   /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
-    *
-    *    "MSDISPMODE_PERSAMPLE is required in order to select
-    *    POSOFFSET_SAMPLE"
-    *
-    * So we can only really get sample positions if we are doing real
-    * per-sample dispatch.  If we need gl_SamplePosition and we don't have
-    * persample dispatch, we hard-code it to 0.5.
-    */
-   prog_data->uses_pos_offset = prog_data->persample_dispatch &&
-      (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);
-
    /* R0: PS thread payload header. */
    payload.num_regs++;
 
@@ -7846,7 +7841,7 @@ fs_visitor::allocate_registers(bool allow_spilling)
 
       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
 
-      if (stage == MESA_SHADER_COMPUTE) {
+      if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) {
          if (devinfo->is_haswell) {
             /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
              * field documentation, Haswell supports a minimum of 2kB of
@@ -8234,7 +8229,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
 bool
 fs_visitor::run_cs(bool allow_spilling)
 {
-   assert(stage == MESA_SHADER_COMPUTE);
+   assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
 
    setup_cs_payload();
 
@@ -8353,7 +8348,7 @@ brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
 {
    prog_data->flat_inputs = 0;
 
-   nir_foreach_variable(var, &shader->inputs) {
+   nir_foreach_shader_in_variable(var, shader) {
       unsigned slots = glsl_count_attribute_slots(var->type, false);
       for (unsigned s = 0; s < slots; s++) {
          int input_index = prog_data->urb_setup[var->data.location + s];
@@ -8455,9 +8450,8 @@ brw_nir_move_interpolation_to_top(nir_shader *nir)
             }
          }
       }
-      nir_metadata_preserve(f->impl, (nir_metadata)
-                            ((unsigned) nir_metadata_block_index |
-                             (unsigned) nir_metadata_dominance));
+      nir_metadata_preserve(f->impl, nir_metadata_block_index |
+                                     nir_metadata_dominance);
    }
 
    return progress;
@@ -8501,14 +8495,71 @@ brw_nir_demote_sample_qualifiers(nir_shader *nir)
          }
       }
 
-      nir_metadata_preserve(f->impl, (nir_metadata)
-                            ((unsigned) nir_metadata_block_index |
-                             (unsigned) nir_metadata_dominance));
+      nir_metadata_preserve(f->impl, nir_metadata_block_index |
+                                     nir_metadata_dominance);
    }
 
    return progress;
 }
 
+void
+brw_nir_populate_wm_prog_data(const nir_shader *shader,
+                              const struct gen_device_info *devinfo,
+                              const struct brw_wm_prog_key *key,
+                              struct brw_wm_prog_data *prog_data)
+{  /* Derives prog_data fields from shader->info, the key and devinfo. */
+   prog_data->uses_src_depth = prog_data->uses_src_w =
+      shader->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_FRAG_COORD);
+
+   /* key->alpha_test_func means simulating alpha testing via discards,
+    * so the shader definitely kills pixels.
+    */
+   prog_data->uses_kill = shader->info.fs.uses_discard ||
+      key->alpha_test_func;
+   prog_data->uses_omask = !key->ignore_sample_mask_out &&
+      (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
+   prog_data->computed_depth_mode = computed_depth_mode(shader);
+   prog_data->computed_stencil =
+      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
+   /* Per-sample dispatch: multisample FBO plus any trigger listed below. */
+   prog_data->persample_dispatch =
+      key->multisample_fbo &&
+      (key->persample_interp ||
+       (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
+                                            SYSTEM_BIT_SAMPLE_POS)) ||
+       shader->info.fs.uses_sample_qualifier ||
+       shader->info.outputs_read);
+
+   if (devinfo->gen >= 6) {
+      prog_data->uses_sample_mask =
+         shader->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN;
+
+      /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+       *
+       *    "MSDISPMODE_PERSAMPLE is required in order to select
+       *    POSOFFSET_SAMPLE"
+       *
+       * So we can only really get sample positions if we are doing real
+       * per-sample dispatch.  If we need gl_SamplePosition and we don't have
+       * persample dispatch, we hard-code it to 0.5.
+       */
+      prog_data->uses_pos_offset = prog_data->persample_dispatch &&
+         (shader->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);
+   }
+
+   prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;
+
+   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
+   prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
+   prog_data->inner_coverage = shader->info.fs.inner_coverage;
+
+   prog_data->barycentric_interp_modes =
+      brw_compute_barycentric_interp_modes(devinfo, shader);
+   /* Flat inputs read urb_setup; presumably the next call fills it. */
+   calculate_urb_setup(devinfo, key, prog_data, shader);
+   brw_compute_flat_inputs(prog_data, shader);
+}
+
 /**
  * Pre-gen6, the register file of the EUs was shared between threads,
  * and each thread used some subset allocated on a 16-register block
@@ -8560,36 +8611,7 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
    NIR_PASS_V(shader, brw_nir_move_interpolation_to_top);
    brw_postprocess_nir(shader, compiler, true);
 
-   /* key->alpha_test_func means simulating alpha testing via discards,
-    * so the shader definitely kills pixels.
-    */
-   prog_data->uses_kill = shader->info.fs.uses_discard ||
-      key->alpha_test_func;
-   prog_data->uses_omask = !key->ignore_sample_mask_out &&
-      (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
-   prog_data->computed_depth_mode = computed_depth_mode(shader);
-   prog_data->computed_stencil =
-      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
-
-   prog_data->persample_dispatch =
-      key->multisample_fbo &&
-      (key->persample_interp ||
-       (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
-                                            SYSTEM_BIT_SAMPLE_POS)) ||
-       shader->info.fs.uses_sample_qualifier ||
-       shader->info.outputs_read);
-
-   prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;
-
-   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
-   prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
-   prog_data->inner_coverage = shader->info.fs.inner_coverage;
-
-   prog_data->barycentric_interp_modes =
-      brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
-
-   calculate_urb_setup(devinfo, key, prog_data, shader);
-   brw_compute_flat_inputs(prog_data, shader);
+   brw_nir_populate_wm_prog_data(shader, compiler->devinfo, key, prog_data);
 
    fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
    cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
@@ -8646,10 +8668,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
       }
    }
 
+   const bool simd16_failed = v16 && !simd16_cfg;
+
    /* Currently, the compiler only supports SIMD32 on SNB+ */
    if (!has_spilled &&
        v8->max_dispatch_width >= 32 && !use_rep_send &&
-       devinfo->gen >= 6 && simd16_cfg &&
+       devinfo->gen >= 6 && !simd16_failed &&
        !(INTEL_DEBUG & DEBUG_NO32)) {
       /* Try a SIMD32 compile */
       v32 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
@@ -8760,6 +8784,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
       stats = stats ? stats + 1 : NULL;
    }
 
+   g.add_const_data(shader->constant_data, shader->constant_data_size);
+
    delete v8;
    delete v16;
    delete v32;
@@ -8770,7 +8796,7 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
 fs_reg *
 fs_visitor::emit_cs_work_group_id_setup()
 {
-   assert(stage == MESA_SHADER_COMPUTE);
+   assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
 
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
 
@@ -8922,7 +8948,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                char **error_str)
 {
    prog_data->base.total_shared = src_shader->info.cs.shared_size;
-   prog_data->slm_size = src_shader->num_shared;
+   prog_data->slm_size = src_shader->shared_size;
 
    /* Generate code for all the possible SIMD variants. */
    bool generate_all;
@@ -9043,7 +9069,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
     *
     * TODO: Use performance_analysis and drop this boolean.
     */
-   const bool needs_32 = min_dispatch_width > 16 ||
+   const bool needs_32 = v == NULL ||
                          (INTEL_DEBUG & DEBUG_DO32) ||
                          generate_all;
 
@@ -9143,6 +9169,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                       v->performance_analysis.require(), stats);
    }
 
+   g.add_const_data(src_shader->constant_data, src_shader->constant_data_size);
+
    ret = g.get_assembly();
 
    delete v8;