Added a few more stubs so that control reaches DestroyDevice().

[mesa.git] / src / intel / compiler / brw_vec4_tcs.cpp
diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp

index 1f0a69f371b2b54697dd4b70862d3f484d03f418..0e4c02ed4043ba59e92daad56da63a32b5ba6852 100644 (file)
--- a/src/intel/compiler/brw_vec4_tcs.cpp
+++ b/src/intel/compiler/brw_vec4_tcs.cpp
@@ -30,7 +30,7 @@
  #include "brw_nir.h"
  #include "brw_vec4_tcs.h"
  #include "brw_fs.h"
-#include "common/gen_debug.h"
+#include "dev/gen_debug.h"
  
  namespace brw {
  
@@ -42,7 +42,7 @@ vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler,
                                     void *mem_ctx,
                                     int shader_time_index,
                                     const struct brw_vue_map *input_vue_map)
-   : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
+   : vec4_visitor(compiler, log_data, &key->base.tex, &prog_data->base,
                    nir, mem_ctx, false, shader_time_index),
       input_vue_map(input_vue_map), key(key)
  {
@@ -257,45 +257,18 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
                 brw_imm_d(key->input_vertices)));
        break;
     case nir_intrinsic_load_per_vertex_input: {
+      assert(nir_dest_bit_size(instr->dest) == 32);
        src_reg indirect_offset = get_indirect_offset(instr);
        unsigned imm_offset = instr->const_index[0];
  
-      nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
-      src_reg vertex_index =
-         vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0]))
-                      : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
+      src_reg vertex_index = retype(get_nir_src_imm(instr->src[0]),
+                                    BRW_REGISTER_TYPE_UD);
  
        unsigned first_component = nir_intrinsic_component(instr);
-      if (nir_dest_bit_size(instr->dest) == 64) {
-         /* We need to emit up to two 32-bit URB reads, then shuffle
-          * the result into a temporary, then move to the destination
-          * honoring the writemask
-          *
-          * We don't need to divide first_component by 2 because
-          * emit_input_urb_read takes a 32-bit type.
-          */
-         dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
-         dst_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
-         emit_input_urb_read(tmp_d, vertex_index, imm_offset,
-                             first_component, indirect_offset);
-         if (instr->num_components > 2) {
-            emit_input_urb_read(byte_offset(tmp_d, REG_SIZE), vertex_index,
-                                imm_offset + 1, 0, indirect_offset);
-         }
-
-         src_reg tmp_src = retype(src_reg(tmp_d), BRW_REGISTER_TYPE_DF);
-         dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
-         shuffle_64bit_data(shuffled, tmp_src, false);
-
-         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
-         dst.writemask = brw_writemask_for_size(instr->num_components);
-         emit(MOV(dst, src_reg(shuffled)));
-      } else {
-         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
-         dst.writemask = brw_writemask_for_size(instr->num_components);
-         emit_input_urb_read(dst, vertex_index, imm_offset,
-                             first_component, indirect_offset);
-      }
+      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      dst.writemask = brw_writemask_for_size(instr->num_components);
+      emit_input_urb_read(dst, vertex_index, imm_offset,
+                          first_component, indirect_offset);
        break;
     }
     case nir_intrinsic_load_input:
@@ -315,6 +288,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
     }
     case nir_intrinsic_store_output:
     case nir_intrinsic_store_per_vertex_output: {
+      assert(nir_src_bit_size(instr->src[0]) == 32);
        src_reg value = get_nir_src(instr->src[0]);
        unsigned mask = instr->const_index[1];
        unsigned swiz = BRW_SWIZZLE_XYZW;
@@ -324,55 +298,61 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
  
        unsigned first_component = nir_intrinsic_component(instr);
        if (first_component) {
-         if (nir_src_bit_size(instr->src[0]) == 64)
-            first_component /= 2;
           assert(swiz == BRW_SWIZZLE_XYZW);
           swiz = BRW_SWZ_COMP_OUTPUT(first_component);
           mask = mask << first_component;
        }
  
-      if (nir_src_bit_size(instr->src[0]) == 64) {
-         /* For 64-bit data we need to shuffle the data before we write and
-          * emit two messages. Also, since each channel is twice as large we
-          * need to fix the writemask in each 32-bit message to account for it.
-          */
-         value = swizzle(retype(value, BRW_REGISTER_TYPE_DF), swiz);
-         dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
-         shuffle_64bit_data(shuffled, value, true);
-         src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
-
-         for (int n = 0; n < 2; n++) {
-            unsigned fixed_mask = 0;
-            if (mask & WRITEMASK_X)
-               fixed_mask |= WRITEMASK_XY;
-            if (mask & WRITEMASK_Y)
-               fixed_mask |= WRITEMASK_ZW;
-            emit_urb_write(shuffled_float, fixed_mask,
-                           imm_offset, indirect_offset);
-
-            shuffled_float = byte_offset(shuffled_float, REG_SIZE);
-            mask >>= 2;
-            imm_offset++;
-         }
-      } else {
-         emit_urb_write(swizzle(value, swiz), mask,
-                        imm_offset, indirect_offset);
-      }
+      emit_urb_write(swizzle(value, swiz), mask,
+                     imm_offset, indirect_offset);
        break;
     }
  
-   case nir_intrinsic_barrier: {
+   case nir_intrinsic_control_barrier: {
        dst_reg header = dst_reg(this, glsl_type::uvec4_type);
        emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
        emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
        break;
     }
  
+   case nir_intrinsic_memory_barrier_tcs_patch:
+      break;
+
     default:
        vec4_visitor::nir_emit_intrinsic(instr);
     }
  }
  
+/**
+ * Return the number of patches to accumulate before an 8_PATCH mode thread is
+ * launched.  With many input control points and a large amount of VS outputs,
+ * the VS URB space needed to store 8 whole patches of data can be prohibitive,
+ * so it can be beneficial to launch threads early.  See the 3DSTATE_HS::Patch
+ * Count Threshold documentation for the recommended values.
+ *
+ * \param input_control_points  number of input control points.
+ * \return 0 ("disable" early dispatch, i.e. wait for a full 8 patches) for up
+ *         to 4 control points; otherwise 5 down to 1 as the count grows.
+ */
+static int
+get_patch_count_threshold(int input_control_points)
+{
+   if (input_control_points <= 4)
+      return 0;
+   else if (input_control_points <= 6)
+      return 5;
+   else if (input_control_points <= 8)
+      return 4;
+   else if (input_control_points <= 10)
+      return 3;
+   else if (input_control_points <= 14)
+      return 2;
+
+   /* Return patch count 1 for PATCHLIST_15 - PATCHLIST_32 */
+   return 1;
+}
+
+} /* namespace brw */
  
  extern "C" const unsigned *
  brw_compile_tcs(const struct brw_compiler *compiler,
@@ -380,8 +360,9 @@ brw_compile_tcs(const struct brw_compiler *compiler,
                  void *mem_ctx,
                  const struct brw_tcs_prog_key *key,
                  struct brw_tcs_prog_data *prog_data,
-                const nir_shader *src_shader,
+                nir_shader *nir,
                  int shader_time_index,
+                struct brw_compile_stats *stats,
                  char **error_str)
  {
     const struct gen_device_info *devinfo = compiler->devinfo;
@@ -389,30 +370,48 @@ brw_compile_tcs(const struct brw_compiler *compiler,
     const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
     const unsigned *assembly;
  
-   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
     nir->info.outputs_written = key->outputs_written;
     nir->info.patch_outputs_written = key->patch_outputs_written;
  
     struct brw_vue_map input_vue_map;
     brw_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read,
-                       nir->info.separate_shader);
+                       nir->info.separate_shader, 1);
     brw_compute_tess_vue_map(&vue_prog_data->vue_map,
                              nir->info.outputs_written,
                              nir->info.patch_outputs_written);
  
-   nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar);
+   brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar);
     brw_nir_lower_vue_inputs(nir, &input_vue_map);
     brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
                               key->tes_primitive_mode);
     if (key->quads_workaround)
        brw_nir_apply_tcs_quads_workaround(nir);
  
-   nir = brw_postprocess_nir(nir, compiler, is_scalar);
+   brw_postprocess_nir(nir, compiler, is_scalar);
  
-   if (is_scalar)
-      prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 8);
-   else
-      prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 2);
+   bool has_primitive_id =
+      nir->info.system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID);
+
+   prog_data->patch_count_threshold = brw::get_patch_count_threshold(key->input_vertices);
+
+   if (compiler->use_tcs_8_patch &&
+       nir->info.tess.tcs_vertices_out <= (devinfo->gen >= 12 ? 32 : 16) &&
+       2 + has_primitive_id + key->input_vertices <= (devinfo->gen >= 12 ? 63 : 31)) {
+      /* 3DSTATE_HS imposes two constraints on using 8_PATCH mode. First, the
+       * "Instance" field limits the number of output vertices to [1, 16] on
+       * gen11 and below, or [1, 32] on gen12 and above. Secondly, the
+       * "Dispatch GRF Start Register for URB Data" field is limited to [0,
+       * 31] - which imposes a limit on the input vertices.
+       */
+      vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_8_PATCH;
+      prog_data->instances = nir->info.tess.tcs_vertices_out;
+      prog_data->include_primitive_id = has_primitive_id;
+   } else {
+      unsigned verts_per_thread = is_scalar ? 8 : 2;
+      vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_SINGLE_PATCH;
+      prog_data->instances =
+         DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
+   }
  
     /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
      * That divides up as follows:
@@ -462,21 +461,19 @@ brw_compile_tcs(const struct brw_compiler *compiler,
     }
  
     if (is_scalar) {
-      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
-                   &prog_data->base.base, NULL, nir, 8,
+      fs_visitor v(compiler, log_data, mem_ctx, &key->base,
+                   &prog_data->base.base, nir, 8,
                     shader_time_index, &input_vue_map);
-      if (!v.run_tcs_single_patch()) {
+      if (!v.run_tcs()) {
           if (error_str)
              *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
           return NULL;
        }
  
        prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
-      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
  
        fs_generator g(compiler, log_data, mem_ctx,
-                     &prog_data->base.base, v.promoted_constants, false,
-                     MESA_SHADER_TESS_CTRL);
+                     &prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
        if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
           g.enable_debug(ralloc_asprintf(mem_ctx,
                                          "%s tessellation control shader %s",
@@ -485,11 +482,14 @@ brw_compile_tcs(const struct brw_compiler *compiler,
                                          nir->info.name));
        }
  
-      g.generate_code(v.cfg, 8);
+      g.generate_code(v.cfg, 8, v.shader_stats,
+                      v.performance_analysis.require(), stats);
+
+      g.add_const_data(nir->constant_data, nir->constant_data_size);
  
        assembly = g.get_assembly();
     } else {
-      vec4_tcs_visitor v(compiler, log_data, key, prog_data,
+      brw::vec4_tcs_visitor v(compiler, log_data, key, prog_data,
                           nir, mem_ctx, shader_time_index, &input_vue_map);
        if (!v.run()) {
           if (error_str)
@@ -502,11 +502,10 @@ brw_compile_tcs(const struct brw_compiler *compiler,
  
  
        assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
-                                            &prog_data->base, v.cfg);
+                                            &prog_data->base, v.cfg,
+                                            v.performance_analysis.require(),
+                                            stats);
     }
  
     return assembly;
  }
-
-
-} /* namespace brw */