nir: Add a new memory_barrier_tcs_patch intrinsic
[mesa.git] / src / intel / compiler / brw_vec4_tcs.cpp
index b8f2a3bde0a0d052b5f4034994d15183ccf1ea63..e539657335ca1d91ac2312e4c3bcdc8188c178f7 100644 (file)
@@ -257,6 +257,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
                brw_imm_d(key->input_vertices)));
       break;
    case nir_intrinsic_load_per_vertex_input: {
+      assert(nir_dest_bit_size(instr->dest) == 32);
       src_reg indirect_offset = get_indirect_offset(instr);
       unsigned imm_offset = instr->const_index[0];
 
@@ -264,36 +265,10 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
                                     BRW_REGISTER_TYPE_UD);
 
       unsigned first_component = nir_intrinsic_component(instr);
-      if (nir_dest_bit_size(instr->dest) == 64) {
-         /* We need to emit up to two 32-bit URB reads, then shuffle
-          * the result into a temporary, then move to the destination
-          * honoring the writemask
-          *
-          * We don't need to divide first_component by 2 because
-          * emit_input_urb_read takes a 32-bit type.
-          */
-         dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
-         dst_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
-         emit_input_urb_read(tmp_d, vertex_index, imm_offset,
-                             first_component, indirect_offset);
-         if (instr->num_components > 2) {
-            emit_input_urb_read(byte_offset(tmp_d, REG_SIZE), vertex_index,
-                                imm_offset + 1, 0, indirect_offset);
-         }
-
-         src_reg tmp_src = retype(src_reg(tmp_d), BRW_REGISTER_TYPE_DF);
-         dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
-         shuffle_64bit_data(shuffled, tmp_src, false);
-
-         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
-         dst.writemask = brw_writemask_for_size(instr->num_components);
-         emit(MOV(dst, src_reg(shuffled)));
-      } else {
-         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
-         dst.writemask = brw_writemask_for_size(instr->num_components);
-         emit_input_urb_read(dst, vertex_index, imm_offset,
-                             first_component, indirect_offset);
-      }
+      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      dst.writemask = brw_writemask_for_size(instr->num_components);
+      emit_input_urb_read(dst, vertex_index, imm_offset,
+                          first_component, indirect_offset);
       break;
    }
    case nir_intrinsic_load_input:
@@ -313,6 +288,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    }
    case nir_intrinsic_store_output:
    case nir_intrinsic_store_per_vertex_output: {
+      assert(nir_src_bit_size(instr->src[0]) == 32);
       src_reg value = get_nir_src(instr->src[0]);
       unsigned mask = instr->const_index[1];
       unsigned swiz = BRW_SWIZZLE_XYZW;
@@ -322,40 +298,13 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
       unsigned first_component = nir_intrinsic_component(instr);
       if (first_component) {
-         if (nir_src_bit_size(instr->src[0]) == 64)
-            first_component /= 2;
          assert(swiz == BRW_SWIZZLE_XYZW);
          swiz = BRW_SWZ_COMP_OUTPUT(first_component);
          mask = mask << first_component;
       }
 
-      if (nir_src_bit_size(instr->src[0]) == 64) {
-         /* For 64-bit data we need to shuffle the data before we write and
-          * emit two messages. Also, since each channel is twice as large we
-          * need to fix the writemask in each 32-bit message to account for it.
-          */
-         value = swizzle(retype(value, BRW_REGISTER_TYPE_DF), swiz);
-         dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
-         shuffle_64bit_data(shuffled, value, true);
-         src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
-
-         for (int n = 0; n < 2; n++) {
-            unsigned fixed_mask = 0;
-            if (mask & WRITEMASK_X)
-               fixed_mask |= WRITEMASK_XY;
-            if (mask & WRITEMASK_Y)
-               fixed_mask |= WRITEMASK_ZW;
-            emit_urb_write(shuffled_float, fixed_mask,
-                           imm_offset, indirect_offset);
-
-            shuffled_float = byte_offset(shuffled_float, REG_SIZE);
-            mask >>= 2;
-            imm_offset++;
-         }
-      } else {
-         emit_urb_write(swizzle(value, swiz), mask,
-                        imm_offset, indirect_offset);
-      }
+      emit_urb_write(swizzle(value, swiz), mask,
+                     imm_offset, indirect_offset);
       break;
    }
 
@@ -366,6 +315,9 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       break;
    }
 
+   case nir_intrinsic_memory_barrier_tcs_patch:
+      break;
+
    default:
       vec4_visitor::nir_emit_intrinsic(instr);
    }
@@ -380,6 +332,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
                 struct brw_tcs_prog_data *prog_data,
                 nir_shader *nir,
                 int shader_time_index,
+                struct brw_compile_stats *stats,
                 char **error_str)
 {
    const struct gen_device_info *devinfo = compiler->devinfo;
@@ -397,7 +350,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
                             nir->info.outputs_written,
                             nir->info.patch_outputs_written);
 
-   brw_nir_apply_sampler_key(nir, compiler, &key->base.tex, is_scalar);
+   brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar);
    brw_nir_lower_vue_inputs(nir, &input_vue_map);
    brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
                              key->tes_primitive_mode);
@@ -410,12 +363,13 @@ brw_compile_tcs(const struct brw_compiler *compiler,
       nir->info.system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID);
 
    if (compiler->use_tcs_8_patch &&
-       nir->info.tess.tcs_vertices_out <= 16 &&
+       nir->info.tess.tcs_vertices_out <= (devinfo->gen >= 12 ? 32 : 16) &&
        2 + has_primitive_id + key->input_vertices <= 31) {
-      /* 3DSTATE_HS imposes two constraints on using 8_PATCH mode.  First,
-       * the "Instance" field limits the number of output vertices to [1, 16].
-       * Secondly, the "Dispatch GRF Start Register for URB Data" field is
-       * limited to [0, 31] - which imposes a limit on the input vertices.
+      /* 3DSTATE_HS imposes two constraints on using 8_PATCH mode. First, the
+       * "Instance" field limits the number of output vertices to [1, 16] on
+       * gen11 and below, or [1, 32] on gen12 and above. Secondly, the
+       * "Dispatch GRF Start Register for URB Data" field is limited to [0,
+       * 31] - which imposes a limit on the input vertices.
        */
       vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_8_PATCH;
       prog_data->instances = nir->info.tess.tcs_vertices_out;
@@ -476,7 +430,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
 
    if (is_scalar) {
       fs_visitor v(compiler, log_data, mem_ctx, &key->base,
-                   &prog_data->base.base, NULL, nir, 8,
+                   &prog_data->base.base, nir, 8,
                    shader_time_index, &input_vue_map);
       if (!v.run_tcs()) {
          if (error_str)
@@ -487,7 +441,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
       prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
 
       fs_generator g(compiler, log_data, mem_ctx,
-                     &prog_data->base.base, v.promoted_constants, false,
+                     &prog_data->base.base, v.shader_stats, false,
                      MESA_SHADER_TESS_CTRL);
       if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
          g.enable_debug(ralloc_asprintf(mem_ctx,
@@ -497,7 +451,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
                                         nir->info.name));
       }
 
-      g.generate_code(v.cfg, 8);
+      g.generate_code(v.cfg, 8, stats);
 
       assembly = g.get_assembly();
    } else {
@@ -514,7 +468,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
 
 
       assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
-                                            &prog_data->base, v.cfg);
+                                            &prog_data->base, v.cfg, stats);
    }
 
    return assembly;