#include "brw_nir.h"
#include "brw_vec4_tcs.h"
#include "brw_fs.h"
-#include "common/gen_debug.h"
+#include "dev/gen_debug.h"
namespace brw {
void *mem_ctx,
int shader_time_index,
const struct brw_vue_map *input_vue_map)
- : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
+ : vec4_visitor(compiler, log_data, &key->base.tex, &prog_data->base,
nir, mem_ctx, false, shader_time_index),
input_vue_map(input_vue_map), key(key)
{
brw_imm_d(key->input_vertices)));
break;
case nir_intrinsic_load_per_vertex_input: {
+ assert(nir_dest_bit_size(instr->dest) == 32);
src_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = instr->const_index[0];
BRW_REGISTER_TYPE_UD);
unsigned first_component = nir_intrinsic_component(instr);
- if (nir_dest_bit_size(instr->dest) == 64) {
- /* We need to emit up to two 32-bit URB reads, then shuffle
- * the result into a temporary, then move to the destination
- * honoring the writemask
- *
- * We don't need to divide first_component by 2 because
- * emit_input_urb_read takes a 32-bit type.
- */
- dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
- dst_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
- emit_input_urb_read(tmp_d, vertex_index, imm_offset,
- first_component, indirect_offset);
- if (instr->num_components > 2) {
- emit_input_urb_read(byte_offset(tmp_d, REG_SIZE), vertex_index,
- imm_offset + 1, 0, indirect_offset);
- }
-
- src_reg tmp_src = retype(src_reg(tmp_d), BRW_REGISTER_TYPE_DF);
- dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
- shuffle_64bit_data(shuffled, tmp_src, false);
-
- dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
- dst.writemask = brw_writemask_for_size(instr->num_components);
- emit(MOV(dst, src_reg(shuffled)));
- } else {
- dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
- dst.writemask = brw_writemask_for_size(instr->num_components);
- emit_input_urb_read(dst, vertex_index, imm_offset,
- first_component, indirect_offset);
- }
+ dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+ dst.writemask = brw_writemask_for_size(instr->num_components);
+ emit_input_urb_read(dst, vertex_index, imm_offset,
+ first_component, indirect_offset);
break;
}
case nir_intrinsic_load_input:
}
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_vertex_output: {
+ assert(nir_src_bit_size(instr->src[0]) == 32);
src_reg value = get_nir_src(instr->src[0]);
unsigned mask = instr->const_index[1];
unsigned swiz = BRW_SWIZZLE_XYZW;
unsigned first_component = nir_intrinsic_component(instr);
if (first_component) {
- if (nir_src_bit_size(instr->src[0]) == 64)
- first_component /= 2;
assert(swiz == BRW_SWIZZLE_XYZW);
swiz = BRW_SWZ_COMP_OUTPUT(first_component);
mask = mask << first_component;
}
- if (nir_src_bit_size(instr->src[0]) == 64) {
- /* For 64-bit data we need to shuffle the data before we write and
- * emit two messages. Also, since each channel is twice as large we
- * need to fix the writemask in each 32-bit message to account for it.
- */
- value = swizzle(retype(value, BRW_REGISTER_TYPE_DF), swiz);
- dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
- shuffle_64bit_data(shuffled, value, true);
- src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
-
- for (int n = 0; n < 2; n++) {
- unsigned fixed_mask = 0;
- if (mask & WRITEMASK_X)
- fixed_mask |= WRITEMASK_XY;
- if (mask & WRITEMASK_Y)
- fixed_mask |= WRITEMASK_ZW;
- emit_urb_write(shuffled_float, fixed_mask,
- imm_offset, indirect_offset);
-
- shuffled_float = byte_offset(shuffled_float, REG_SIZE);
- mask >>= 2;
- imm_offset++;
- }
- } else {
- emit_urb_write(swizzle(value, swiz), mask,
- imm_offset, indirect_offset);
- }
+ emit_urb_write(swizzle(value, swiz), mask,
+ imm_offset, indirect_offset);
break;
}
break;
}
+ case nir_intrinsic_memory_barrier_tcs_patch:
+ break;
+
default:
vec4_visitor::nir_emit_intrinsic(instr);
}
void *mem_ctx,
const struct brw_tcs_prog_key *key,
struct brw_tcs_prog_data *prog_data,
- const nir_shader *src_shader,
+ nir_shader *nir,
int shader_time_index,
+ struct brw_compile_stats *stats,
char **error_str)
{
const struct gen_device_info *devinfo = compiler->devinfo;
const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
const unsigned *assembly;
- nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
nir->info.outputs_written = key->outputs_written;
nir->info.patch_outputs_written = key->patch_outputs_written;
nir->info.outputs_written,
nir->info.patch_outputs_written);
- nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar);
+ brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar);
brw_nir_lower_vue_inputs(nir, &input_vue_map);
brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
key->tes_primitive_mode);
if (key->quads_workaround)
brw_nir_apply_tcs_quads_workaround(nir);
- nir = brw_postprocess_nir(nir, compiler, is_scalar);
+ brw_postprocess_nir(nir, compiler, is_scalar);
+
+ bool has_primitive_id =
+ nir->info.system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID);
- if (is_scalar)
- prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 8);
- else
- prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 2);
+ if (compiler->use_tcs_8_patch &&
+ nir->info.tess.tcs_vertices_out <= (devinfo->gen >= 12 ? 32 : 16) &&
+ 2 + has_primitive_id + key->input_vertices <= 31) {
+ /* 3DSTATE_HS imposes two constraints on using 8_PATCH mode. First, the
+ * "Instance" field limits the number of output vertices to [1, 16] on
+ * gen11 and below, or [1, 32] on gen12 and above. Second, the
+ * "Dispatch GRF Start Register for URB Data" field is limited to [0,
+ * 31], which is why the check above bounds
+ * 2 + has_primitive_id + key->input_vertices by 31.
+ */
+ vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_8_PATCH;
+ prog_data->instances = nir->info.tess.tcs_vertices_out;
+ prog_data->include_primitive_id = has_primitive_id;
+ } else {
+ unsigned verts_per_thread = is_scalar ? 8 : 2;
+ vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_SINGLE_PATCH;
+ prog_data->instances =
+ DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
+ }
/* Compute URB entry size. The maximum allowed URB entry size is 32k.
* That divides up as follows:
}
if (is_scalar) {
- fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
- &prog_data->base.base, NULL, nir, 8,
+ fs_visitor v(compiler, log_data, mem_ctx, &key->base,
+ &prog_data->base.base, nir, 8,
shader_time_index, &input_vue_map);
- if (!v.run_tcs_single_patch()) {
+ if (!v.run_tcs()) {
if (error_str)
*error_str = ralloc_strdup(mem_ctx, v.fail_msg);
return NULL;
}
prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
- prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
fs_generator g(compiler, log_data, mem_ctx,
- &prog_data->base.base, v.promoted_constants, false,
+ &prog_data->base.base, v.shader_stats, false,
MESA_SHADER_TESS_CTRL);
if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
g.enable_debug(ralloc_asprintf(mem_ctx,
nir->info.name));
}
- g.generate_code(v.cfg, 8);
+ g.generate_code(v.cfg, 8, stats);
assembly = g.get_assembly();
} else {
assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
- &prog_data->base, v.cfg);
+ &prog_data->base, v.cfg, stats);
}
return assembly;