for (unsigned i = 0; i < 8; ++i) {
if (write_mask & (1 << i)) {
- ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
- ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
+ ctx->vsgs_output.mask[idx / 4u] |= 1 << (idx % 4u);
+ ctx->vsgs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
}
idx++;
}
return;
}
+void visit_store_vsgs_output(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ unsigned write_mask = nir_intrinsic_write_mask(instr);
+ unsigned component = nir_intrinsic_component(instr);
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
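+   /* base and component count 32-bit components; scale by 4 to get the byte
+    * offset of this store within the vertex's ESGS item in LDS. */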
+ unsigned idx = (nir_intrinsic_base(instr) + component) * 4u;
+ Operand offset(s1);
+ Builder bld(ctx->program, ctx->block);
+
+ nir_instr *off_instr = instr->src[1].ssa->parent_instr;
+ if (off_instr->type != nir_instr_type_load_const)
+ offset = bld.v_mul24_imm(bld.def(v1), get_ssa_temp(ctx, instr->src[1].ssa), 16u);
+ else
+ idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 16u;
+
+ unsigned itemsize = ctx->program->info->vs.es_info.esgs_itemsize;
+
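+   /* Index of this ES thread within the workgroup: its position in the wave
+    * plus the wave index (bits 27:24 of merged_wave_info) times the wave size.
+    * The v_or_b32 below acts as an add because the two terms never have
+    * overlapping bits. */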
+ Temp vertex_idx = emit_mbcnt(ctx, bld.def(v1));
+ Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
+ vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), vertex_idx,
+ bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
+
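+   /* Each ES vertex occupies esgs_itemsize bytes in LDS; add the dynamic part
+    * of the store offset, if there is one. */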
+ Temp lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
+ if (!offset.isUndefined())
+ lds_base = bld.vadd32(bld.def(v1), offset, lds_base);
+
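+   /* Use the largest power of two dividing both the item size and the constant
+    * byte offset as the LDS access alignment. */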
+ unsigned align = 1 << (ffs(itemsize) - 1);
+ if (idx)
+ align = std::min(align, 1u << (ffs(idx) - 1));
+
+ unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+ store_lds(ctx, elem_size_bytes, src, write_mask, lds_base, idx, align);
+}
+
+void visit_store_gs_output_gfx9(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   /* This wouldn't work if the store weren't in the same block as the
+    * emit_vertex_with_counter intrinsic, but nir_lower_io_to_temporaries()
+    * guarantees that it always is. */
+
+ unsigned write_mask = nir_intrinsic_write_mask(instr);
+ unsigned component = nir_intrinsic_component(instr);
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+ unsigned idx = nir_intrinsic_base(instr) + component;
+
+ nir_instr *off_instr = instr->src[1].ssa->parent_instr;
+ if (off_instr->type != nir_instr_type_load_const)
+ unreachable("Indirect GS output stores should have been lowered");
+ idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
+
+ if (instr->src[0].ssa->bit_size == 64)
+ write_mask = widen_mask(write_mask, 2);
+
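+   /* GS outputs are only staged here; they are written to the GSVS ring when
+    * the matching emit_vertex_with_counter intrinsic is visited. */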
+ for (unsigned i = 0; i < 8; ++i) {
+ if (write_mask & (1 << i)) {
+ ctx->vsgs_output.mask[idx / 4u] |= 1 << (idx % 4u);
+ ctx->vsgs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
+ }
+ idx++;
+ }
+}
+
void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
{
if (ctx->stage == vertex_vs) {
visit_store_vs_output(ctx, instr);
} else if (ctx->stage == fragment_fs) {
visit_store_fs_output(ctx, instr);
+ } else if (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX) {
+ visit_store_vsgs_output(ctx, instr);
+ } else if (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
+ visit_store_gs_output_gfx9(ctx, instr);
} else {
unreachable("Shader stage not implemented");
}
}
}
+void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ assert(ctx->stage == vertex_geometry_gs);
+ assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
+
+ Builder bld(ctx->program, ctx->block);
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
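+   /* On GFX9+ the ESGS vertex offsets arrive packed two per gs_vtx_offset
+    * argument (16 bits each), so extract the 16-bit offset of the requested
+    * vertex. */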
+ Temp offset = Temp();
+ if (instr->src[0].ssa->parent_instr->type != nir_instr_type_load_const) {
+ /* better code could be created, but this case probably doesn't happen
+ * much in practice */
+ Temp indirect_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+ for (unsigned i = 0; i < ctx->shader->info.gs.vertices_in; i++) {
+ Temp elem = get_arg(ctx, ctx->args->gs_vtx_offset[i / 2u * 2u]);
+ if (i % 2u)
+ elem = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), elem);
+ if (offset.id()) {
+ Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(s2)),
+ Operand(i), indirect_vertex);
+ offset = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), offset, elem, cond);
+ } else {
+ offset = elem;
+ }
+ }
+ offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), offset);
+ } else {
+ unsigned vertex = nir_src_as_uint(instr->src[0]);
+ offset = bld.vop3(
+ aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->gs_vtx_offset[vertex / 2u * 2u]),
+ Operand((vertex % 2u) * 16u), Operand(16u));
+ }
+
+ unsigned const_offset = nir_intrinsic_base(instr);
+ const_offset += nir_intrinsic_component(instr);
+
+ nir_instr *off_instr = instr->src[1].ssa->parent_instr;
+ if (off_instr->type != nir_instr_type_load_const) {
+ Temp indirect_offset = get_ssa_temp(ctx, instr->src[1].ssa);
+ offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u),
+ bld.vadd32(bld.def(v1), indirect_offset, offset));
+ } else {
+ const_offset += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
+ }
+ const_offset *= 4u;
+
+ offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), offset);
+
+ unsigned itemsize = ctx->program->info->vs.es_info.esgs_itemsize;
+ unsigned align = 16; /* alignment of indirect offset */
+ align = std::min(align, 1u << (ffs(itemsize) - 1));
+ if (const_offset)
+ align = std::min(align, 1u << (ffs(const_offset) - 1));
+
+ unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
+ load_lds(ctx, elem_size_bytes, dst, offset, const_offset, align);
+}
+
Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
{
if (ctx->program->info->need_indirect_descriptor_sets) {
switch(instr->intrinsic) {
case nir_intrinsic_group_memory_barrier:
case nir_intrinsic_memory_barrier:
- bld.barrier(aco_opcode::p_memory_barrier_all);
+ bld.barrier(aco_opcode::p_memory_barrier_common);
break;
case nir_intrinsic_memory_barrier_buffer:
bld.barrier(aco_opcode::p_memory_barrier_buffer);
bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
}
+void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ unsigned stream = nir_intrinsic_stream_id(instr);
+ Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+ next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
+ nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]);
+
+ /* get GSVS ring */
+ Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u));
+
+ unsigned num_components =
+ ctx->program->info->gs.num_stream_output_components[stream];
+ assert(num_components);
+
+ unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
+ unsigned stream_offset = 0;
+ for (unsigned i = 0; i < stream; i++) {
+ unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out;
+ stream_offset += prev_stride * ctx->program->wave_size;
+ }
+
+ /* Limit on the stride field for <= GFX7. */
+ assert(stride < (1 << 14));
+
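+   /* Split the ring descriptor into dwords so the base address, the stride
+    * and num_records can be patched below. */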
+ Temp gsvs_dwords[4];
+ for (unsigned i = 0; i < 4; i++)
+ gsvs_dwords[i] = bld.tmp(s1);
+ bld.pseudo(aco_opcode::p_split_vector,
+ Definition(gsvs_dwords[0]),
+ Definition(gsvs_dwords[1]),
+ Definition(gsvs_dwords[2]),
+ Definition(gsvs_dwords[3]),
+ gsvs_ring);
+
+ if (stream_offset) {
+ Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset));
+
+ Temp carry = bld.tmp(s1);
+ gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp);
+ gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry));
+ }
+
+ gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride)));
+ gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size));
+
+ gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
+ gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]);
+
+ unsigned offset = 0;
+ for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
+ if (ctx->program->info->gs.output_streams[i] != stream)
+ continue;
+
+ for (unsigned j = 0; j < 4; j++) {
+ if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
+ continue;
+
+ if (ctx->vsgs_output.mask[i] & (1 << j)) {
+ Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
+ unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
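+         /* The MTBUF immediate offset field is only 12 bits, so fold any
+          * constant offset >= 4096 into the VGPR address. */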
+ if (const_offset >= 4096u) {
+ if (vaddr_offset.isUndefined())
+ vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u));
+ else
+ vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset);
+ const_offset %= 4096u;
+ }
+
+ aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
+ mtbuf->operands[0] = vaddr_offset;
+ mtbuf->operands[1] = Operand(gsvs_ring);
+ mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset));
+ mtbuf->operands[3] = Operand(ctx->vsgs_output.outputs[i][j]);
+ mtbuf->offen = !vaddr_offset.isUndefined();
+ mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
+ mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
+ mtbuf->offset = const_offset;
+ mtbuf->glc = true;
+ mtbuf->slc = true;
+ mtbuf->barrier = barrier_gs_data;
+ mtbuf->can_reorder = true;
+ bld.insert(std::move(mtbuf));
+ }
+
+ offset += ctx->shader->info.gs.vertices_out;
+ }
+
+      /* The outputs for the next vertex are undefined; keeping them around
+       * can create invalid IR once control flow is involved. */
+ ctx->vsgs_output.mask[i] = 0;
+ }
+
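+   /* Tell the geometry engine that a vertex was emitted on this stream; the
+    * message needs the GS wave ID (bits 23:16 of merged_wave_info) in m0. */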
+ Temp gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand((8u << 16) | 16u));
+ bld.sopp(aco_opcode::s_sendmsg, bld.m0(gs_wave_id), -1, sendmsg_gs(false, true, stream));
+}
+
Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
{
Builder bld(ctx->program, ctx->block);
case nir_intrinsic_load_input:
visit_load_input(ctx, instr);
break;
+ case nir_intrinsic_load_per_vertex_input:
+ visit_load_per_vertex_input(ctx, instr);
+ break;
case nir_intrinsic_load_ubo:
visit_load_ubo(ctx, instr);
break;
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
break;
}
+ case nir_intrinsic_load_invocation_id: {
+ assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ if (ctx->options->chip_class >= GFX10)
+ bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id));
+ else
+ bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
+ break;
+ }
+ case nir_intrinsic_load_primitive_id: {
+ assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
+ break;
+ }
+ case nir_intrinsic_emit_vertex_with_counter: {
+ visit_emit_vertex_with_counter(ctx, instr);
+ break;
+ }
+ case nir_intrinsic_end_primitive_with_counter: {
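+      /* Like a vertex emit, but sends the GS_CUT message to end the current
+       * primitive on the given stream. */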
+ Temp gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand((8u << 16) | 16u));
+ unsigned stream = nir_intrinsic_stream_id(instr);
+ bld.sopp(aco_opcode::s_sendmsg, bld.m0(gs_wave_id), -1, sendmsg_gs(true, false, stream));
+ break;
+ }
+ case nir_intrinsic_set_vertex_count: {
+ /* unused, the HW keeps track of this for us */
+ break;
+ }
default:
fprintf(stderr, "Unimplemented intrinsic instr: ");
nir_print_instr(&instr->instr, stderr);
static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
{
int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
- uint64_t mask = ctx->vs_output.mask[slot];
+ uint64_t mask = ctx->vsgs_output.mask[slot];
if (!is_pos && !mask)
return;
if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
exp->enabled_mask = mask;
for (unsigned i = 0; i < 4; ++i) {
if (mask & (1 << i))
- exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
+ exp->operands[i] = Operand(ctx->vsgs_output.outputs[slot][i]);
else
exp->operands[i] = Operand(v1);
}
exp->enabled_mask = 0;
for (unsigned i = 0; i < 4; ++i)
exp->operands[i] = Operand(v1);
- if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
- exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
+ if (ctx->vsgs_output.mask[VARYING_SLOT_PSIZ]) {
+ exp->operands[0] = Operand(ctx->vsgs_output.outputs[VARYING_SLOT_PSIZ][0]);
exp->enabled_mask |= 0x1;
}
- if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
- exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
+ if (ctx->vsgs_output.mask[VARYING_SLOT_LAYER]) {
+ exp->operands[2] = Operand(ctx->vsgs_output.outputs[VARYING_SLOT_LAYER][0]);
exp->enabled_mask |= 0x4;
}
- if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
+ if (ctx->vsgs_output.mask[VARYING_SLOT_VIEWPORT]) {
if (ctx->options->chip_class < GFX9) {
- exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
+ exp->operands[3] = Operand(ctx->vsgs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
exp->enabled_mask |= 0x8;
} else {
Builder bld(ctx->program, ctx->block);
Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
- Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
+ Operand(ctx->vsgs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
if (exp->operands[2].isTemp())
out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
if (outinfo->export_prim_id) {
- ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
- ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id);
+ ctx->vsgs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
+ ctx->vsgs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id);
}
if (ctx->options->key.has_multiview_view_index) {
- ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
- ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
+ ctx->vsgs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
+ ctx->vsgs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
}
/* the order these position exports are created is important */
bool all_undef = true;
assert(ctx->stage == vertex_vs);
for (unsigned i = 0; i < num_comps; i++) {
- out[i] = ctx->vs_output.outputs[loc][start + i];
+ out[i] = ctx->vsgs_output.outputs[loc][start + i];
all_undef = all_undef && !out[i].id();
}
if (all_undef)
Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, count)};
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
for (int i = 0; i < count; ++i)
- vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u);
+ vec->operands[i] = (ctx->vsgs_output.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u);
vec->definitions[0] = Definition(write_data);
ctx->block->instructions.emplace_back(std::move(vec));
if_context ic;
if (shader_count >= 2) {
Builder bld(ctx.program, ctx.block);
- Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
+ Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | (i * 8u)));
Temp thread_id = emit_mbcnt(&ctx, bld.def(v1));
Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id);
if (i) {
Builder bld(ctx.program, ctx.block);
- bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
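+      /* The workgroup barrier makes sure the GS half of the merged shader only
+       * starts reading ESGS outputs from LDS after every ES invocation has
+       * written them. */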
+ assert(ctx.stage == vertex_geometry_gs);
+ bld.barrier(aco_opcode::p_memory_barrier_shared);
bld.sopp(aco_opcode::s_barrier);
}
nir_function_impl *func = nir_shader_get_entrypoint(nir);
visit_cf_list(&ctx, &func->body);
- if (ctx.program->info->so.num_outputs/*&& !ctx->is_gs_copy_shader */)
+ if (ctx.program->info->so.num_outputs && ctx.stage == vertex_vs)
emit_streamout(&ctx, 0);
- if (ctx.stage == vertex_vs)
+ if (ctx.stage == vertex_vs) {
create_vs_exports(&ctx);
+ } else if (nir->info.stage == MESA_SHADER_GEOMETRY && ctx.stage == vertex_geometry_gs) {
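+      /* Keep the outstanding GSVS ring stores ordered before the GS_DONE
+       * message that tells the geometry engine this wave is finished. */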
+ Builder bld(ctx.program, ctx.block);
+ Temp gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
+ bld.barrier(aco_opcode::p_memory_barrier_gs_data);
+ bld.sopp(aco_opcode::s_sendmsg, bld.m0(gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
+ }
if (shader_count >= 2) {
begin_divergent_if_else(&ctx, &ic);
namespace aco {
-struct vs_output_state {
+struct ge_output_state {
uint8_t mask[VARYING_SLOT_VAR31 + 1];
Temp outputs[VARYING_SLOT_VAR31 + 1][4];
};
Temp arg_temps[AC_MAX_ARGS];
- /* inputs common for merged stages */
- Temp merged_wave_info = Temp(0, s1);
-
/* FS inputs */
Temp persp_centroid, linear_centroid;
/* VS inputs */
bool needs_instance_id;
+   /* I/O slot masks gathered by get_io_masks() */
+ uint64_t input_masks[MESA_SHADER_COMPUTE];
+ uint64_t output_masks[MESA_SHADER_COMPUTE];
+
/* VS output information */
unsigned num_clip_distances;
unsigned num_cull_distances;
- vs_output_state vs_output;
+
+ /* VS or GS output information */
+ ge_output_state vsgs_output;
};
Temp get_arg(isel_context *ctx, struct ac_arg arg)
case nir_intrinsic_load_sample_id:
case nir_intrinsic_load_sample_mask_in:
case nir_intrinsic_load_input:
+ case nir_intrinsic_load_per_vertex_input:
case nir_intrinsic_load_vertex_id:
case nir_intrinsic_load_vertex_id_zero_base:
case nir_intrinsic_load_barycentric_sample:
case nir_intrinsic_shared_atomic_exchange:
case nir_intrinsic_shared_atomic_comp_swap:
case nir_intrinsic_load_scratch:
+ case nir_intrinsic_load_invocation_id:
+ case nir_intrinsic_load_primitive_id:
type = RegType::vgpr;
break;
case nir_intrinsic_shuffle:
}
nir_foreach_variable(variable, &nir->outputs)
{
- variable->data.driver_location = variable->data.location * 4;
+ if (ctx->stage == vertex_geometry_gs)
+ variable->data.driver_location = util_bitcount64(ctx->output_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4;
+ else
+ variable->data.driver_location = variable->data.location * 4;
}
- radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
+ if (ctx->stage == vertex_vs) {
+ radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
- memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
- sizeof(outinfo->vs_output_param_offset));
+      memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
+             sizeof(outinfo->vs_output_param_offset));
-   ctx->needs_instance_id = ctx->program->info->vs.needs_instance_id;
+      ctx->needs_instance_id = ctx->program->info->vs.needs_instance_id;
-   bool export_clip_dists = ctx->options->key.vs_common_out.export_clip_dists;
+      bool export_clip_dists = ctx->options->key.vs_common_out.export_clip_dists;
- outinfo->param_exports = 0;
- int pos_written = 0x1;
- if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
- pos_written |= 1 << 1;
+      outinfo->param_exports = 0;
+      int pos_written = 0x1;
+      if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
+         pos_written |= 1 << 1;
- nir_foreach_variable(variable, &nir->outputs)
- {
- int idx = variable->data.location;
- unsigned slots = variable->type->count_attribute_slots(false);
- if (variable->data.compact) {
- unsigned component_count = variable->data.location_frac + variable->type->length;
- slots = (component_count + 3) / 4;
- }
-
- if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
- ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
- for (unsigned i = 0; i < slots; i++) {
- if (outinfo->vs_output_param_offset[idx + i] == AC_EXP_PARAM_UNDEFINED)
- outinfo->vs_output_param_offset[idx + i] = outinfo->param_exports++;
+ uint64_t mask = ctx->output_masks[nir->info.stage];
+ while (mask) {
+ int idx = u_bit_scan64(&mask);
+ if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
+ ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
+ if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
+ outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
}
}
- }
- if (outinfo->writes_layer &&
- outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
- /* when ctx->options->key.has_multiview_view_index = true, the layer
- * variable isn't declared in NIR and it's isel's job to get the layer */
- outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
- }
+      if (outinfo->writes_layer &&
+          outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
+         /* when ctx->options->key.has_multiview_view_index = true, the layer
+          * variable isn't declared in NIR and it's isel's job to get the layer */
+         outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
+      }
- if (outinfo->export_prim_id) {
- assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
- outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
- }
+      if (outinfo->export_prim_id) {
+         assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
+         outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
+      }
- ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
- ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
+      ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
+      ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
- assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
+      assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
- if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
- pos_written |= 1 << 2;
- if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
- pos_written |= 1 << 3;
+      if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
+         pos_written |= 1 << 2;
+      if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
+         pos_written |= 1 << 3;
- outinfo->pos_exports = util_bitcount(pos_written);
+      outinfo->pos_exports = util_bitcount(pos_written);
+ } else if (ctx->stage == vertex_geometry_gs) {
+      /* TODO: radv_nir_shader_info_pass() already sets this, but to a larger
+       * value than necessary. Setting it more precisely would require
+       * improving radv_nir_shader_info_pass(), because gfx9_get_gs_info()
+       * uses esgs_itemsize and runs before compilation.
+       */
+ /* radv_es_output_info *outinfo = &ctx->program->info->vs.es_info;
+ outinfo->esgs_itemsize = util_bitcount64(ctx->output_masks[nir->info.stage]) * 16u; */
+ }
}
void
setup_vs_variables(ctx, nir);
break;
}
+ case MESA_SHADER_GEOMETRY: {
+ assert(ctx->stage == vertex_geometry_gs);
+ nir_foreach_variable(variable, &nir->inputs) {
+ variable->data.driver_location = util_bitcount64(ctx->input_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4;
+ }
+ nir_foreach_variable(variable, &nir->outputs) {
+ variable->data.driver_location = variable->data.location * 4;
+ }
+      ctx->program->info->gs.es_type = MESA_SHADER_VERTEX; /* tessellation shaders are not yet supported */
+ break;
+ }
default:
unreachable("Unhandled shader stage.");
}
}
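+/* Gather the per-stage input/output slot masks up front; merged stages (e.g.
+ * VS+GS on GFX9) use them to assign matching, compacted driver_locations on
+ * both sides of the ES->GS interface. */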
+void
+get_io_masks(isel_context *ctx, unsigned shader_count, struct nir_shader *const *shaders)
+{
+ for (unsigned i = 0; i < shader_count; i++) {
+ nir_shader *nir = shaders[i];
+ if (nir->info.stage == MESA_SHADER_COMPUTE)
+ continue;
+
+ uint64_t output_mask = 0;
+ nir_foreach_variable(variable, &nir->outputs) {
+ const glsl_type *type = variable->type;
+ if (nir_is_per_vertex_io(variable, nir->info.stage))
+ type = type->fields.array;
+ unsigned slots = type->count_attribute_slots(false);
+ if (variable->data.compact) {
+ unsigned component_count = variable->data.location_frac + type->length;
+ slots = (component_count + 3) / 4;
+ }
+ output_mask |= ((1ull << slots) - 1) << variable->data.location;
+ }
+
+ uint64_t input_mask = 0;
+ nir_foreach_variable(variable, &nir->inputs) {
+ const glsl_type *type = variable->type;
+ if (nir_is_per_vertex_io(variable, nir->info.stage))
+ type = type->fields.array;
+ unsigned slots = type->count_attribute_slots(false);
+ if (variable->data.compact) {
+ unsigned component_count = variable->data.location_frac + type->length;
+ slots = (component_count + 3) / 4;
+ }
+ input_mask |= ((1ull << slots) - 1) << variable->data.location;
+ }
+
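+      /* Also merge each interface into the adjacent stage, so that producer
+       * and consumer agree on the slot set even if only one of them declares
+       * a variable for it. */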
+ ctx->output_masks[nir->info.stage] |= output_mask;
+ if (i + 1 < shader_count)
+ ctx->input_masks[shaders[i + 1]->info.stage] |= output_mask;
+
+ ctx->input_masks[nir->info.stage] |= input_mask;
+ if (i)
+ ctx->output_masks[shaders[i - 1]->info.stage] |= input_mask;
+ }
+}
+
isel_context
setup_isel_context(Program* program,
unsigned shader_count,
unreachable("Shader stage not implemented");
}
}
+ bool gfx9_plus = args->options->chip_class >= GFX9;
+ bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10;
if (program->stage == sw_vs)
program->stage |= hw_vs;
else if (program->stage == sw_fs)
program->stage |= hw_fs;
else if (program->stage == sw_cs)
program->stage |= hw_cs;
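+   /* On GFX9+ the VS is merged into the GS, so a VS+GS pipeline runs on the
+    * hardware GS stage (the GFX10 NGG path is excluded here). */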
+ else if (program->stage == (sw_vs | sw_gs) && gfx9_plus && !ngg)
+ program->stage |= hw_gs;
else
unreachable("Shader stage not implemented");
ctx.options = args->options;
ctx.stage = program->stage;
+ get_io_masks(&ctx, shader_count, shaders);
+
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];