X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fbroadcom%2Fcompiler%2Fnir_to_vir.c;h=689414551e97928b7b05bd2db5a7f75c6a8e4215;hb=1ccd681109e80516430a3be489dca1be15316d50;hp=7acdbd621d1bee1c6c0e5d05575c334026b8cdc1;hpb=ba520b00c4e0c8b403a20dd6a8978110e2423940;p=mesa.git diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 7acdbd621d1..689414551e9 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -22,7 +22,8 @@ */ #include -#include "util/u_format.h" +#include "util/format/u_format.h" +#include "util/u_helpers.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/ralloc.h" @@ -192,17 +193,28 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, * need/can to do things slightly different, like not loading the * amount to add/sub, as that is implicit. */ - bool atomic_add_replaced = ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add || - instr->intrinsic == nir_intrinsic_shared_atomic_add) && - (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC || - tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC)); + bool atomic_add_replaced = + ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add || + instr->intrinsic == nir_intrinsic_shared_atomic_add) && + (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC || + tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC)); + bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo || instr->intrinsic == nir_intrinsic_store_scratch || instr->intrinsic == nir_intrinsic_store_shared); + + bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform || + instr->intrinsic == nir_intrinsic_load_ubo || + instr->intrinsic == nir_intrinsic_load_ssbo || + instr->intrinsic == nir_intrinsic_load_scratch || + instr->intrinsic == nir_intrinsic_load_shared); + + if (!is_load) + c->tmu_dirty_rcl = true; + bool has_index = !is_shared_or_scratch; int offset_src; - int tmu_writes = 1; /* address */ if (instr->intrinsic == nir_intrinsic_load_uniform) { offset_src = 0; } else if (instr->intrinsic == nir_intrinsic_load_ssbo || @@ -213,25 +225,8 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, offset_src = 0 + has_index; } else if (is_store) { offset_src = 1 + has_index; - for (int i = 0; i < instr->num_components; i++) { - vir_MOV_dest(c, - vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), - ntq_get_src(c, instr->src[0], i)); - tmu_writes++; - } } else { offset_src = 0 + has_index; - vir_MOV_dest(c, - vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), - ntq_get_src(c, instr->src[1 + has_index], 0)); - tmu_writes++; - if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { - vir_MOV_dest(c, - vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), - ntq_get_src(c, instr->src[2 + has_index], - 0)); - tmu_writes++; - } } bool dynamic_src = !nir_src_is_const(instr->src[offset_src]); @@ -239,25 +234,20 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, if (!dynamic_src) const_offset = nir_src_as_uint(instr->src[offset_src]); - /* Make sure we won't exceed the 16-entry TMU fifo if each thread is - * storing at the same time. 
- */ - while (tmu_writes > 16 / c->threads) - c->threads /= 2; - - struct qreg offset; + struct qreg base_offset; if (instr->intrinsic == nir_intrinsic_load_uniform) { const_offset += nir_intrinsic_base(instr); - offset = vir_uniform(c, QUNIFORM_UBO_ADDR, - v3d_unit_data_create(0, const_offset)); + base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR, + v3d_unit_data_create(0, const_offset)); const_offset = 0; } else if (instr->intrinsic == nir_intrinsic_load_ubo) { uint32_t index = nir_src_as_uint(instr->src[0]) + 1; /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by * 1 (0 is gallium's constant buffer 0). */ - offset = vir_uniform(c, QUNIFORM_UBO_ADDR, - v3d_unit_data_create(index, const_offset)); + base_offset = + vir_uniform(c, QUNIFORM_UBO_ADDR, + v3d_unit_data_create(index, const_offset)); const_offset = 0; } else if (is_shared_or_scratch) { /* Shared and scratch variables have no buffer index, and all @@ -266,81 +256,152 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, */ if (instr->intrinsic == nir_intrinsic_load_scratch || instr->intrinsic == nir_intrinsic_store_scratch) { - offset = c->spill_base; + base_offset = c->spill_base; } else { - offset = c->cs_shared_offset; + base_offset = c->cs_shared_offset; const_offset += nir_intrinsic_base(instr); } } else { - offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, - nir_src_as_uint(instr->src[is_store ? - 1 : 0])); + base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, + nir_src_as_uint(instr->src[is_store ? + 1 : 0])); } - /* The spec says that for atomics, the TYPE field is ignored, but that - * doesn't seem to be the case for CMPXCHG. Just use the number of - * tmud writes we did to decide the type (or choose "32bit" for atomic - * reads, which has been fine). - */ - int num_components; - if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) - num_components = 2; - else - num_components = instr->num_components; + struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); + unsigned writemask = is_store ? nir_intrinsic_write_mask(instr) : 0; + uint32_t base_const_offset = const_offset; + int first_component = -1; + int last_component = -1; + do { + int tmu_writes = 1; /* address */ - uint32_t config = (0xffffff00 | - tmu_op << 3| - GENERAL_TMU_LOOKUP_PER_PIXEL); - if (num_components == 1) { - config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; - } else { - config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2; - } + if (is_store) { + /* Find the first set of consecutive components that + * are enabled in the writemask and emit the TMUD + * instructions for them. + */ + first_component = ffs(writemask) - 1; + last_component = first_component; + while (writemask & BITFIELD_BIT(last_component + 1)) + last_component++; + + assert(first_component >= 0 && + first_component <= last_component && + last_component < instr->num_components); + + struct qreg tmud = vir_reg(QFILE_MAGIC, + V3D_QPU_WADDR_TMUD); + for (int i = first_component; i <= last_component; i++) { + struct qreg data = + ntq_get_src(c, instr->src[0], i); + vir_MOV_dest(c, tmud, data); + tmu_writes++; + } - if (vir_in_nonuniform_control_flow(c)) { - vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), - V3D_QPU_PF_PUSHZ); - } + /* Update the offset for the TMU write based on the + * the first component we are writing. 
+ */ + const_offset = base_const_offset + first_component * 4; + + /* Clear these components from the writemask */ + uint32_t written_mask = + BITFIELD_RANGE(first_component, tmu_writes - 1); + writemask &= ~written_mask; + } else if (!is_load && !atomic_add_replaced) { + struct qreg data = + ntq_get_src(c, instr->src[1 + has_index], 0); + vir_MOV_dest(c, tmud, data); + tmu_writes++; + if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { + data = ntq_get_src(c, instr->src[2 + has_index], + 0); + vir_MOV_dest(c, tmud, data); + tmu_writes++; + } + } - struct qreg tmua; - if (config == ~0) - tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA); - else - tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); + /* Make sure we won't exceed the 16-entry TMU fifo if each + * thread is storing at the same time. + */ + while (tmu_writes > 16 / c->threads) + c->threads /= 2; - struct qinst *tmu; - if (dynamic_src) { - if (const_offset != 0) { - offset = vir_ADD(c, offset, - vir_uniform_ui(c, const_offset)); + /* The spec says that for atomics, the TYPE field is ignored, + * but that doesn't seem to be the case for CMPXCHG. Just use + * the number of tmud writes we did to decide the type (or + * choose "32bit" for atomic reads, which has been fine). + */ + uint32_t num_components; + if (is_load || atomic_add_replaced) { + num_components = instr->num_components; + } else { + assert(tmu_writes > 1); + num_components = tmu_writes - 1; } - tmu = vir_ADD_dest(c, tmua, offset, - ntq_get_src(c, instr->src[offset_src], 0)); - } else { - if (const_offset != 0) { - tmu = vir_ADD_dest(c, tmua, offset, - vir_uniform_ui(c, const_offset)); + + uint32_t perquad = is_load + ? GENERAL_TMU_LOOKUP_PER_QUAD + : GENERAL_TMU_LOOKUP_PER_PIXEL; + uint32_t config = (0xffffff00 | + tmu_op << 3| + perquad); + if (num_components == 1) { + config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; } else { - tmu = vir_MOV_dest(c, tmua, offset); + config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + + num_components - 2; } - } - if (config != ~0) { - tmu->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, - config); - } + if (vir_in_nonuniform_control_flow(c)) { + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + } - if (vir_in_nonuniform_control_flow(c)) - vir_set_cond(tmu, V3D_QPU_COND_IFA); + struct qreg tmua; + if (config == ~0) + tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA); + else + tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); + + struct qinst *tmu; + if (dynamic_src) { + struct qreg offset = base_offset; + if (const_offset != 0) { + offset = vir_ADD(c, offset, + vir_uniform_ui(c, const_offset)); + } + struct qreg data = + ntq_get_src(c, instr->src[offset_src], 0); + tmu = vir_ADD_dest(c, tmua, offset, data); + } else { + if (const_offset != 0) { + tmu = vir_ADD_dest(c, tmua, base_offset, + vir_uniform_ui(c, const_offset)); + } else { + tmu = vir_MOV_dest(c, tmua, base_offset); + } + } - vir_emit_thrsw(c); + if (config != ~0) { + tmu->uniform = + vir_get_uniform_index(c, QUNIFORM_CONSTANT, + config); + } - /* Read the result, or wait for the TMU op to complete. */ - for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) - ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c))); + if (vir_in_nonuniform_control_flow(c)) + vir_set_cond(tmu, V3D_QPU_COND_IFA); - if (nir_intrinsic_dest_components(instr) == 0) - vir_TMUWT(c); + vir_emit_thrsw(c); + + /* Read the result, or wait for the TMU op to complete. 
*/ + for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { + ntq_store_dest(c, &instr->dest, i, + vir_MOV(c, vir_LDTMU(c))); + } + + if (nir_intrinsic_dest_components(instr) == 0) + vir_TMUWT(c); + } while (is_store && writemask != 0); } static struct qreg * @@ -352,6 +413,20 @@ ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) return qregs; } +static bool +is_ld_signal(const struct v3d_qpu_sig *sig) +{ + return (sig->ldunif || + sig->ldunifa || + sig->ldunifrf || + sig->ldunifarf || + sig->ldtmu || + sig->ldvary || + sig->ldvpm || + sig->ldtlb || + sig->ldtlbu); +} + /** * This function is responsible for getting VIR results into the associated * storage for a NIR instruction. @@ -372,7 +447,7 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, struct qreg result) { struct qinst *last_inst = NULL; - if (!list_empty(&c->cur_block->instructions)) + if (!list_is_empty(&c->cur_block->instructions)) last_inst = (struct qinst *)c->cur_block->instructions.prev; assert((result.file == QFILE_TEMP && @@ -399,11 +474,12 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; - /* Insert a MOV if the source wasn't an SSA def in the - * previous instruction. + /* If the previous instruction can't be predicated for + * the store into the nir_register, then emit a MOV + * that can be. */ - if ((vir_in_nonuniform_control_flow(c) && - c->defs[last_inst->dst.index]->qpu.sig.ldunif)) { + if (vir_in_nonuniform_control_flow(c) && + is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig)) { result = vir_MOV(c, result); last_inst = c->defs[result.index]; } @@ -664,6 +740,19 @@ emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var, } } +static void +emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var, + int array_index) +{ + /* Compact variables are scalar arrays where each set of 4 elements + * consumes a single location. 
+ */ + int loc_offset = array_index / 4; + int chan = var->data.location_frac + array_index % 4; + c->inputs[(attr + loc_offset) * 4 + chan] = + emit_fragment_varying(c, var, chan, loc_offset); +} + static void add_output(struct v3d_compile *c, uint32_t decl_offset, @@ -944,7 +1033,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_sge: case nir_op_slt: { enum v3d_qpu_cond cond; - MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond); + ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond); assert(ok); result = vir_MOV(c, vir_SEL(c, cond, vir_uniform_f(c, 1.0), @@ -965,7 +1054,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_ilt32: case nir_op_ult32: { enum v3d_qpu_cond cond; - MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond); + ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond); assert(ok); result = vir_MOV(c, vir_SEL(c, cond, vir_uniform_ui(c, ~0), @@ -1295,11 +1384,20 @@ emit_frag_end(struct v3d_compile *c) vir_emit_tlb_color_write(c, rt); } +static inline void +vir_VPM_WRITE_indirect(struct v3d_compile *c, + struct qreg val, + struct qreg vpm_index) +{ + assert(c->devinfo->ver >= 40); + vir_STVPMV(c, vpm_index, val); +} + static void vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { if (c->devinfo->ver >= 40) { - vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val); + vir_VPM_WRITE_indirect(c, val, vir_uniform_ui(c, vpm_index)); } else { /* XXX: v3d33_vir_vpm_write_setup(c); */ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); @@ -1315,6 +1413,15 @@ emit_vert_end(struct v3d_compile *c) vir_VPMWT(c); } +static void +emit_geom_end(struct v3d_compile *c) +{ + /* GFXH-1684: VPM writes need to be complete by the end of the shader. + */ + if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + vir_VPMWT(c); +} + void v3d_optimize_nir(struct nir_shader *s) { @@ -1328,7 +1435,7 @@ v3d_optimize_nir(struct nir_shader *s) progress = false; NIR_PASS_V(s, nir_lower_vars_to_ssa); - NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar); NIR_PASS(progress, s, nir_copy_prop); NIR_PASS(progress, s, nir_opt_remove_phis); @@ -1360,7 +1467,7 @@ v3d_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_undef); } while (progress); - NIR_PASS(progress, s, nir_opt_move_load_ubo); + NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo); } static int @@ -1369,6 +1476,9 @@ driver_location_compare(const void *in_a, const void *in_b) const nir_variable *const *a = in_a; const nir_variable *const *b = in_b; + if ((*a)->data.driver_location == (*b)->data.driver_location) + return (*a)->data.location_frac - (*b)->data.location_frac; + return (*a)->data.driver_location - (*b)->data.driver_location; } @@ -1402,7 +1512,7 @@ ntq_emit_vpm_read(struct v3d_compile *c, } static void -ntq_setup_vpm_inputs(struct v3d_compile *c) +ntq_setup_vs_inputs(struct v3d_compile *c) { /* Figure out how many components of each vertex attribute the shader * uses. Each variable should have been split to individual @@ -1410,7 +1520,7 @@ ntq_setup_vpm_inputs(struct v3d_compile *c) * from the start of the attribute to the number of components we * declare we need in c->vattr_sizes[]. */ - nir_foreach_variable(var, &c->s->inputs) { + nir_foreach_shader_in_variable(var, c->s) { /* No VS attribute array support. 
*/ assert(MAX2(glsl_get_length(var->type), 1) == 1); @@ -1472,48 +1582,83 @@ ntq_setup_vpm_inputs(struct v3d_compile *c) } } -static bool -var_needs_point_coord(struct v3d_compile *c, nir_variable *var) -{ - return (var->data.location == VARYING_SLOT_PNTC || - (var->data.location >= VARYING_SLOT_VAR0 && - (c->fs_key->point_sprite_mask & - (1 << (var->data.location - VARYING_SLOT_VAR0))))); -} - static bool program_reads_point_coord(struct v3d_compile *c) { - nir_foreach_variable(var, &c->s->inputs) { - if (var_needs_point_coord(c, var)) + nir_foreach_shader_in_variable(var, c->s) { + if (util_varying_is_point_coord(var->data.location, + c->fs_key->point_sprite_mask)) { return true; + } } return false; } static void -ntq_setup_fs_inputs(struct v3d_compile *c) +get_sorted_input_variables(struct v3d_compile *c, + unsigned *num_entries, + nir_variable ***vars) { - unsigned num_entries = 0; - unsigned num_components = 0; - nir_foreach_variable(var, &c->s->inputs) { - num_entries++; - num_components += glsl_get_components(var->type); - } + *num_entries = 0; + nir_foreach_shader_in_variable(var, c->s) + (*num_entries)++; - nir_variable *vars[num_entries]; + *vars = ralloc_array(c, nir_variable *, *num_entries); unsigned i = 0; - nir_foreach_variable(var, &c->s->inputs) - vars[i++] = var; + nir_foreach_shader_in_variable(var, c->s) + (*vars)[i++] = var; /* Sort the variables so that we emit the input setup in * driver_location order. This is required for VPM reads, whose data * is fetched into the VPM in driver_location (TGSI register index) * order. */ - qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); + qsort(*vars, *num_entries, sizeof(**vars), driver_location_compare); +} + +static void +ntq_setup_gs_inputs(struct v3d_compile *c) +{ + nir_variable **vars; + unsigned num_entries; + get_sorted_input_variables(c, &num_entries, &vars); + + for (unsigned i = 0; i < num_entries; i++) { + nir_variable *var = vars[i]; + + /* All GS inputs are arrays with as many entries as vertices + * in the input primitive, but here we only care about the + * per-vertex input type. 
+ */ + const struct glsl_type *type = glsl_without_array(var->type); + unsigned array_len = MAX2(glsl_get_length(type), 1); + unsigned loc = var->data.driver_location; + + resize_qreg_array(c, &c->inputs, &c->inputs_array_size, + (loc + array_len) * 4); + + for (unsigned j = 0; j < array_len; j++) { + unsigned num_elements = glsl_get_vector_elements(type); + for (unsigned k = 0; k < num_elements; k++) { + unsigned chan = var->data.location_frac + k; + unsigned input_idx = c->num_inputs++; + struct v3d_varying_slot slot = + v3d_slot_from_slot_and_component(var->data.location + j, chan); + c->input_slots[input_idx] = slot; + } + } + } +} + + +static void +ntq_setup_fs_inputs(struct v3d_compile *c) +{ + nir_variable **vars; + unsigned num_entries; + get_sorted_input_variables(c, &num_entries, &vars); for (unsigned i = 0; i < num_entries; i++) { nir_variable *var = vars[i]; @@ -1525,9 +1670,13 @@ ntq_setup_fs_inputs(struct v3d_compile *c) if (var->data.location == VARYING_SLOT_POS) { emit_fragcoord_input(c, loc); - } else if (var_needs_point_coord(c, var)) { + } else if (util_varying_is_point_coord(var->data.location, + c->fs_key->point_sprite_mask)) { c->inputs[loc * 4 + 0] = c->point_x; c->inputs[loc * 4 + 1] = c->point_y; + } else if (var->data.compact) { + for (int j = 0; j < array_len; j++) + emit_compact_fragment_input(c, loc, var, j); } else { for (int j = 0; j < array_len; j++) emit_fragment_input(c, loc + j, var, j); @@ -1541,7 +1690,7 @@ ntq_setup_outputs(struct v3d_compile *c) if (c->s->info.stage != MESA_SHADER_FRAGMENT) return; - nir_foreach_variable(var, &c->s->outputs) { + nir_foreach_shader_out_variable(var, c->s) { unsigned array_len = MAX2(glsl_get_length(var->type), 1); unsigned loc = var->data.driver_location * 4; @@ -1628,17 +1777,19 @@ ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr) static void ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) { - assert(instr->intrinsic == nir_intrinsic_image_deref_size); - nir_variable *var = nir_intrinsic_get_var(instr, 0); - unsigned image_index = var->data.driver_location; - const struct glsl_type *sampler_type = glsl_without_array(var->type); - bool is_array = glsl_sampler_type_is_array(sampler_type); + unsigned image_index = nir_src_as_uint(instr->src[0]); + bool is_array = nir_intrinsic_image_array(instr); + + assert(nir_src_as_uint(instr->src[1]) == 0); ntq_store_dest(c, &instr->dest, 0, vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index)); if (instr->num_components > 1) { ntq_store_dest(c, &instr->dest, 1, - vir_uniform(c, QUNIFORM_IMAGE_HEIGHT, + vir_uniform(c, + instr->num_components == 2 && is_array ? 
+ QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_HEIGHT, image_index)); } if (instr->num_components > 2) { @@ -1847,6 +1998,79 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) } } +static void +ntq_emit_per_sample_color_write(struct v3d_compile *c, + nir_intrinsic_instr *instr) +{ + assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d); + + unsigned rt = nir_src_as_uint(instr->src[1]); + assert(rt < V3D_MAX_DRAW_BUFFERS); + + unsigned sample_idx = nir_intrinsic_base(instr); + assert(sample_idx < V3D_MAX_SAMPLES); + + unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4; + for (int i = 0; i < instr->num_components; i++) { + c->sample_colors[offset + i] = + vir_MOV(c, ntq_get_src(c, instr->src[0], i)); + } +} + +static void +ntq_emit_color_write(struct v3d_compile *c, + nir_intrinsic_instr *instr) +{ + unsigned offset = (nir_intrinsic_base(instr) + + nir_src_as_uint(instr->src[1])) * 4 + + nir_intrinsic_component(instr); + for (int i = 0; i < instr->num_components; i++) { + c->outputs[offset + i] = + vir_MOV(c, ntq_get_src(c, instr->src[0], i)); + } +} + +static void +emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + assert(instr->num_components == 1); + + struct qreg offset = ntq_get_src(c, instr->src[1], 0); + + uint32_t base_offset = nir_intrinsic_base(instr); + + if (base_offset) + offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset); + + /* Usually, for VS or FS, we only emit outputs once at program end so + * our VPM writes are never in non-uniform control flow, but this + * is not true for GS, where we are emitting multiple vertices. + */ + if (vir_in_nonuniform_control_flow(c)) { + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + } + + struct qreg val = ntq_get_src(c, instr->src[0], 0); + + /* The offset isn’t necessarily dynamically uniform for a geometry + * shader. This can happen if the shader sometimes doesn’t emit one of + * the vertices. In that case subsequent vertices will be written to + * different offsets in the VPM and we need to use the scatter write + * instruction to have a different offset for each lane. + */ + if (nir_src_is_dynamically_uniform(instr->src[1])) + vir_VPM_WRITE_indirect(c, val, offset); + else + vir_STVPMD(c, offset, val); + + if (vir_in_nonuniform_control_flow(c)) { + struct qinst *last_inst = + (struct qinst *)c->cur_block->instructions.prev; + vir_set_cond(last_inst, V3D_QPU_COND_IFA); + } +} + static void ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr) { @@ -1855,19 +2079,16 @@ ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr) * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR. 
*/ if (c->s->info.stage == MESA_SHADER_FRAGMENT) { - unsigned offset = ((nir_intrinsic_base(instr) + - nir_src_as_uint(instr->src[1])) * 4 + - nir_intrinsic_component(instr)); - for (int i = 0; i < instr->num_components; i++) { - c->outputs[offset + i] = - vir_MOV(c, ntq_get_src(c, instr->src[0], i)); - } + ntq_emit_color_write(c, instr); + } else if (c->s->info.stage == MESA_SHADER_GEOMETRY) { + emit_store_output_gs(c, instr); } else { - assert(instr->num_components == 1); + assert(c->s->info.stage == MESA_SHADER_VERTEX); + assert(instr->num_components == 1); - vir_VPM_WRITE(c, - ntq_get_src(c, instr->src[0], 0), - nir_intrinsic_base(instr)); + vir_VPM_WRITE(c, + ntq_get_src(c, instr->src[0], 0), + nir_intrinsic_base(instr)); } } @@ -1915,16 +2136,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_tmu_general(c, instr, true); break; - case nir_intrinsic_image_deref_load: - case nir_intrinsic_image_deref_store: - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_comp_swap: v3d40_vir_emit_image_load_store(c, instr); break; @@ -1935,7 +2158,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_user_clip_plane: - for (int i = 0; i < instr->num_components; i++) { + for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { ntq_store_dest(c, &instr->dest, i, vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, nir_intrinsic_ucp_id(instr) * @@ -1968,6 +2191,20 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_uniform(c, QUNIFORM_ALPHA_REF, 0)); break; + case nir_intrinsic_load_line_coord: + ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x)); + break; + + case nir_intrinsic_load_line_width: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); + break; + + case nir_intrinsic_load_aa_line_width: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); + break; + case nir_intrinsic_load_sample_mask_in: ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); break; @@ -2006,11 +2243,15 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_load_input(c, instr); break; - case nir_intrinsic_store_output: + case nir_intrinsic_store_tlb_sample_color_v3d: + ntq_emit_per_sample_color_write(c, instr); + break; + + case nir_intrinsic_store_output: ntq_emit_store_output(c, instr); break; - case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_size: ntq_emit_image_size(c, instr); break; @@ -2048,10 +2289,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) } case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: case 
nir_intrinsic_memory_barrier_shared: + case nir_intrinsic_memory_barrier_tcs_patch: case nir_intrinsic_group_memory_barrier: /* We don't do any instruction scheduling of these NIR * instructions between each other, so we just need to make @@ -2062,7 +2303,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) */ break; - case nir_intrinsic_barrier: + case nir_intrinsic_control_barrier: /* Emit a TSY op to get all invocations in the workgroup * (actually supergroup) to block until the last invocation * reaches the TSY op. @@ -2119,6 +2360,47 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); break; + case nir_intrinsic_load_per_vertex_input: { + /* col: vertex index, row = varying index */ + struct qreg col = ntq_get_src(c, instr->src[0], 0); + uint32_t row_idx = nir_intrinsic_base(instr) * 4 + + nir_intrinsic_component(instr); + for (int i = 0; i < instr->num_components; i++) { + struct qreg row = vir_uniform_ui(c, row_idx++); + ntq_store_dest(c, &instr->dest, i, + vir_LDVPMG_IN(c, row, col)); + } + break; + } + + case nir_intrinsic_emit_vertex: + case nir_intrinsic_end_primitive: + unreachable("Should have been lowered in v3d_nir_lower_io"); + break; + + case nir_intrinsic_load_primitive_id: { + /* gl_PrimitiveIdIn is written by the GBG in the first word of + * VPM output header. According to docs, we should read this + * using ldvpm(v,d)_in (See Table 71). + */ + ntq_store_dest(c, &instr->dest, 0, + vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); + break; + } + + case nir_intrinsic_load_invocation_id: + ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); + break; + + case nir_intrinsic_load_fb_layers_v3d: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); + break; + + case nir_intrinsic_load_sample_id: + ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c)); + break; + default: fprintf(stderr, "Unknown intrinsic: "); nir_print_instr(&instr->instr, stderr); @@ -2314,6 +2596,12 @@ ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump) case nir_jump_return: unreachable("All returns shouold be lowered\n"); + break; + + case nir_jump_goto: + case nir_jump_goto_if: + unreachable("not supported\n"); + break; } } @@ -2321,10 +2609,6 @@ static void ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) { switch (instr->type) { - case nir_instr_type_deref: - /* ignored, will be walked by the intrinsic using it. 
*/ - break; - case nir_instr_type_alu: ntq_emit_alu(c, nir_instr_as_alu(instr)); break; @@ -2486,7 +2770,10 @@ nir_to_vir(struct v3d_compile *c) c->point_x = emit_fragment_varying(c, NULL, 0, 0); c->point_y = emit_fragment_varying(c, NULL, 0, 0); c->uses_implicit_point_line_varyings = true; - } else if (c->fs_key->is_lines && c->devinfo->ver < 40) { + } else if (c->fs_key->is_lines && + (c->devinfo->ver < 40 || + (c->s->info.system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_LINE_COORD)))) { c->line_x = emit_fragment_varying(c, NULL, 0, 0); c->uses_implicit_point_line_varyings = true; } @@ -2541,10 +2828,21 @@ nir_to_vir(struct v3d_compile *c) c->spill_size += V3D_CHANNELS * c->s->scratch_size; } - if (c->s->info.stage == MESA_SHADER_FRAGMENT) + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: + ntq_setup_vs_inputs(c); + break; + case MESA_SHADER_GEOMETRY: + ntq_setup_gs_inputs(c); + break; + case MESA_SHADER_FRAGMENT: ntq_setup_fs_inputs(c); - else - ntq_setup_vpm_inputs(c); + break; + case MESA_SHADER_COMPUTE: + break; + default: + unreachable("unsupported shader stage"); + } ntq_setup_outputs(c); @@ -2588,6 +2886,7 @@ const nir_shader_compiler_options v3d_nir_options = { .lower_mul_high = true, .lower_wpos_pntc = true, .lower_rotate = true, + .lower_to_scalar = true, }; /** @@ -2689,6 +2988,9 @@ v3d_nir_to_vir(struct v3d_compile *c) case MESA_SHADER_FRAGMENT: emit_frag_end(c); break; + case MESA_SHADER_GEOMETRY: + emit_geom_end(c); + break; case MESA_SHADER_VERTEX: emit_vert_end(c); break; @@ -2744,10 +3046,15 @@ v3d_nir_to_vir(struct v3d_compile *c) break; if (c->threads == min_threads) { - fprintf(stderr, "Failed to register allocate at %d threads:\n", - c->threads); - vir_dump(c); - c->failed = true; + if (c->fallback_scheduler) { + fprintf(stderr, + "Failed to register allocate at %d " + "threads:\n", + c->threads); + vir_dump(c); + } + c->compilation_result = + V3D_COMPILATION_FAILED_REGISTER_ALLOCATION; return; }
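
The core of this change is the new do/while loop in ntq_emit_tmu_general(): instead of emitting a single TMU configuration for the whole vector, stores are now split into runs of consecutive enabled components taken from nir_intrinsic_write_mask(), with the byte offset and the GENERAL_TMU_LOOKUP_TYPE field recomputed for each run. Below is a minimal standalone sketch of that writemask walk, assuming plain C outside the driver: the example writemask, main() and printf() are illustrative stand-ins for the real TMUD/TMUAU register writes, and plain shifts are used where the patch uses Mesa's BITFIELD_BIT()/BITFIELD_RANGE() helpers.

/* Standalone illustration (not Mesa API): split a store writemask into
 * runs of consecutive components, the way the new do/while loop in
 * ntq_emit_tmu_general() does.  Each run maps to one TMU transaction:
 * its data components would be written to TMUD, its byte offset is the
 * base offset plus 4 bytes per skipped component, and its component
 * count selects the TMU lookup type.
 */
#include <stdint.h>
#include <stdio.h>
#include <strings.h>        /* ffs() */

int
main(void)
{
        /* Example: a vec4 store where only .x, .z and .w are enabled. */
        uint32_t writemask = 0xd;       /* 0b1101 */
        uint32_t base_const_offset = 0; /* byte offset of component 0 */

        while (writemask != 0) {
                /* First enabled component of this run. */
                int first = ffs(writemask) - 1;

                /* Grow the run while the next component is also enabled. */
                int last = first;
                while (writemask & (1u << (last + 1)))
                        last++;

                int num_components = last - first + 1;
                uint32_t const_offset = base_const_offset + first * 4;

                printf("TMU write: components %d..%d (%d value(s)) "
                       "at byte offset %u\n",
                       first, last, num_components, const_offset);

                /* Retire this run from the writemask. */
                writemask &= ~(((1u << num_components) - 1) << first);
        }

        return 0;
}

Splitting by writemask means disabled components are never written at all, and each transaction can use the smallest vector lookup type that covers its run, at the cost of one additional TMU lookup per gap in the mask.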