X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fbroadcom%2Fcompiler%2Fnir_to_vir.c;h=6c079bda6af9df368040473b357fe7f63d729935;hb=8afab607ac37871771cd75ac9dfdaea3bea65d25;hp=3857f96a9bd8830717b586557d0df15d4d11050d;hpb=3d65d2a4883bcf0cdc2eb3a2eeafda1d3c784b9b;p=mesa.git diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 3857f96a9bd..6c079bda6af 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -22,7 +22,7 @@ */ #include -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/ralloc.h" @@ -192,10 +192,11 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, * need/can to do things slightly different, like not loading the * amount to add/sub, as that is implicit. */ - bool atomic_add_replaced = ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add || - instr->intrinsic == nir_intrinsic_shared_atomic_add) && - (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC || - tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC)); + bool atomic_add_replaced = + ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add || + instr->intrinsic == nir_intrinsic_shared_atomic_add) && + (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC || + tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC)); bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo || instr->intrinsic == nir_intrinsic_store_scratch || @@ -207,6 +208,9 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, instr->intrinsic == nir_intrinsic_load_scratch || instr->intrinsic == nir_intrinsic_load_shared); + if (!is_load) + c->tmu_dirty_rcl = true; + bool has_index = !is_shared_or_scratch; int offset_src; @@ -229,19 +233,20 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, if (!dynamic_src) const_offset = nir_src_as_uint(instr->src[offset_src]); - struct qreg offset; + struct qreg base_offset; if (instr->intrinsic == nir_intrinsic_load_uniform) { const_offset += nir_intrinsic_base(instr); - offset = vir_uniform(c, QUNIFORM_UBO_ADDR, - v3d_unit_data_create(0, const_offset)); + base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR, + v3d_unit_data_create(0, const_offset)); const_offset = 0; } else if (instr->intrinsic == nir_intrinsic_load_ubo) { uint32_t index = nir_src_as_uint(instr->src[0]) + 1; /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by * 1 (0 is gallium's constant buffer 0). */ - offset = vir_uniform(c, QUNIFORM_UBO_ADDR, - v3d_unit_data_create(index, const_offset)); + base_offset = + vir_uniform(c, QUNIFORM_UBO_ADDR, + v3d_unit_data_create(index, const_offset)); const_offset = 0; } else if (is_shared_or_scratch) { /* Shared and scratch variables have no buffer index, and all @@ -250,109 +255,149 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, */ if (instr->intrinsic == nir_intrinsic_load_scratch || instr->intrinsic == nir_intrinsic_store_scratch) { - offset = c->spill_base; + base_offset = c->spill_base; } else { - offset = c->cs_shared_offset; + base_offset = c->cs_shared_offset; const_offset += nir_intrinsic_base(instr); } } else { - offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, - nir_src_as_uint(instr->src[is_store ? - 1 : 0])); + base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, + nir_src_as_uint(instr->src[is_store ? 
+ 1 : 0])); } - int tmu_writes = 1; /* address */ - if (is_store) { - for (int i = 0; i < instr->num_components; i++) { - vir_MOV_dest(c, - vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), - ntq_get_src(c, instr->src[0], i)); - tmu_writes++; - } - } else if (!is_load && !atomic_add_replaced) { - vir_MOV_dest(c, - vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), - ntq_get_src(c, instr->src[1 + has_index], 0)); - tmu_writes++; - if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { - vir_MOV_dest(c, - vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), - ntq_get_src(c, instr->src[2 + has_index], - 0)); - tmu_writes++; - } - } + struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); + unsigned writemask = is_store ? nir_intrinsic_write_mask(instr) : 0; + uint32_t base_const_offset = const_offset; + int first_component = -1; + int last_component = -1; + do { + int tmu_writes = 1; /* address */ - /* Make sure we won't exceed the 16-entry TMU fifo if each thread is - * storing at the same time. - */ - while (tmu_writes > 16 / c->threads) - c->threads /= 2; + if (is_store) { + /* Find the first set of consecutive components that + * are enabled in the writemask and emit the TMUD + * instructions for them. + */ + first_component = ffs(writemask) - 1; + last_component = first_component; + while (writemask & BITFIELD_BIT(last_component + 1)) + last_component++; + + assert(first_component >= 0 && + first_component <= last_component && + last_component < instr->num_components); + + struct qreg tmud = vir_reg(QFILE_MAGIC, + V3D_QPU_WADDR_TMUD); + for (int i = first_component; i <= last_component; i++) { + struct qreg data = + ntq_get_src(c, instr->src[0], i); + vir_MOV_dest(c, tmud, data); + tmu_writes++; + } - /* The spec says that for atomics, the TYPE field is ignored, but that - * doesn't seem to be the case for CMPXCHG. Just use the number of - * tmud writes we did to decide the type (or choose "32bit" for atomic - * reads, which has been fine). - */ - int num_components; - if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) - num_components = 2; - else - num_components = instr->num_components; + /* Update the offset for the TMU write based on the + * the first component we are writing. + */ + const_offset = base_const_offset + first_component * 4; + + /* Clear these components from the writemask */ + uint32_t written_mask = + BITFIELD_RANGE(first_component, tmu_writes - 1); + writemask &= ~written_mask; + } else if (!is_load && !atomic_add_replaced) { + struct qreg data = + ntq_get_src(c, instr->src[1 + has_index], 0); + vir_MOV_dest(c, tmud, data); + tmu_writes++; + if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { + data = ntq_get_src(c, instr->src[2 + has_index], + 0); + vir_MOV_dest(c, tmud, data); + tmu_writes++; + } + } - uint32_t config = (0xffffff00 | - tmu_op << 3| - GENERAL_TMU_LOOKUP_PER_PIXEL); - if (num_components == 1) { - config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; - } else { - config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2; - } + /* Make sure we won't exceed the 16-entry TMU fifo if each + * thread is storing at the same time. + */ + while (tmu_writes > 16 / c->threads) + c->threads /= 2; - if (vir_in_nonuniform_control_flow(c)) { - vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), - V3D_QPU_PF_PUSHZ); - } + /* The spec says that for atomics, the TYPE field is ignored, + * but that doesn't seem to be the case for CMPXCHG. Just use + * the number of tmud writes we did to decide the type (or + * choose "32bit" for atomic reads, which has been fine). 
+ */ + uint32_t num_components; + if (is_load || atomic_add_replaced) { + num_components = instr->num_components; + } else { + assert(tmu_writes > 1); + num_components = tmu_writes - 1; + } - struct qreg tmua; - if (config == ~0) - tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA); - else - tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); + uint32_t config = (0xffffff00 | + tmu_op << 3| + GENERAL_TMU_LOOKUP_PER_PIXEL); + if (num_components == 1) { + config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + } else { + config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + + num_components - 2; + } - struct qinst *tmu; - if (dynamic_src) { - if (const_offset != 0) { - offset = vir_ADD(c, offset, - vir_uniform_ui(c, const_offset)); + if (vir_in_nonuniform_control_flow(c)) { + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); } - tmu = vir_ADD_dest(c, tmua, offset, - ntq_get_src(c, instr->src[offset_src], 0)); - } else { - if (const_offset != 0) { - tmu = vir_ADD_dest(c, tmua, offset, - vir_uniform_ui(c, const_offset)); + + struct qreg tmua; + if (config == ~0) + tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA); + else + tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); + + struct qinst *tmu; + if (dynamic_src) { + struct qreg offset = base_offset; + if (const_offset != 0) { + offset = vir_ADD(c, offset, + vir_uniform_ui(c, const_offset)); + } + struct qreg data = + ntq_get_src(c, instr->src[offset_src], 0); + tmu = vir_ADD_dest(c, tmua, offset, data); } else { - tmu = vir_MOV_dest(c, tmua, offset); + if (const_offset != 0) { + tmu = vir_ADD_dest(c, tmua, base_offset, + vir_uniform_ui(c, const_offset)); + } else { + tmu = vir_MOV_dest(c, tmua, base_offset); + } } - } - if (config != ~0) { - tmu->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, - config); - } + if (config != ~0) { + tmu->uniform = + vir_get_uniform_index(c, QUNIFORM_CONSTANT, + config); + } - if (vir_in_nonuniform_control_flow(c)) - vir_set_cond(tmu, V3D_QPU_COND_IFA); + if (vir_in_nonuniform_control_flow(c)) + vir_set_cond(tmu, V3D_QPU_COND_IFA); - vir_emit_thrsw(c); + vir_emit_thrsw(c); - /* Read the result, or wait for the TMU op to complete. */ - for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) - ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c))); + /* Read the result, or wait for the TMU op to complete. */ + for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { + ntq_store_dest(c, &instr->dest, i, + vir_MOV(c, vir_LDTMU(c))); + } - if (nir_intrinsic_dest_components(instr) == 0) - vir_TMUWT(c); + if (nir_intrinsic_dest_components(instr) == 0) + vir_TMUWT(c); + } while (is_store && writemask != 0); } static struct qreg * @@ -364,6 +409,20 @@ ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) return qregs; } +static bool +is_ld_signal(const struct v3d_qpu_sig *sig) +{ + return (sig->ldunif || + sig->ldunifa || + sig->ldunifrf || + sig->ldunifarf || + sig->ldtmu || + sig->ldvary || + sig->ldvpm || + sig->ldtlb || + sig->ldtlbu); +} + /** * This function is responsible for getting VIR results into the associated * storage for a NIR instruction. 
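
The store path in ntq_emit_tmu_general above splits the NIR writemask into
runs of consecutive enabled components, emits one TMU transaction per run,
and starts each run's constant offset at 4 bytes per skipped component.
A minimal standalone sketch of just that run-splitting, with local stand-ins
for mesa's BITFIELD_BIT/BITFIELD_RANGE helpers and printf in place of the
VIR emission (emit_store_runs and its output are illustrative only, not
driver code):

#include <inttypes.h>
#include <stdio.h>
#include <strings.h>        /* ffs() */

/* Local stand-ins for the util/ helpers used by the code above. */
#define BITFIELD_BIT(b)          (1u << (b))
#define BITFIELD_RANGE(b, count) (((1u << (count)) - 1) << (b))

/* Walk a non-zero writemask the way the store loop does: each iteration
 * covers one run of consecutive components, and the constant offset for
 * the run advances by 4 bytes per component before the run.
 */
static void
emit_store_runs(uint32_t writemask, uint32_t base_const_offset)
{
        do {
                int first_component = ffs(writemask) - 1;
                int last_component = first_component;
                while (writemask & BITFIELD_BIT(last_component + 1))
                        last_component++;

                int num_components = last_component - first_component + 1;
                uint32_t const_offset =
                        base_const_offset + first_component * 4;

                printf("run: components %d..%d (%d TMUD writes), offset %" PRIu32 "\n",
                       first_component, last_component, num_components,
                       const_offset);

                /* Clear the run we just handled from the writemask. */
                writemask &= ~BITFIELD_RANGE(first_component, num_components);
        } while (writemask != 0);
}

int
main(void)
{
        /* Writemask 0b1101: components 0, 2 and 3 -> runs {0} and {2, 3}. */
        emit_store_runs(0xd, 0);
        return 0;
}

With writemask 0xd this prints one single-component run at offset 0 and one
two-component run at offset 8, which is how the loop above turns a sparse
writemask into a small number of contiguous TMU writes.
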
@@ -384,7 +443,7 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, struct qreg result) { struct qinst *last_inst = NULL; - if (!list_empty(&c->cur_block->instructions)) + if (!list_is_empty(&c->cur_block->instructions)) last_inst = (struct qinst *)c->cur_block->instructions.prev; assert((result.file == QFILE_TEMP && @@ -411,11 +470,12 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; - /* Insert a MOV if the source wasn't an SSA def in the - * previous instruction. + /* If the previous instruction can't be predicated for + * the store into the nir_register, then emit a MOV + * that can be. */ - if ((vir_in_nonuniform_control_flow(c) && - c->defs[last_inst->dst.index]->qpu.sig.ldunif)) { + if (vir_in_nonuniform_control_flow(c) && + is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig)) { result = vir_MOV(c, result); last_inst = c->defs[result.index]; } @@ -1340,7 +1400,7 @@ v3d_optimize_nir(struct nir_shader *s) progress = false; NIR_PASS_V(s, nir_lower_vars_to_ssa); - NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar); NIR_PASS(progress, s, nir_copy_prop); NIR_PASS(progress, s, nir_opt_remove_phis); @@ -1938,8 +1998,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: @@ -2624,6 +2686,7 @@ const nir_shader_compiler_options v3d_nir_options = { .lower_mul_high = true, .lower_wpos_pntc = true, .lower_rotate = true, + .lower_to_scalar = true, }; /**
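
One detail of the store loop worth calling out is the clamp
"while (tmu_writes > 16 / c->threads) c->threads /= 2;": per the comment in
the code, it keeps a transaction from exceeding the 16-entry TMU fifo if
every thread writes at the same time, by halving the thread count until the
address plus data writes of one transaction fit. A small sketch of just that
relation (clamp_tmu_threads is an illustrative standalone helper, not a
driver function; the 16-entry depth and the 1/2/4 thread counts are taken
from the code above, and tmu_writes is assumed to stay small, as it does for
general accesses of at most an address plus four data words):

#include <assert.h>
#include <stdio.h>

/* Halve the thread count until the writes of one TMU transaction fit in
 * each thread's share of the 16-entry fifo, as the loop above does.
 */
static int
clamp_tmu_threads(int threads, int tmu_writes)
{
        assert(threads == 1 || threads == 2 || threads == 4);
        assert(tmu_writes <= 16);
        while (tmu_writes > 16 / threads)
                threads /= 2;
        return threads;
}

int
main(void)
{
        /* 1 address + 3 data writes still fits a 4-thread shader... */
        printf("%d\n", clamp_tmu_threads(4, 4));        /* 4 */
        /* ...but 1 address + 4 data writes forces a drop to 2 threads. */
        printf("%d\n", clamp_tmu_threads(4, 5));        /* 2 */
        return 0;
}
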