From: Eric Anholt
Date: Fri, 3 Feb 2017 18:24:14 +0000 (-0800)
Subject: broadcom: Add VC5 NIR compiler.
X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=commitdiff_plain;h=ade416d02369cc0942d53ad3cce601d66344f9c3

broadcom: Add VC5 NIR compiler.

This is a pretty straightforward fork of VC4's NIR compiler to VC5. The
condition codes, registers, and I/O have all changed, making the backend
hard to share, though their heritage is still recognizable.

v2: Move to src/broadcom/compiler to match intel's layout, rename more
    "vc5" to "v3d", rename QIR to VIR ("V3D IR") to avoid symbol conflicts
    with vc4, use new v3d_debug header, add compiler init/free functions,
    do texture swizzling in NIR to allow optimization.
---

diff --git a/src/broadcom/Makefile.am b/src/broadcom/Makefile.am
index ce2fd7df41f..01b485e7d41 100644
--- a/src/broadcom/Makefile.am
+++ b/src/broadcom/Makefile.am
@@ -26,6 +26,8 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/src/broadcom/ \
 	-I$(top_srcdir)/src/broadcom/include \
+	-I$(top_srcdir)/src/gallium/auxiliary \
+	-I$(top_srcdir)/src/gallium/include \
 	$(VALGRIND_CFLAGS) \
 	$(DEFINES)
diff --git a/src/broadcom/Makefile.sources b/src/broadcom/Makefile.sources
index 626290b8000..b60d2bcaa5d 100644
--- a/src/broadcom/Makefile.sources
+++ b/src/broadcom/Makefile.sources
@@ -16,6 +16,19 @@ BROADCOM_FILES = \
 	clif/clif_dump.c \
 	clif/clif_dump.h \
 	common/v3d_device_info.h \
+	compiler/nir_to_vir.c \
+	compiler/vir.c \
+	compiler/vir_dump.c \
+	compiler/vir_live_variables.c \
+	compiler/vir_lower_uniforms.c \
+	compiler/vir_opt_copy_propagate.c \
+	compiler/vir_opt_dead_code.c \
+	compiler/vir_register_allocate.c \
+	compiler/vir_to_qpu.c \
+	compiler/qpu_schedule.c \
+	compiler/qpu_validate.c \
+	compiler/v3d_compiler.h \
+	compiler/v3d_nir_lower_io.c \
 	qpu/qpu_disasm.c \
 	qpu/qpu_disasm.h \
 	qpu/qpu_instr.c \
diff --git a/src/broadcom/Makefile.vc5.am b/src/broadcom/Makefile.vc5.am
index e88afc20423..3e8e28bc947 100644
--- a/src/broadcom/Makefile.vc5.am
+++ b/src/broadcom/Makefile.vc5.am
@@ -13,6 +13,7 @@ check_PROGRAMS += \
 LDADD = \
 	libbroadcom.la \
+	$(top_builddir)/src/compiler/nir/libnir.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(NULL)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
new file mode 100644
index 00000000000..3fd914fa863
--- /dev/null
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -0,0 +1,1963 @@
+/*
+ * Copyright © 2016 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include "util/u_format.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/ralloc.h" +#include "util/hash_table.h" +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "v3d_compiler.h" + +/* We don't do any address packing. */ +#define __gen_user_data void +#define __gen_address_type uint32_t +#define __gen_address_offset(reloc) (*reloc) +#define __gen_emit_reloc(cl, reloc) +#include "cle/v3d_packet_v33_pack.h" + +static struct qreg +ntq_get_src(struct v3d_compile *c, nir_src src, int i); +static void +ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); + +static void +resize_qreg_array(struct v3d_compile *c, + struct qreg **regs, + uint32_t *size, + uint32_t decl_size) +{ + if (*size >= decl_size) + return; + + uint32_t old_size = *size; + *size = MAX2(*size * 2, decl_size); + *regs = reralloc(c, *regs, struct qreg, *size); + if (!*regs) { + fprintf(stderr, "Malloc failure\n"); + abort(); + } + + for (uint32_t i = old_size; i < *size; i++) + (*regs)[i] = c->undef; +} + +static struct qreg +vir_SFU(struct v3d_compile *c, int waddr, struct qreg src) +{ + vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, waddr), src); + return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); +} + +static struct qreg +vir_LDTMU(struct v3d_compile *c) +{ + vir_NOP(c)->qpu.sig.ldtmu = true; + return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); +} + +static struct qreg +indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr) +{ + struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); + uint32_t offset = nir_intrinsic_base(intr); + struct v3d_ubo_range *range = NULL; + unsigned i; + + for (i = 0; i < c->num_ubo_ranges; i++) { + range = &c->ubo_ranges[i]; + if (offset >= range->src_offset && + offset < range->src_offset + range->size) { + break; + } + } + /* The driver-location-based offset always has to be within a declared + * uniform range. + */ + assert(i != c->num_ubo_ranges); + if (!c->ubo_range_used[i]) { + c->ubo_range_used[i] = true; + range->dst_offset = c->next_ubo_dst_offset; + c->next_ubo_dst_offset += range->size; + } + + offset -= range->src_offset; + + if (range->dst_offset + offset != 0) { + indirect_offset = vir_ADD(c, indirect_offset, + vir_uniform_ui(c, range->dst_offset + + offset)); + } + + /* Adjust for where we stored the TGSI register base. */ + vir_ADD_dest(c, + vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA), + vir_uniform(c, QUNIFORM_UBO_ADDR, 0), + indirect_offset); + + return vir_LDTMU(c); +} + +static struct qreg * +ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) +{ + struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, + def->num_components); + _mesa_hash_table_insert(c->def_ht, def, qregs); + return qregs; +} + +/** + * This function is responsible for getting VIR results into the associated + * storage for a NIR instruction. + * + * If it's a NIR SSA def, then we just set the associated hash table entry to + * the new result. + * + * If it's a NIR reg, then we need to update the existing qreg assigned to the + * NIR destination with the incoming value. 
To do that without introducing + * new MOVs, we require that the incoming qreg either be a uniform, or be + * SSA-defined by the previous VIR instruction in the block and rewritable by + * this function. That lets us sneak ahead and insert the SF flag beforehand + * (knowing that the previous instruction doesn't depend on flags) and rewrite + * its destination to be the NIR reg's destination + */ +static void +ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, + struct qreg result) +{ + struct qinst *last_inst = NULL; + if (!list_empty(&c->cur_block->instructions)) + last_inst = (struct qinst *)c->cur_block->instructions.prev; + + assert(result.file == QFILE_UNIF || + (result.file == QFILE_TEMP && + last_inst && last_inst == c->defs[result.index])); + + if (dest->is_ssa) { + assert(chan < dest->ssa.num_components); + + struct qreg *qregs; + struct hash_entry *entry = + _mesa_hash_table_search(c->def_ht, &dest->ssa); + + if (entry) + qregs = entry->data; + else + qregs = ntq_init_ssa_def(c, &dest->ssa); + + qregs[chan] = result; + } else { + nir_register *reg = dest->reg.reg; + assert(dest->reg.base_offset == 0); + assert(reg->num_array_elems == 0); + struct hash_entry *entry = + _mesa_hash_table_search(c->def_ht, reg); + struct qreg *qregs = entry->data; + + /* Insert a MOV if the source wasn't an SSA def in the + * previous instruction. + */ + if (result.file == QFILE_UNIF) { + result = vir_MOV(c, result); + last_inst = c->defs[result.index]; + } + + /* We know they're both temps, so just rewrite index. */ + c->defs[last_inst->dst.index] = NULL; + last_inst->dst.index = qregs[chan].index; + + /* If we're in control flow, then make this update of the reg + * conditional on the execution mask. + */ + if (c->execute.file != QFILE_NULL) { + last_inst->dst.index = qregs[chan].index; + + /* Set the flags to the current exec mask. To insert + * the flags push, we temporarily remove our SSA + * instruction. 
+ */ + list_del(&last_inst->link); + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + list_addtail(&last_inst->link, + &c->cur_block->instructions); + + vir_set_cond(last_inst, V3D_QPU_COND_IFA); + last_inst->cond_is_exec_mask = true; + } + } +} + +static struct qreg +ntq_get_src(struct v3d_compile *c, nir_src src, int i) +{ + struct hash_entry *entry; + if (src.is_ssa) { + entry = _mesa_hash_table_search(c->def_ht, src.ssa); + assert(i < src.ssa->num_components); + } else { + nir_register *reg = src.reg.reg; + entry = _mesa_hash_table_search(c->def_ht, reg); + assert(reg->num_array_elems == 0); + assert(src.reg.base_offset == 0); + assert(i < reg->num_components); + } + + struct qreg *qregs = entry->data; + return qregs[i]; +} + +static struct qreg +ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, + unsigned src) +{ + assert(util_is_power_of_two(instr->dest.write_mask)); + unsigned chan = ffs(instr->dest.write_mask) - 1; + struct qreg r = ntq_get_src(c, instr->src[src].src, + instr->src[src].swizzle[chan]); + + assert(!instr->src[src].abs); + assert(!instr->src[src].negate); + + return r; +}; + +static inline struct qreg +vir_SAT(struct v3d_compile *c, struct qreg val) +{ + return vir_FMAX(c, + vir_FMIN(c, val, vir_uniform_f(c, 1.0)), + vir_uniform_f(c, 0.0)); +} + +static struct qreg +ntq_umul(struct v3d_compile *c, struct qreg src0, struct qreg src1) +{ + vir_MULTOP(c, src0, src1); + return vir_UMUL24(c, src0, src1); +} + +static struct qreg +ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level) +{ + return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1)); +} + +static void +ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) +{ + unsigned unit = instr->texture_index; + int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod); + int dest_size = nir_tex_instr_dest_size(instr); + + struct qreg lod = c->undef; + if (lod_index != -1) + lod = ntq_get_src(c, instr->src[lod_index].src, 0); + + for (int i = 0; i < dest_size; i++) { + assert(i < 3); + enum quniform_contents contents; + + if (instr->is_array && i == dest_size - 1) + contents = QUNIFORM_TEXTURE_ARRAY_SIZE; + else + contents = QUNIFORM_TEXTURE_WIDTH + i; + + struct qreg size = vir_uniform(c, contents, unit); + + switch (instr->sampler_dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + /* Don't minify the array size. */ + if (!(instr->is_array && i == dest_size - 1)) { + size = ntq_minify(c, size, lod); + } + break; + + case GLSL_SAMPLER_DIM_RECT: + /* There's no LOD field for rects */ + break; + + default: + unreachable("Bad sampler type"); + } + + ntq_store_dest(c, &instr->dest, i, size); + } +} + +static void +ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) +{ + unsigned unit = instr->texture_index; + + /* Since each texture sampling op requires uploading uniforms to + * reference the texture, there's no HW support for texture size and + * you just upload uniforms containing the size. 
+ */ + switch (instr->op) { + case nir_texop_query_levels: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); + return; + case nir_texop_txs: + ntq_emit_txs(c, instr); + return; + default: + break; + } + + struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = { + V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header, + + .fetch_sample_mode = instr->op == nir_texop_txf, + }; + + switch (instr->sampler_dim) { + case GLSL_SAMPLER_DIM_1D: + if (instr->is_array) + p0_unpacked.lookup_type = TEXTURE_1D_ARRAY; + else + p0_unpacked.lookup_type = TEXTURE_1D; + break; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + if (instr->is_array) + p0_unpacked.lookup_type = TEXTURE_2D_ARRAY; + else + p0_unpacked.lookup_type = TEXTURE_2D; + break; + case GLSL_SAMPLER_DIM_3D: + p0_unpacked.lookup_type = TEXTURE_3D; + break; + case GLSL_SAMPLER_DIM_CUBE: + p0_unpacked.lookup_type = TEXTURE_CUBE_MAP; + break; + default: + unreachable("Bad sampler type"); + } + + struct qreg coords[5]; + int next_coord = 0; + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_coord: + for (int j = 0; j < instr->coord_components; j++) { + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, j); + } + if (instr->coord_components < 2) + coords[next_coord++] = vir_uniform_f(c, 0.5); + break; + case nir_tex_src_bias: + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, 0); + + p0_unpacked.bias_supplied = true; + break; + case nir_tex_src_lod: + /* XXX: Needs base level addition */ + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, 0); + + if (instr->op != nir_texop_txf && + instr->op != nir_texop_tg4) { + p0_unpacked.disable_autolod_use_bias_only = true; + } + break; + case nir_tex_src_comparator: + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, 0); + + p0_unpacked.shadow = true; + break; + + case nir_tex_src_offset: { + nir_const_value *offset = + nir_src_as_const_value(instr->src[i].src); + p0_unpacked.texel_offset_for_s_coordinate = + offset->i32[0]; + + if (instr->coord_components >= 2) + p0_unpacked.texel_offset_for_t_coordinate = + offset->i32[1]; + + if (instr->coord_components >= 3) + p0_unpacked.texel_offset_for_r_coordinate = + offset->i32[2]; + break; + } + + default: + unreachable("unknown texture source"); + } + } + + uint32_t p0_packed; + V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL, + (uint8_t *)&p0_packed, + &p0_unpacked); + + /* There is no native support for GL texture rectangle coordinates, so + * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, + * 1]). 
+ */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { + coords[0] = vir_FMUL(c, coords[0], + vir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, + unit)); + coords[1] = vir_FMUL(c, coords[1], + vir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, + unit)); + } + + struct qreg texture_u[] = { + vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed), + vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit), + }; + uint32_t next_texture_u = 0; + + for (int i = 0; i < next_coord; i++) { + struct qreg dst; + + if (i == next_coord - 1) + dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUL); + else + dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMU); + + struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]); + + if (i < 2) { + tmu->has_implicit_uniform = true; + tmu->src[vir_get_implicit_uniform_src(tmu)] = + texture_u[next_texture_u++]; + } + } + + bool return_16 = (c->key->tex[unit].return_size == 16 || + p0_unpacked.shadow); + + struct qreg return_values[4]; + for (int i = 0; i < c->key->tex[unit].return_channels; i++) + return_values[i] = vir_LDTMU(c); + /* Swizzling .zw of an RG texture should give undefined results, not + * crash the compiler. + */ + for (int i = c->key->tex[unit].return_channels; i < 4; i++) + return_values[i] = c->undef; + + for (int i = 0; i < nir_tex_instr_dest_size(instr); i++) { + struct qreg chan; + + if (return_16) { + STATIC_ASSERT(PIPE_SWIZZLE_X == 0); + chan = return_values[i / 2]; + + enum v3d_qpu_input_unpack unpack; + if (i & 1) + unpack = V3D_QPU_UNPACK_H; + else + unpack = V3D_QPU_UNPACK_L; + + chan = vir_FMOV(c, chan); + vir_set_unpack(c->defs[chan.index], 0, unpack); + } else { + chan = vir_MOV(c, return_values[i]); + } + ntq_store_dest(c, &instr->dest, i, chan); + } +} + +static struct qreg +ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos) +{ + struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI)); + if (is_cos) + input = vir_FADD(c, input, vir_uniform_f(c, 0.5)); + + struct qreg periods = vir_FROUND(c, input); + struct qreg sin_output = vir_SFU(c, V3D_QPU_WADDR_SIN, + vir_FSUB(c, input, periods)); + return vir_XOR(c, sin_output, vir_SHL(c, + vir_FTOIN(c, periods), + vir_uniform_ui(c, -1))); +} + +static struct qreg +ntq_fsign(struct v3d_compile *c, struct qreg src) +{ + struct qreg t = vir_get_temp(c); + + vir_MOV_dest(c, t, vir_uniform_f(c, 0.0)); + vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0)); + vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN); + vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0)); + return vir_MOV(c, t); +} + +static struct qreg +ntq_isign(struct v3d_compile *c, struct qreg src) +{ + struct qreg t = vir_get_temp(c); + + vir_MOV_dest(c, t, vir_uniform_ui(c, 0)); + vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1)); + vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN); + vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1)); + return vir_MOV(c, t); +} + +static void +emit_fragcoord_input(struct v3d_compile *c, int attr) +{ + c->inputs[attr * 4 + 0] = vir_FXCD(c); + c->inputs[attr * 4 + 1] = vir_FYCD(c); + c->inputs[attr * 4 + 2] = c->payload_z; + c->inputs[attr * 4 + 3] = vir_SFU(c, V3D_QPU_WADDR_RECIP, + c->payload_w); +} + +static struct qreg +emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + uint8_t swizzle) +{ + struct qreg vary = vir_reg(QFILE_VARY, ~0); + struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + + /* For gl_PointCoord input or distance along a line, we'll be called + * 
with no nir_variable, and we don't count toward VPM size so we + * don't track an input slot. + */ + if (!var) { + return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5); + } + + int i = c->num_inputs++; + c->input_slots[i] = v3d_slot_from_slot_and_component(var->data.location, + swizzle); + + switch (var->data.interpolation) { + case INTERP_MODE_NONE: + case INTERP_MODE_SMOOTH: + if (var->data.centroid) { + return vir_FADD(c, vir_FMUL(c, vary, + c->payload_w_centroid), r5); + } else { + return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5); + } + case INTERP_MODE_NOPERSPECTIVE: + /* C appears after the mov from the varying. + XXX: improve ldvary setup. + */ + return vir_FADD(c, vir_MOV(c, vary), r5); + case INTERP_MODE_FLAT: + BITSET_SET(c->flat_shade_flags, i); + vir_MOV_dest(c, c->undef, vary); + return vir_MOV(c, r5); + default: + unreachable("Bad interp mode"); + } +} + +static void +emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var) +{ + for (int i = 0; i < glsl_get_vector_elements(var->type); i++) { + c->inputs[attr * 4 + i] = + emit_fragment_varying(c, var, i); + } +} + +static void +add_output(struct v3d_compile *c, + uint32_t decl_offset, + uint8_t slot, + uint8_t swizzle) +{ + uint32_t old_array_size = c->outputs_array_size; + resize_qreg_array(c, &c->outputs, &c->outputs_array_size, + decl_offset + 1); + + if (old_array_size != c->outputs_array_size) { + c->output_slots = reralloc(c, + c->output_slots, + struct v3d_varying_slot, + c->outputs_array_size); + } + + c->output_slots[decl_offset] = + v3d_slot_from_slot_and_component(slot, swizzle); +} + +static void +declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size) +{ + unsigned array_id = c->num_ubo_ranges++; + if (array_id >= c->ubo_ranges_array_size) { + c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2, + array_id + 1); + c->ubo_ranges = reralloc(c, c->ubo_ranges, + struct v3d_ubo_range, + c->ubo_ranges_array_size); + c->ubo_range_used = reralloc(c, c->ubo_range_used, + bool, + c->ubo_ranges_array_size); + } + + c->ubo_ranges[array_id].dst_offset = 0; + c->ubo_ranges[array_id].src_offset = start; + c->ubo_ranges[array_id].size = size; + c->ubo_range_used[array_id] = false; +} + +/** + * If compare_instr is a valid comparison instruction, emits the + * compare_instr's comparison and returns the sel_instr's return value based + * on the compare_instr's result. 
+ */ +static bool +ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest, + nir_alu_instr *compare_instr, + nir_alu_instr *sel_instr) +{ + struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0); + struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1); + bool cond_invert = false; + + switch (compare_instr->op) { + case nir_op_feq: + case nir_op_seq: + vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ); + break; + case nir_op_ieq: + vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ); + break; + + case nir_op_fne: + case nir_op_sne: + vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ); + cond_invert = true; + break; + case nir_op_ine: + vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ); + cond_invert = true; + break; + + case nir_op_fge: + case nir_op_sge: + vir_PF(c, vir_FCMP(c, src1, src0), V3D_QPU_PF_PUSHC); + break; + case nir_op_ige: + vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC); + cond_invert = true; + break; + case nir_op_uge: + vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC); + cond_invert = true; + break; + + case nir_op_slt: + case nir_op_flt: + vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHN); + break; + case nir_op_ilt: + vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC); + break; + case nir_op_ult: + vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC); + break; + + default: + return false; + } + + enum v3d_qpu_cond cond = (cond_invert ? + V3D_QPU_COND_IFNA : + V3D_QPU_COND_IFA); + + switch (sel_instr->op) { + case nir_op_seq: + case nir_op_sne: + case nir_op_sge: + case nir_op_slt: + *dest = vir_SEL(c, cond, + vir_uniform_f(c, 1.0), vir_uniform_f(c, 0.0)); + break; + + case nir_op_bcsel: + *dest = vir_SEL(c, cond, + ntq_get_alu_src(c, sel_instr, 1), + ntq_get_alu_src(c, sel_instr, 2)); + break; + + default: + *dest = vir_SEL(c, cond, + vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0)); + break; + } + + /* Make the temporary for nir_store_dest(). */ + *dest = vir_MOV(c, *dest); + + return true; +} + +/** + * Attempts to fold a comparison generating a boolean result into the + * condition code for selecting between two values, instead of comparing the + * boolean result against 0 to generate the condition code. + */ +static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr, + struct qreg *src) +{ + if (!instr->src[0].src.is_ssa) + goto out; + if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) + goto out; + nir_alu_instr *compare = + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + if (!compare) + goto out; + + struct qreg dest; + if (ntq_emit_comparison(c, &dest, compare, instr)) + return dest; + +out: + vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); + return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2])); +} + + +static void +ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) +{ + /* This should always be lowered to ALU operations for V3D. */ + assert(!instr->dest.saturate); + + /* Vectors are special in that they have non-scalarized writemasks, + * and just take the first swizzle channel for each argument in order + * into each writemask channel. 
+ */ + if (instr->op == nir_op_vec2 || + instr->op == nir_op_vec3 || + instr->op == nir_op_vec4) { + struct qreg srcs[4]; + for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + srcs[i] = ntq_get_src(c, instr->src[i].src, + instr->src[i].swizzle[0]); + for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + ntq_store_dest(c, &instr->dest.dest, i, + vir_MOV(c, srcs[i])); + return; + } + + /* General case: We can just grab the one used channel per src. */ + struct qreg src[nir_op_infos[instr->op].num_inputs]; + for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + src[i] = ntq_get_alu_src(c, instr, i); + } + + struct qreg result; + + switch (instr->op) { + case nir_op_fmov: + case nir_op_imov: + result = vir_MOV(c, src[0]); + break; + case nir_op_fmul: + result = vir_FMUL(c, src[0], src[1]); + break; + case nir_op_fadd: + result = vir_FADD(c, src[0], src[1]); + break; + case nir_op_fsub: + result = vir_FSUB(c, src[0], src[1]); + break; + case nir_op_fmin: + result = vir_FMIN(c, src[0], src[1]); + break; + case nir_op_fmax: + result = vir_FMAX(c, src[0], src[1]); + break; + + case nir_op_f2i32: + result = vir_FTOIZ(c, src[0]); + break; + case nir_op_f2u32: + result = vir_FTOUZ(c, src[0]); + break; + case nir_op_i2f32: + result = vir_ITOF(c, src[0]); + break; + case nir_op_u2f32: + result = vir_UTOF(c, src[0]); + break; + case nir_op_b2f: + result = vir_AND(c, src[0], vir_uniform_f(c, 1.0)); + break; + case nir_op_b2i: + result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); + break; + case nir_op_i2b: + case nir_op_f2b: + vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); + result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, + vir_uniform_ui(c, ~0), + vir_uniform_ui(c, 0))); + break; + + case nir_op_iadd: + result = vir_ADD(c, src[0], src[1]); + break; + case nir_op_ushr: + result = vir_SHR(c, src[0], src[1]); + break; + case nir_op_isub: + result = vir_SUB(c, src[0], src[1]); + break; + case nir_op_ishr: + result = vir_ASR(c, src[0], src[1]); + break; + case nir_op_ishl: + result = vir_SHL(c, src[0], src[1]); + break; + case nir_op_imin: + result = vir_MIN(c, src[0], src[1]); + break; + case nir_op_umin: + result = vir_UMIN(c, src[0], src[1]); + break; + case nir_op_imax: + result = vir_MAX(c, src[0], src[1]); + break; + case nir_op_umax: + result = vir_UMAX(c, src[0], src[1]); + break; + case nir_op_iand: + result = vir_AND(c, src[0], src[1]); + break; + case nir_op_ior: + result = vir_OR(c, src[0], src[1]); + break; + case nir_op_ixor: + result = vir_XOR(c, src[0], src[1]); + break; + case nir_op_inot: + result = vir_NOT(c, src[0]); + break; + + case nir_op_imul: + result = ntq_umul(c, src[0], src[1]); + break; + + case nir_op_seq: + case nir_op_sne: + case nir_op_sge: + case nir_op_slt: + case nir_op_feq: + case nir_op_fne: + case nir_op_fge: + case nir_op_flt: + case nir_op_ieq: + case nir_op_ine: + case nir_op_ige: + case nir_op_uge: + case nir_op_ilt: + case nir_op_ult: + if (!ntq_emit_comparison(c, &result, instr, instr)) { + fprintf(stderr, "Bad comparison instruction\n"); + } + break; + + case nir_op_bcsel: + result = ntq_emit_bcsel(c, instr, src); + break; + case nir_op_fcsel: + vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); + result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, + src[1], src[2])); + break; + + case nir_op_frcp: + result = vir_SFU(c, V3D_QPU_WADDR_RECIP, src[0]); + break; + case nir_op_frsq: + result = vir_SFU(c, V3D_QPU_WADDR_RSQRT, src[0]); + break; + case nir_op_fexp2: + result = vir_SFU(c, V3D_QPU_WADDR_EXP, src[0]); + break; + case nir_op_flog2: + result = 
vir_SFU(c, V3D_QPU_WADDR_LOG, src[0]); + break; + + case nir_op_fceil: + result = vir_FCEIL(c, src[0]); + break; + case nir_op_ffloor: + result = vir_FFLOOR(c, src[0]); + break; + case nir_op_fround_even: + result = vir_FROUND(c, src[0]); + break; + case nir_op_ftrunc: + result = vir_FTRUNC(c, src[0]); + break; + case nir_op_ffract: + result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0])); + break; + + case nir_op_fsin: + result = ntq_fsincos(c, src[0], false); + break; + case nir_op_fcos: + result = ntq_fsincos(c, src[0], true); + break; + + case nir_op_fsign: + result = ntq_fsign(c, src[0]); + break; + case nir_op_isign: + result = ntq_isign(c, src[0]); + break; + + case nir_op_fabs: { + result = vir_FMOV(c, src[0]); + vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS); + break; + } + + case nir_op_iabs: + result = vir_MAX(c, src[0], + vir_SUB(c, vir_uniform_ui(c, 0), src[0])); + break; + + case nir_op_fddx: + case nir_op_fddx_coarse: + case nir_op_fddx_fine: + result = vir_FDX(c, src[0]); + break; + + case nir_op_fddy: + case nir_op_fddy_coarse: + case nir_op_fddy_fine: + result = vir_FDY(c, src[0]); + break; + + default: + fprintf(stderr, "unknown NIR ALU inst: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + + /* We have a scalar result, so the instruction should only have a + * single channel written to. + */ + assert(util_is_power_of_two(instr->dest.write_mask)); + ntq_store_dest(c, &instr->dest.dest, + ffs(instr->dest.write_mask) - 1, result); +} + +static void +emit_frag_end(struct v3d_compile *c) +{ + uint32_t discard_cond = V3D_QPU_COND_NONE; + if (c->s->info.fs.uses_discard) { + vir_PF(c, vir_MOV(c, c->discard), V3D_QPU_PF_PUSHZ); + discard_cond = V3D_QPU_COND_IFA; + } + + /* XXX + if (c->output_sample_mask_index != -1) { + vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]); + } + */ + + if (c->output_position_index != -1) { + struct qinst *inst = vir_MOV_dest(c, + vir_reg(QFILE_TLBU, 0), + c->outputs[c->output_position_index]); + + inst->src[vir_get_implicit_uniform_src(inst)] = + vir_uniform_ui(c, + (1 << 2) | /* per pixel */ + (2 << 6) /* type */ | + 0xffffff00); + } + + /* XXX: Performance improvement: Merge Z write and color writes TLB + * uniform setup + */ + + if (c->output_color_var) { + nir_variable *var = c->output_color_var; + struct qreg *color = &c->outputs[var->data.driver_location * 4]; + int num_components = glsl_get_vector_elements(var->type); + uint32_t conf = ~0; + struct qinst *inst; + + assert(num_components != 0); + switch (glsl_get_base_type(var->type)) { + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + conf = ((1 << 2) | /* per pixel */ + ((7 - 0) << 3) | /* rt */ + (1 << 6) /* type */ | + (num_components - 1) | + 0xffffff00); + + + inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]); + vir_set_cond(inst, discard_cond); + inst->src[vir_get_implicit_uniform_src(inst)] = + vir_uniform_ui(c, conf); + + for (int i = 1; i < num_components; i++) { + inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), + color[i]); + vir_set_cond(inst, discard_cond); + } + break; + + default: { + struct qreg r = color[0]; + struct qreg g = color[1]; + struct qreg b = color[2]; + struct qreg a = color[3]; + + if (c->fs_key->swap_color_rb) { + r = color[2]; + b = color[0]; + } + + inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g); + vir_set_cond(inst, discard_cond); + inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a); + vir_set_cond(inst, discard_cond); + break; + } + } + } +} + +static void 
+emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w) +{ + for (int i = 0; i < 2; i++) { + struct qreg coord = c->outputs[c->output_position_index + i]; + coord = vir_FMUL(c, coord, + vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, + 0)); + coord = vir_FMUL(c, coord, rcp_w); + vir_FTOIN_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), + coord); + } + +} + +static void +emit_zs_write(struct v3d_compile *c, struct qreg rcp_w) +{ + struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0); + struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0); + + vir_FADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), + vir_FMUL(c, vir_FMUL(c, + c->outputs[c->output_position_index + 2], + zscale), + rcp_w), + zoffset); +} + +static void +emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w) +{ + vir_VPM_WRITE(c, rcp_w); +} + +static void +emit_point_size_write(struct v3d_compile *c) +{ + struct qreg point_size; + + if (c->output_point_size_index != -1) + point_size = c->outputs[c->output_point_size_index]; + else + point_size = vir_uniform_f(c, 1.0); + + /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835, + * BCM21553). + */ + point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125)); + + vir_VPM_WRITE(c, point_size); +} + +static void +emit_vpm_write_setup(struct v3d_compile *c) +{ + uint32_t packed; + struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked = { + V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header, + + .horiz = true, + .laned = false, + .segs = true, + .stride = 1, + .size = VPM_SETUP_SIZE_32_BIT, + .addr = 0, + }; + + V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL, + (uint8_t *)&packed, + &unpacked); + vir_VPMSETUP(c, vir_uniform_ui(c, packed)); +} + +static void +emit_vert_end(struct v3d_compile *c) +{ + struct qreg rcp_w = vir_SFU(c, V3D_QPU_WADDR_RECIP, + c->outputs[c->output_position_index + 3]); + + emit_vpm_write_setup(c); + + if (c->vs_key->is_coord) { + for (int i = 0; i < 4; i++) + vir_VPM_WRITE(c, c->outputs[c->output_position_index + i]); + emit_scaled_viewport_write(c, rcp_w); + if (c->vs_key->per_vertex_point_size) { + emit_point_size_write(c); + /* emit_rcp_wc_write(c, rcp_w); */ + } + /* XXX: Z-only rendering */ + if (0) + emit_zs_write(c, rcp_w); + } else { + emit_scaled_viewport_write(c, rcp_w); + emit_zs_write(c, rcp_w); + emit_rcp_wc_write(c, rcp_w); + if (c->vs_key->per_vertex_point_size) + emit_point_size_write(c); + } + + for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { + struct v3d_varying_slot input = c->vs_key->fs_inputs[i]; + int j; + + for (j = 0; j < c->num_outputs; j++) { + struct v3d_varying_slot output = c->output_slots[j]; + + if (!memcmp(&input, &output, sizeof(input))) { + vir_VPM_WRITE(c, c->outputs[j]); + break; + } + } + /* Emit padding if we didn't find a declared VS output for + * this FS input. 
+ */ + if (j == c->num_outputs) + vir_VPM_WRITE(c, vir_uniform_f(c, 0.0)); + } +} + +void +v3d_optimize_nir(struct nir_shader *s) +{ + bool progress; + + do { + progress = false; + + NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_lower_alu_to_scalar); + NIR_PASS(progress, s, nir_lower_phis_to_scalar); + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_dce); + NIR_PASS(progress, s, nir_opt_dead_cf); + NIR_PASS(progress, s, nir_opt_cse); + NIR_PASS(progress, s, nir_opt_peephole_select, 8); + NIR_PASS(progress, s, nir_opt_algebraic); + NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_undef); + } while (progress); +} + +static int +driver_location_compare(const void *in_a, const void *in_b) +{ + const nir_variable *const *a = in_a; + const nir_variable *const *b = in_b; + + return (*a)->data.driver_location - (*b)->data.driver_location; +} + +static struct qreg +ntq_emit_vpm_read(struct v3d_compile *c, + uint32_t *num_components_queued, + uint32_t *remaining, + uint32_t vpm_index) +{ + struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); + + if (*num_components_queued != 0) { + (*num_components_queued)--; + c->num_inputs++; + return vir_MOV(c, vpm); + } + + uint32_t num_components = MIN2(*remaining, 32); + + struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked = { + V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header, + + .horiz = true, + .laned = false, + /* If the field is 0, that means a read count of 32. */ + .num = num_components & 31, + .segs = true, + .stride = 1, + .size = VPM_SETUP_SIZE_32_BIT, + .addr = c->num_inputs, + }; + + uint32_t packed; + V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL, + (uint8_t *)&packed, + &unpacked); + vir_VPMSETUP(c, vir_uniform_ui(c, packed)); + + *num_components_queued = num_components - 1; + *remaining -= num_components; + c->num_inputs++; + + return vir_MOV(c, vpm); +} + +static void +ntq_setup_inputs(struct v3d_compile *c) +{ + unsigned num_entries = 0; + unsigned num_components = 0; + nir_foreach_variable(var, &c->s->inputs) { + num_entries++; + num_components += glsl_get_components(var->type); + } + + nir_variable *vars[num_entries]; + + unsigned i = 0; + nir_foreach_variable(var, &c->s->inputs) + vars[i++] = var; + + /* Sort the variables so that we emit the input setup in + * driver_location order. This is required for VPM reads, whose data + * is fetched into the VPM in driver_location (TGSI register index) + * order. 
+ */ + qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); + + uint32_t vpm_components_queued = 0; + if (c->s->stage == MESA_SHADER_VERTEX) { + bool uses_iid = c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_INSTANCE_ID); + bool uses_vid = c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_VERTEX_ID); + + num_components += uses_iid; + num_components += uses_vid; + + if (uses_iid) { + c->iid = ntq_emit_vpm_read(c, &vpm_components_queued, + &num_components, ~0); + } + + if (uses_vid) { + c->vid = ntq_emit_vpm_read(c, &vpm_components_queued, + &num_components, ~0); + } + } + + for (unsigned i = 0; i < num_entries; i++) { + nir_variable *var = vars[i]; + unsigned array_len = MAX2(glsl_get_length(var->type), 1); + unsigned loc = var->data.driver_location; + + assert(array_len == 1); + (void)array_len; + resize_qreg_array(c, &c->inputs, &c->inputs_array_size, + (loc + 1) * 4); + + if (c->s->stage == MESA_SHADER_FRAGMENT) { + if (var->data.location == VARYING_SLOT_POS) { + emit_fragcoord_input(c, loc); + } else if (var->data.location == VARYING_SLOT_PNTC || + (var->data.location >= VARYING_SLOT_VAR0 && + (c->fs_key->point_sprite_mask & + (1 << (var->data.location - + VARYING_SLOT_VAR0))))) { + c->inputs[loc * 4 + 0] = c->point_x; + c->inputs[loc * 4 + 1] = c->point_y; + } else { + emit_fragment_input(c, loc, var); + } + } else { + int var_components = glsl_get_components(var->type); + + for (int i = 0; i < var_components; i++) { + c->inputs[loc * 4 + i] = + ntq_emit_vpm_read(c, + &vpm_components_queued, + &num_components, + loc * 4 + i); + + } + c->vattr_sizes[loc] = var_components; + } + } + + if (c->s->stage == MESA_SHADER_VERTEX) { + assert(vpm_components_queued == 0); + assert(num_components == 0); + } +} + +static void +ntq_setup_outputs(struct v3d_compile *c) +{ + nir_foreach_variable(var, &c->s->outputs) { + unsigned array_len = MAX2(glsl_get_length(var->type), 1); + unsigned loc = var->data.driver_location * 4; + + assert(array_len == 1); + (void)array_len; + + for (int i = 0; i < 4; i++) + add_output(c, loc + i, var->data.location, i); + + if (c->s->stage == MESA_SHADER_FRAGMENT) { + switch (var->data.location) { + case FRAG_RESULT_COLOR: + case FRAG_RESULT_DATA0: + c->output_color_var = var; + break; + case FRAG_RESULT_DEPTH: + c->output_position_index = loc; + break; + case FRAG_RESULT_SAMPLE_MASK: + c->output_sample_mask_index = loc; + break; + } + } else { + switch (var->data.location) { + case VARYING_SLOT_POS: + c->output_position_index = loc; + break; + case VARYING_SLOT_PSIZ: + c->output_point_size_index = loc; + break; + } + } + } +} + +static void +ntq_setup_uniforms(struct v3d_compile *c) +{ + nir_foreach_variable(var, &c->s->uniforms) { + uint32_t vec4_count = glsl_count_attribute_slots(var->type, + false); + unsigned vec4_size = 4 * sizeof(float); + + declare_uniform_range(c, var->data.driver_location * vec4_size, + vec4_count * vec4_size); + + } +} + +/** + * Sets up the mapping from nir_register to struct qreg *. + * + * Each nir_register gets a struct qreg per 32-bit component being stored. 
+ */ +static void +ntq_setup_registers(struct v3d_compile *c, struct exec_list *list) +{ + foreach_list_typed(nir_register, nir_reg, node, list) { + unsigned array_len = MAX2(nir_reg->num_array_elems, 1); + struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, + array_len * + nir_reg->num_components); + + _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); + + for (int i = 0; i < array_len * nir_reg->num_components; i++) + qregs[i] = vir_get_temp(c); + } +} + +static void +ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr) +{ + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); + for (int i = 0; i < instr->def.num_components; i++) + qregs[i] = vir_uniform_ui(c, instr->value.u32[i]); + + _mesa_hash_table_insert(c->def_ht, &instr->def, qregs); +} + +static void +ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr) +{ + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); + + /* VIR needs there to be *some* value, so pick 0 (same as for + * ntq_setup_registers(). + */ + for (int i = 0; i < instr->def.num_components; i++) + qregs[i] = vir_uniform_ui(c, 0); +} + +static void +ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + nir_const_value *const_offset; + unsigned offset; + + switch (instr->intrinsic) { + case nir_intrinsic_load_uniform: + assert(instr->num_components == 1); + const_offset = nir_src_as_const_value(instr->src[0]); + if (const_offset) { + offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + assert(offset % 4 == 0); + /* We need dwords */ + offset = offset / 4; + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_UNIFORM, + offset)); + } else { + ntq_store_dest(c, &instr->dest, 0, + indirect_uniform_load(c, instr)); + } + break; + + case nir_intrinsic_load_ubo: + for (int i = 0; i < instr->num_components; i++) { + int ubo = nir_src_as_const_value(instr->src[0])->u32[0]; + + /* Adjust for where we stored the TGSI register base. */ + vir_ADD_dest(c, + vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA), + vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo), + vir_ADD(c, + ntq_get_src(c, instr->src[1], 0), + vir_uniform_ui(c, i * 4))); + + ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c)); + } + break; + + const_offset = nir_src_as_const_value(instr->src[0]); + if (const_offset) { + offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + assert(offset % 4 == 0); + /* We need dwords */ + offset = offset / 4; + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_UNIFORM, + offset)); + } else { + ntq_store_dest(c, &instr->dest, 0, + indirect_uniform_load(c, instr)); + } + break; + + case nir_intrinsic_load_user_clip_plane: + for (int i = 0; i < instr->num_components; i++) { + ntq_store_dest(c, &instr->dest, i, + vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, + nir_intrinsic_ucp_id(instr) * + 4 + i)); + } + break; + + case nir_intrinsic_load_alpha_ref_float: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_ALPHA_REF, 0)); + break; + + case nir_intrinsic_load_sample_mask_in: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_SAMPLE_MASK, 0)); + break; + + case nir_intrinsic_load_front_face: + /* The register contains 0 (front) or 1 (back), and we need to + * turn it into a NIR bool where true means front. 
+ */ + ntq_store_dest(c, &instr->dest, 0, + vir_ADD(c, + vir_uniform_ui(c, -1), + vir_REVF(c))); + break; + + case nir_intrinsic_load_instance_id: + ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); + break; + + case nir_intrinsic_load_vertex_id: + ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); + break; + + case nir_intrinsic_load_input: + const_offset = nir_src_as_const_value(instr->src[0]); + assert(const_offset && "v3d doesn't support indirect inputs"); + for (int i = 0; i < instr->num_components; i++) { + offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + int comp = nir_intrinsic_component(instr) + i; + ntq_store_dest(c, &instr->dest, i, + vir_MOV(c, c->inputs[offset * 4 + comp])); + } + break; + + case nir_intrinsic_store_output: + const_offset = nir_src_as_const_value(instr->src[1]); + assert(const_offset && "v3d doesn't support indirect outputs"); + offset = ((nir_intrinsic_base(instr) + + const_offset->u32[0]) * 4 + + nir_intrinsic_component(instr)); + + for (int i = 0; i < instr->num_components; i++) { + c->outputs[offset + i] = + vir_MOV(c, ntq_get_src(c, instr->src[0], i)); + } + c->num_outputs = MAX2(c->num_outputs, + offset + instr->num_components); + break; + + case nir_intrinsic_discard: + if (c->execute.file != QFILE_NULL) { + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->discard, + vir_uniform_ui(c, ~0)); + } else { + vir_MOV_dest(c, c->discard, vir_uniform_ui(c, ~0)); + } + break; + + case nir_intrinsic_discard_if: { + /* true (~0) if we're discarding */ + struct qreg cond = ntq_get_src(c, instr->src[0], 0); + + if (c->execute.file != QFILE_NULL) { + /* execute == 0 means the channel is active. Invert + * the condition so that we can use zero as "executing + * and discarding." + */ + vir_PF(c, vir_AND(c, c->execute, vir_NOT(c, cond)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->discard, cond); + } else { + vir_OR_dest(c, c->discard, c->discard, cond); + } + + break; + } + + default: + fprintf(stderr, "Unknown intrinsic: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + break; + } +} + +/* Clears (activates) the execute flags for any channels whose jump target + * matches this block. + */ +static void +ntq_activate_execute_for_block(struct v3d_compile *c) +{ + vir_PF(c, vir_SUB(c, c->execute, vir_uniform_ui(c, c->cur_block->index)), + V3D_QPU_PF_PUSHZ); + + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); +} + +static void +ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt) +{ + nir_block *nir_else_block = nir_if_first_else_block(if_stmt); + bool empty_else_block = + (nir_else_block == nir_if_last_else_block(if_stmt) && + exec_list_is_empty(&nir_else_block->instr_list)); + + struct qblock *then_block = vir_new_block(c); + struct qblock *after_block = vir_new_block(c); + struct qblock *else_block; + if (empty_else_block) + else_block = after_block; + else + else_block = vir_new_block(c); + + bool was_top_level = false; + if (c->execute.file == QFILE_NULL) { + c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); + was_top_level = true; + } + + /* Set A for executing (execute == 0) and jumping (if->condition == + * 0) channels, and then update execute flags for those to point to + * the ELSE block. 
+ */ + vir_PF(c, vir_OR(c, + c->execute, + ntq_get_src(c, if_stmt->condition, 0)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, + c->execute, + vir_uniform_ui(c, else_block->index)); + + /* Jump to ELSE if nothing is active for THEN, otherwise fall + * through. + */ + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); + vir_link_blocks(c->cur_block, else_block); + vir_link_blocks(c->cur_block, then_block); + + /* Process the THEN block. */ + vir_set_emit_block(c, then_block); + ntq_emit_cf_list(c, &if_stmt->then_list); + + if (!empty_else_block) { + /* Handle the end of the THEN block. First, all currently + * active channels update their execute flags to point to + * ENDIF + */ + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, + vir_uniform_ui(c, after_block->index)); + + /* If everything points at ENDIF, then jump there immediately. */ + vir_PF(c, vir_SUB(c, c->execute, + vir_uniform_ui(c, after_block->index)), + V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); + vir_link_blocks(c->cur_block, after_block); + vir_link_blocks(c->cur_block, else_block); + + vir_set_emit_block(c, else_block); + ntq_activate_execute_for_block(c); + ntq_emit_cf_list(c, &if_stmt->else_list); + } + + vir_link_blocks(c->cur_block, after_block); + + vir_set_emit_block(c, after_block); + if (was_top_level) + c->execute = c->undef; + else + ntq_activate_execute_for_block(c); +} + +static void +ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump) +{ + switch (jump->type) { + case nir_jump_break: + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, + vir_uniform_ui(c, c->loop_break_block->index)); + break; + + case nir_jump_continue: + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, + vir_uniform_ui(c, c->loop_cont_block->index)); + break; + + case nir_jump_return: + unreachable("All returns shouold be lowered\n"); + } +} + +static void +ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: + ntq_emit_alu(c, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_intrinsic: + ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_load_const: + ntq_emit_load_const(c, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_ssa_undef: + ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); + break; + + case nir_instr_type_tex: + ntq_emit_tex(c, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_jump: + ntq_emit_jump(c, nir_instr_as_jump(instr)); + break; + + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + abort(); + } +} + +static void +ntq_emit_block(struct v3d_compile *c, nir_block *block) +{ + nir_foreach_instr(instr, block) { + ntq_emit_instr(c, instr); + } +} + +static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); + +static void +ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) +{ + bool was_top_level = false; + if (c->execute.file == QFILE_NULL) { + c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); + was_top_level = true; + } + + struct qblock *save_loop_cont_block = c->loop_cont_block; + struct qblock *save_loop_break_block = c->loop_break_block; + + c->loop_cont_block = vir_new_block(c); + c->loop_break_block = vir_new_block(c); + + vir_link_blocks(c->cur_block, c->loop_cont_block); + vir_set_emit_block(c, c->loop_cont_block); + 
ntq_activate_execute_for_block(c); + + ntq_emit_cf_list(c, &loop->body); + + /* Re-enable any previous continues now, so our ANYA check below + * works. + * + * XXX: Use the .ORZ flags update, instead. + */ + vir_PF(c, vir_SUB(c, + c->execute, + vir_uniform_ui(c, c->loop_cont_block->index)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); + + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA); + vir_link_blocks(c->cur_block, c->loop_cont_block); + vir_link_blocks(c->cur_block, c->loop_break_block); + + vir_set_emit_block(c, c->loop_break_block); + if (was_top_level) + c->execute = c->undef; + else + ntq_activate_execute_for_block(c); + + c->loop_break_block = save_loop_break_block; + c->loop_cont_block = save_loop_cont_block; +} + +static void +ntq_emit_function(struct v3d_compile *c, nir_function_impl *func) +{ + fprintf(stderr, "FUNCTIONS not handled.\n"); + abort(); +} + +static void +ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + ntq_emit_block(c, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + ntq_emit_if(c, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + ntq_emit_loop(c, nir_cf_node_as_loop(node)); + break; + + case nir_cf_node_function: + ntq_emit_function(c, nir_cf_node_as_function(node)); + break; + + default: + fprintf(stderr, "Unknown NIR node type\n"); + abort(); + } + } +} + +static void +ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) +{ + ntq_setup_registers(c, &impl->registers); + ntq_emit_cf_list(c, &impl->body); +} + +static void +nir_to_vir(struct v3d_compile *c) +{ + if (c->s->stage == MESA_SHADER_FRAGMENT) { + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); + c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); + + if (c->s->info.fs.uses_discard) + c->discard = vir_MOV(c, vir_uniform_ui(c, 0)); + + if (c->fs_key->is_points) { + c->point_x = emit_fragment_varying(c, NULL, 0); + c->point_y = emit_fragment_varying(c, NULL, 0); + } else if (c->fs_key->is_lines) { + c->line_x = emit_fragment_varying(c, NULL, 0); + } + } + + ntq_setup_inputs(c); + ntq_setup_outputs(c); + ntq_setup_uniforms(c); + ntq_setup_registers(c, &c->s->registers); + + /* Find the main function and emit the body. 
*/ + nir_foreach_function(function, c->s) { + assert(strcmp(function->name, "main") == 0); + assert(function->impl); + ntq_emit_impl(c, function->impl); + } +} + +const nir_shader_compiler_options v3d_nir_options = { + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_bitfield_insert = true, + .lower_bitfield_extract = true, + .lower_ffma = true, + .lower_flrp32 = true, + .lower_fpow = true, + .lower_fsat = true, + .lower_fsqrt = true, + .lower_negate = true, + .native_integers = true, +}; + + +#if 0 +static int +count_nir_instrs(nir_shader *nir) +{ + int count = 0; + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) + count++; + } + } + return count; +} +#endif + +void +v3d_nir_to_vir(struct v3d_compile *c) +{ + if (V3D_DEBUG & (V3D_DEBUG_NIR | + v3d_debug_flag_for_shader_stage(c->s->stage))) { + fprintf(stderr, "%s prog %d/%d NIR:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + nir_print_shader(c->s, stderr); + } + + nir_to_vir(c); + + switch (c->s->stage) { + case MESA_SHADER_FRAGMENT: + emit_frag_end(c); + break; + case MESA_SHADER_VERTEX: + emit_vert_end(c); + break; + default: + unreachable("bad stage"); + } + + if (V3D_DEBUG & (V3D_DEBUG_VIR | + v3d_debug_flag_for_shader_stage(c->s->stage))) { + fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + vir_dump(c); + fprintf(stderr, "\n"); + } + + vir_optimize(c); + vir_lower_uniforms(c); + + /* XXX: vir_schedule_instructions(c); */ + + if (V3D_DEBUG & (V3D_DEBUG_VIR | + v3d_debug_flag_for_shader_stage(c->s->stage))) { + fprintf(stderr, "%s prog %d/%d VIR:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + vir_dump(c); + fprintf(stderr, "\n"); + } + + v3d_vir_to_qpu(c); +} diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c new file mode 100644 index 00000000000..b5a0aa9a34a --- /dev/null +++ b/src/broadcom/compiler/qpu_schedule.c @@ -0,0 +1,1362 @@ +/* + * Copyright © 2010 Intel Corporation + * Copyright © 2014-2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file + * + * The basic model of the list scheduler is to take a basic block, compute a + * DAG of the dependencies, and make a list of the DAG heads. 
Heuristically + * pick a DAG head, then put all the children that are now DAG heads into the + * list of things to schedule. + * + * The goal of scheduling here is to pack pairs of operations together in a + * single QPU instruction. + */ + +#include "qpu/qpu_disasm.h" +#include "v3d_compiler.h" +#include "util/ralloc.h" + +static bool debug; + +struct schedule_node_child; + +struct schedule_node { + struct list_head link; + struct qinst *inst; + struct schedule_node_child *children; + uint32_t child_count; + uint32_t child_array_size; + uint32_t parent_count; + + /* Longest cycles + instruction_latency() of any parent of this node. */ + uint32_t unblocked_time; + + /** + * Minimum number of cycles from scheduling this instruction until the + * end of the program, based on the slowest dependency chain through + * the children. + */ + uint32_t delay; + + /** + * cycles between this instruction being scheduled and when its result + * can be consumed. + */ + uint32_t latency; +}; + +struct schedule_node_child { + struct schedule_node *node; + bool write_after_read; +}; + +/* When walking the instructions in reverse, we need to swap before/after in + * add_dep(). + */ +enum direction { F, R }; + +struct schedule_state { + struct schedule_node *last_r[6]; + struct schedule_node *last_rf[64]; + struct schedule_node *last_sf; + struct schedule_node *last_vpm_read; + struct schedule_node *last_tmu_write; + struct schedule_node *last_tlb; + struct schedule_node *last_vpm; + struct schedule_node *last_unif; + struct schedule_node *last_rtop; + enum direction dir; + /* Estimated cycle when the current instruction would start. */ + uint32_t time; +}; + +static void +add_dep(struct schedule_state *state, + struct schedule_node *before, + struct schedule_node *after, + bool write) +{ + bool write_after_read = !write && state->dir == R; + + if (!before || !after) + return; + + assert(before != after); + + if (state->dir == R) { + struct schedule_node *t = before; + before = after; + after = t; + } + + for (int i = 0; i < before->child_count; i++) { + if (before->children[i].node == after && + (before->children[i].write_after_read == write_after_read)) { + return; + } + } + + if (before->child_array_size <= before->child_count) { + before->child_array_size = MAX2(before->child_array_size * 2, 16); + before->children = reralloc(before, before->children, + struct schedule_node_child, + before->child_array_size); + } + + before->children[before->child_count].node = after; + before->children[before->child_count].write_after_read = + write_after_read; + before->child_count++; + after->parent_count++; +} + +static void +add_read_dep(struct schedule_state *state, + struct schedule_node *before, + struct schedule_node *after) +{ + add_dep(state, before, after, false); +} + +static void +add_write_dep(struct schedule_state *state, + struct schedule_node **before, + struct schedule_node *after) +{ + add_dep(state, *before, after, true); + *before = after; +} + +static bool +qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) +{ + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + + if (inst->alu.add.magic_write && + (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || + inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) + return true; + + if (inst->alu.mul.magic_write && + (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || + inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) + return true; + + return false; +} + +static void +process_mux_deps(struct schedule_state *state, struct schedule_node *n, + enum v3d_qpu_mux mux) +{ + switch 
(mux) { + case V3D_QPU_MUX_A: + add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); + break; + case V3D_QPU_MUX_B: + add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); + break; + default: + add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); + break; + } +} + + +static void +process_waddr_deps(struct schedule_state *state, struct schedule_node *n, + uint32_t waddr, bool magic) +{ + if (!magic) { + add_write_dep(state, &state->last_rf[waddr], n); + } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) { + add_write_dep(state, &state->last_tmu_write, n); + } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) { + /* Handled by v3d_qpu_writes_r4() check. */ + } else { + switch (waddr) { + case V3D_QPU_WADDR_R0: + case V3D_QPU_WADDR_R1: + case V3D_QPU_WADDR_R2: + case V3D_QPU_WADDR_R3: + case V3D_QPU_WADDR_R4: + case V3D_QPU_WADDR_R5: + add_write_dep(state, + &state->last_r[waddr - V3D_QPU_WADDR_R0], + n); + break; + + case V3D_QPU_WADDR_VPM: + case V3D_QPU_WADDR_VPMU: + add_write_dep(state, &state->last_vpm, n); + break; + + case V3D_QPU_WADDR_TLB: + case V3D_QPU_WADDR_TLBU: + add_write_dep(state, &state->last_tlb, n); + break; + + case V3D_QPU_WADDR_NOP: + break; + + default: + fprintf(stderr, "Unknown waddr %d\n", waddr); + abort(); + } + } +} + +static void +process_cond_deps(struct schedule_state *state, struct schedule_node *n, + enum v3d_qpu_cond cond) +{ + if (cond != V3D_QPU_COND_NONE) + add_read_dep(state, state->last_sf, n); +} + +static void +process_pf_deps(struct schedule_state *state, struct schedule_node *n, + enum v3d_qpu_pf pf) +{ + if (pf != V3D_QPU_PF_NONE) + add_write_dep(state, &state->last_sf, n); +} + +static void +process_uf_deps(struct schedule_state *state, struct schedule_node *n, + enum v3d_qpu_uf uf) +{ + if (uf != V3D_QPU_UF_NONE) + add_write_dep(state, &state->last_sf, n); +} + +/** + * Common code for dependencies that need to be tracked both forward and + * backward. + * + * This is for things like "all reads of r4 have to happen between the r4 + * writes that surround them". + */ +static void +calculate_deps(struct schedule_state *state, struct schedule_node *n) +{ + struct qinst *qinst = n->inst; + struct v3d_qpu_instr *inst = &qinst->qpu; + + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { + if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) + add_read_dep(state, state->last_sf, n); + + /* XXX: BDI */ + /* XXX: BDU */ + /* XXX: ub */ + /* XXX: raddr_a */ + + add_write_dep(state, &state->last_unif, n); + return; + } + + assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); + + /* XXX: LOAD_IMM */ + + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) + process_mux_deps(state, n, inst->alu.add.a); + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) + process_mux_deps(state, n, inst->alu.add.b); + + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) + process_mux_deps(state, n, inst->alu.mul.a); + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) + process_mux_deps(state, n, inst->alu.mul.b); + + switch (inst->alu.add.op) { + case V3D_QPU_A_VPMSETUP: + /* Could distinguish read/write by unpacking the uniform. 
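+ * Until we do, VPMSETUP is conservatively treated as a write to both
+ * the VPM write and VPM read state below, so it orders against every
+ * other VPM access.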
*/ + add_write_dep(state, &state->last_vpm, n); + add_write_dep(state, &state->last_vpm_read, n); + break; + + case V3D_QPU_A_STVPMV: + case V3D_QPU_A_STVPMD: + case V3D_QPU_A_STVPMP: + add_write_dep(state, &state->last_vpm, n); + break; + + case V3D_QPU_A_MSF: + add_read_dep(state, state->last_tlb, n); + break; + + case V3D_QPU_A_SETMSF: + case V3D_QPU_A_SETREVF: + add_write_dep(state, &state->last_tlb, n); + break; + + case V3D_QPU_A_FLAPUSH: + case V3D_QPU_A_FLBPUSH: + case V3D_QPU_A_VFLA: + case V3D_QPU_A_VFLNA: + case V3D_QPU_A_VFLB: + case V3D_QPU_A_VFLNB: + add_read_dep(state, state->last_sf, n); + break; + + case V3D_QPU_A_FLBPOP: + add_write_dep(state, &state->last_sf, n); + break; + + default: + break; + } + + switch (inst->alu.mul.op) { + case V3D_QPU_M_MULTOP: + case V3D_QPU_M_UMUL24: + /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and + * resets it to 0. We could possibly reorder umul24s relative + * to each other, but for now just keep all the MUL parts in + * order. + */ + add_write_dep(state, &state->last_rtop, n); + break; + default: + break; + } + + if (inst->alu.add.op != V3D_QPU_A_NOP) { + process_waddr_deps(state, n, inst->alu.add.waddr, + inst->alu.add.magic_write); + } + if (inst->alu.mul.op != V3D_QPU_M_NOP) { + process_waddr_deps(state, n, inst->alu.mul.waddr, + inst->alu.mul.magic_write); + } + + if (v3d_qpu_writes_r3(inst)) + add_write_dep(state, &state->last_r[3], n); + if (v3d_qpu_writes_r4(inst)) + add_write_dep(state, &state->last_r[4], n); + if (v3d_qpu_writes_r5(inst)) + add_write_dep(state, &state->last_r[5], n); + + if (inst->sig.thrsw) { + /* All accumulator contents and flags are undefined after the + * switch. + */ + for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) + add_write_dep(state, &state->last_r[i], n); + add_write_dep(state, &state->last_sf, n); + + /* Scoreboard-locking operations have to stay after the last + * thread switch. + */ + add_write_dep(state, &state->last_tlb, n); + + add_write_dep(state, &state->last_tmu_write, n); + } + + if (inst->sig.ldtmu) { + /* TMU loads are coming from a FIFO, so ordering is important. 
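+ * Treating each ldtmu as a write to last_tmu_write keeps the result
+ * loads in the same order as the TMU requests that produced them.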
+ */ + add_write_dep(state, &state->last_tmu_write, n); + } + + if (inst->sig.ldtlb | inst->sig.ldtlbu) + add_read_dep(state, state->last_tlb, n); + + if (inst->sig.ldvpm) + add_write_dep(state, &state->last_vpm_read, n); + + /* inst->sig.ldunif or sideband uniform read */ + if (qinst->uniform != ~0) + add_write_dep(state, &state->last_unif, n); + + process_cond_deps(state, n, inst->flags.ac); + process_cond_deps(state, n, inst->flags.mc); + process_pf_deps(state, n, inst->flags.apf); + process_pf_deps(state, n, inst->flags.mpf); + process_uf_deps(state, n, inst->flags.auf); + process_uf_deps(state, n, inst->flags.muf); +} + +static void +calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list) +{ + struct schedule_state state; + + memset(&state, 0, sizeof(state)); + state.dir = F; + + list_for_each_entry(struct schedule_node, node, schedule_list, link) + calculate_deps(&state, node); +} + +static void +calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list) +{ + struct list_head *node; + struct schedule_state state; + + memset(&state, 0, sizeof(state)); + state.dir = R; + + for (node = schedule_list->prev; schedule_list != node; node = node->prev) { + calculate_deps(&state, (struct schedule_node *)node); + } +} + +struct choose_scoreboard { + int tick; + int last_sfu_write_tick; + int last_ldvary_tick; + int last_uniforms_reset_tick; + uint32_t last_waddr_add, last_waddr_mul; + bool tlb_locked; +}; + +static bool +mux_reads_too_soon(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) +{ + switch (mux) { + case V3D_QPU_MUX_A: + if (scoreboard->last_waddr_add == inst->raddr_a || + scoreboard->last_waddr_mul == inst->raddr_a) { + return true; + } + break; + + case V3D_QPU_MUX_B: + if (scoreboard->last_waddr_add == inst->raddr_b || + scoreboard->last_waddr_mul == inst->raddr_b) { + return true; + } + break; + + case V3D_QPU_MUX_R4: + if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2) + return true; + break; + + case V3D_QPU_MUX_R5: + if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) + return true; + break; + default: + break; + } + + return false; +} + +static bool +reads_too_soon_after_write(struct choose_scoreboard *scoreboard, + struct qinst *qinst) +{ + const struct v3d_qpu_instr *inst = &qinst->qpu; + + /* XXX: Branching off of raddr. */ + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) + return false; + + assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); + + if (inst->alu.add.op != V3D_QPU_A_NOP) { + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && + mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { + return true; + } + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && + mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { + return true; + } + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP) { + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && + mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { + return true; + } + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && + mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { + return true; + } + } + + /* XXX: imm */ + + return false; +} + +static bool +writes_too_soon_after_write(struct choose_scoreboard *scoreboard, + struct qinst *qinst) +{ + const struct v3d_qpu_instr *inst = &qinst->qpu; + + /* Don't schedule any other r4 write too soon after an SFU write. + * This would normally be prevented by dependency tracking, but might + * occur if a dead SFU computation makes it to scheduling. 
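+ *
+ * For example (illustrative pseudo-assembly only):
+ *
+ *     mov recip, x     <- dead: its r4 result is never read
+ *     mov r4, y        <- must still stay 2 instructions away
+ *
+ * Since nothing reads the first r4 value, no dependency edge exists to
+ * keep the second write out of the SFU result's landing slot.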
+ */ + if (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 && + v3d_qpu_writes_r4(inst)) + return true; + + return false; +} + +static bool +pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) +{ + return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); +} + +static int +get_instruction_priority(const struct v3d_qpu_instr *inst) +{ + uint32_t baseline_score; + uint32_t next_score = 0; + + /* Schedule TLB operations as late as possible, to get more + * parallelism between shaders. + */ + if (qpu_inst_is_tlb(inst)) + return next_score; + next_score++; + + /* Schedule texture read results collection late to hide latency. */ + if (inst->sig.ldtmu) + return next_score; + next_score++; + + /* Default score for things that aren't otherwise special. */ + baseline_score = next_score; + next_score++; + + /* Schedule texture read setup early to hide their latency better. */ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU && + ((inst->alu.add.magic_write && + v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) || + (inst->alu.mul.magic_write && + v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)))) { + return next_score; + } + next_score++; + + return baseline_score; +} + +static bool +qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr) +{ + return (v3d_qpu_magic_waddr_is_tmu(waddr) || + v3d_qpu_magic_waddr_is_sfu(waddr) || + v3d_qpu_magic_waddr_is_tlb(waddr) || + v3d_qpu_magic_waddr_is_vpm(waddr) || + v3d_qpu_magic_waddr_is_tsy(waddr)); +} + +static bool +qpu_accesses_peripheral(const struct v3d_qpu_instr *inst) +{ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && + qpu_magic_waddr_is_periph(inst->alu.add.waddr)) { + return true; + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && + qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) { + return true; + } + } + + return (inst->sig.ldvpm || + inst->sig.ldtmu || + inst->sig.ldtlb || + inst->sig.ldtlbu); +} + +static bool +qpu_merge_inst(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *result, + const struct v3d_qpu_instr *a, + const struct v3d_qpu_instr *b) +{ + if (a->type != V3D_QPU_INSTR_TYPE_ALU || + b->type != V3D_QPU_INSTR_TYPE_ALU) { + return false; + } + + /* Can't do more than one peripheral access in an instruction. 
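+ * (The TMU, SFU, TLB, VPM and TSY units all count as peripherals here,
+ * as do the ld* signals that read them back.)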
*/ + if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b)) + return false; + + struct v3d_qpu_instr merge = *a; + + if (b->alu.add.op != V3D_QPU_A_NOP) { + if (a->alu.add.op != V3D_QPU_A_NOP) + return false; + merge.alu.add = b->alu.add; + + merge.flags.ac = b->flags.ac; + merge.flags.apf = b->flags.apf; + merge.flags.auf = b->flags.auf; + } + + if (b->alu.mul.op != V3D_QPU_M_NOP) { + if (a->alu.mul.op != V3D_QPU_M_NOP) + return false; + merge.alu.mul = b->alu.mul; + + merge.flags.mc = b->flags.mc; + merge.flags.mpf = b->flags.mpf; + merge.flags.muf = b->flags.muf; + } + + if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) { + if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) && + a->raddr_a != b->raddr_a) { + return false; + } + merge.raddr_a = b->raddr_a; + } + + if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) { + if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) && + a->raddr_b != b->raddr_b) { + return false; + } + merge.raddr_b = b->raddr_b; + } + + merge.sig.thrsw |= b->sig.thrsw; + merge.sig.ldunif |= b->sig.ldunif; + merge.sig.ldtmu |= b->sig.ldtmu; + merge.sig.ldvary |= b->sig.ldvary; + merge.sig.ldvpm |= b->sig.ldvpm; + merge.sig.small_imm |= b->sig.small_imm; + merge.sig.ldtlb |= b->sig.ldtlb; + merge.sig.ldtlbu |= b->sig.ldtlbu; + merge.sig.ucb |= b->sig.ucb; + merge.sig.rotate |= b->sig.rotate; + merge.sig.wrtmuc |= b->sig.wrtmuc; + + uint64_t packed; + bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed); + + *result = merge; + /* No modifying the real instructions on failure. */ + assert(ok || (a != result && b != result)); + + return ok; +} + +static struct schedule_node * +choose_instruction_to_schedule(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, + struct list_head *schedule_list, + struct schedule_node *prev_inst) +{ + struct schedule_node *chosen = NULL; + int chosen_prio = 0; + + /* Don't pair up anything with a thread switch signal -- emit_thrsw() + * will handle pairing it along with filling the delay slots. + */ + if (prev_inst) { + if (prev_inst->inst->qpu.sig.thrsw) + return NULL; + } + + list_for_each_entry(struct schedule_node, n, schedule_list, link) { + const struct v3d_qpu_instr *inst = &n->inst->qpu; + + /* Don't choose the branch instruction until it's the last one + * left. We'll move it up to fit its delay slots after we + * choose it. + */ + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && + !list_is_singular(schedule_list)) { + continue; + } + + /* "An instruction must not read from a location in physical + * regfile A or B that was written to by the previous + * instruction." + */ + if (reads_too_soon_after_write(scoreboard, n->inst)) + continue; + + if (writes_too_soon_after_write(scoreboard, n->inst)) + continue; + + /* "A scoreboard wait must not occur in the first two + * instructions of a fragment shader. This is either the + * explicit Wait for Scoreboard signal or an implicit wait + * with the first tile-buffer read or write instruction." + */ + if (pixel_scoreboard_too_soon(scoreboard, inst)) + continue; + + /* ldunif and ldvary both write r5, but ldunif does so a tick + * sooner. If the ldvary's r5 wasn't used, then ldunif might + * otherwise get scheduled so ldunif and ldvary try to update + * r5 in the same tick. + */ + if (inst->sig.ldunif && + scoreboard->tick == scoreboard->last_ldvary_tick + 1) { + continue; + } + + /* If we're trying to pair with another instruction, check + * that they're compatible. + */ + if (prev_inst) { + /* Don't pair up a thread switch signal -- we'll + * handle pairing it when we pick it on its own. 
+ */ + if (inst->sig.thrsw) + continue; + + if (prev_inst->inst->uniform != -1 && + n->inst->uniform != -1) + continue; + + /* Don't merge in something that will lock the TLB. + * Hopwefully what we have in inst will release some + * other instructions, allowing us to delay the + * TLB-locking instruction until later. + */ + if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst)) + continue; + + struct v3d_qpu_instr merged_inst; + if (!qpu_merge_inst(devinfo, &merged_inst, + &prev_inst->inst->qpu, inst)) { + continue; + } + } + + int prio = get_instruction_priority(inst); + + /* Found a valid instruction. If nothing better comes along, + * this one works. + */ + if (!chosen) { + chosen = n; + chosen_prio = prio; + continue; + } + + if (prio > chosen_prio) { + chosen = n; + chosen_prio = prio; + } else if (prio < chosen_prio) { + continue; + } + + if (n->delay > chosen->delay) { + chosen = n; + chosen_prio = prio; + } else if (n->delay < chosen->delay) { + continue; + } + } + + return chosen; +} + +static void +update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, + enum v3d_qpu_waddr waddr) +{ + if (v3d_qpu_magic_waddr_is_sfu(waddr)) + scoreboard->last_sfu_write_tick = scoreboard->tick; +} + +static void +update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) +{ + scoreboard->last_waddr_add = ~0; + scoreboard->last_waddr_mul = ~0; + + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) + return; + + assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); + + if (inst->alu.add.op != V3D_QPU_A_NOP) { + if (inst->alu.add.magic_write) { + update_scoreboard_for_magic_waddr(scoreboard, + inst->alu.add.waddr); + } else { + scoreboard->last_waddr_add = inst->alu.add.waddr; + } + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP) { + if (inst->alu.mul.magic_write) { + update_scoreboard_for_magic_waddr(scoreboard, + inst->alu.mul.waddr); + } else { + scoreboard->last_waddr_mul = inst->alu.mul.waddr; + } + } + + if (inst->sig.ldvary) + scoreboard->last_ldvary_tick = scoreboard->tick; + + if (qpu_inst_is_tlb(inst)) + scoreboard->tlb_locked = true; +} + +static void +dump_state(const struct v3d_device_info *devinfo, + struct list_head *schedule_list) +{ + list_for_each_entry(struct schedule_node, n, schedule_list, link) { + fprintf(stderr, " t=%4d: ", n->unblocked_time); + v3d_qpu_dump(devinfo, &n->inst->qpu); + fprintf(stderr, "\n"); + + for (int i = 0; i < n->child_count; i++) { + struct schedule_node *child = n->children[i].node; + if (!child) + continue; + + fprintf(stderr, " - "); + v3d_qpu_dump(devinfo, &child->inst->qpu); + fprintf(stderr, " (%d parents, %c)\n", + child->parent_count, + n->children[i].write_after_read ? 'w' : 'r'); + } + } +} + +static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr, + const struct v3d_qpu_instr *after) +{ + /* Apply some huge latency between texture fetch requests and getting + * their results back. + * + * FIXME: This is actually pretty bogus. If we do: + * + * mov tmu0_s, a + * + * mov tmu0_s, b + * load_tmu0 + * + * load_tmu0 + * + * we count that as worse than + * + * mov tmu0_s, a + * mov tmu0_s, b + * + * load_tmu0 + * + * load_tmu0 + * + * because we associate the first load_tmu0 with the *second* tmu0_s. + */ + if (v3d_qpu_magic_waddr_is_tmu(waddr) && after->sig.ldtmu) + return 100; + + /* Assume that anything depending on us is consuming the SFU result. 
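+ * The r4 value only lands a couple of instructions after the SFU
+ * write, so report a small extra latency here to encourage the
+ * scheduler to fill that gap with independent work.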
*/ + if (v3d_qpu_magic_waddr_is_sfu(waddr)) + return 3; + + return 1; +} + +static uint32_t +instruction_latency(struct schedule_node *before, struct schedule_node *after) +{ + const struct v3d_qpu_instr *before_inst = &before->inst->qpu; + const struct v3d_qpu_instr *after_inst = &after->inst->qpu; + uint32_t latency = 1; + + if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU || + after_inst->type != V3D_QPU_INSTR_TYPE_ALU) + return latency; + + if (before_inst->alu.add.magic_write) { + latency = MAX2(latency, + magic_waddr_latency(before_inst->alu.add.waddr, + after_inst)); + } + + if (before_inst->alu.mul.magic_write) { + latency = MAX2(latency, + magic_waddr_latency(before_inst->alu.mul.waddr, + after_inst)); + } + + return latency; +} + +/** Recursive computation of the delay member of a node. */ +static void +compute_delay(struct schedule_node *n) +{ + if (!n->child_count) { + n->delay = 1; + } else { + for (int i = 0; i < n->child_count; i++) { + if (!n->children[i].node->delay) + compute_delay(n->children[i].node); + n->delay = MAX2(n->delay, + n->children[i].node->delay + + instruction_latency(n, n->children[i].node)); + } + } +} + +static void +mark_instruction_scheduled(struct list_head *schedule_list, + uint32_t time, + struct schedule_node *node, + bool war_only) +{ + if (!node) + return; + + for (int i = node->child_count - 1; i >= 0; i--) { + struct schedule_node *child = + node->children[i].node; + + if (!child) + continue; + + if (war_only && !node->children[i].write_after_read) + continue; + + /* If the requirement is only that the node not appear before + * the last read of its destination, then it can be scheduled + * immediately after (or paired with!) the thing reading the + * destination. + */ + uint32_t latency = 0; + if (!war_only) { + latency = instruction_latency(node, + node->children[i].node); + } + + child->unblocked_time = MAX2(child->unblocked_time, + time + latency); + child->parent_count--; + if (child->parent_count == 0) + list_add(&child->link, schedule_list); + + node->children[i].node = NULL; + } +} + +static struct qinst * +vir_nop() +{ + struct qreg undef = { QFILE_NULL, 0 }; + struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); + + return qinst; +} + +#if 0 +static struct qinst * +nop_after(struct qinst *inst) +{ + struct qinst *q = vir_nop(); + + list_add(&q->link, &inst->link); + + return q; +} + +/** + * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair + * with another instruction. + */ +static void +emit_thrsw(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) +{ + /* There should be nothing in a thrsw inst being scheduled other than + * the signal bits. + */ + assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); + assert(inst->alu.add.op == V3D_QPU_A_NOP); + assert(inst->alu.mul.op == V3D_QPU_M_NOP); + + /* Try to find an earlier scheduled instruction that we can merge the + * thrsw into. + */ + int thrsw_ip = c->qpu_inst_count; + for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) { + uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i]; + uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG); + + if (prev_sig == QPU_SIG_NONE) + thrsw_ip = c->qpu_inst_count - i; + } + + if (thrsw_ip != c->qpu_inst_count) { + /* Merge the thrsw into the existing instruction. 
*/ + c->qpu_insts[thrsw_ip] = + QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG); + } else { + qpu_serialize_one_inst(c, inst); + update_scoreboard_for_chosen(scoreboard, inst); + } + + /* Fill the delay slots. */ + while (c->qpu_inst_count < thrsw_ip + 3) { + update_scoreboard_for_chosen(scoreboard, v3d_qpu_nop()); + qpu_serialize_one_inst(c, v3d_qpu_nop()); + } +} +#endif + +static uint32_t +schedule_instructions(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, + struct qblock *block, + struct list_head *schedule_list, + enum quniform_contents *orig_uniform_contents, + uint32_t *orig_uniform_data, + uint32_t *next_uniform) +{ + const struct v3d_device_info *devinfo = c->devinfo; + uint32_t time = 0; + + if (debug) { + fprintf(stderr, "initial deps:\n"); + dump_state(devinfo, schedule_list); + fprintf(stderr, "\n"); + } + + /* Remove non-DAG heads from the list. */ + list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) { + if (n->parent_count != 0) + list_del(&n->link); + } + + while (!list_empty(schedule_list)) { + struct schedule_node *chosen = + choose_instruction_to_schedule(devinfo, + scoreboard, + schedule_list, + NULL); + struct schedule_node *merge = NULL; + + /* If there are no valid instructions to schedule, drop a NOP + * in. + */ + struct qinst *qinst = chosen ? chosen->inst : vir_nop(); + struct v3d_qpu_instr *inst = &qinst->qpu; + + if (debug) { + fprintf(stderr, "t=%4d: current list:\n", + time); + dump_state(devinfo, schedule_list); + fprintf(stderr, "t=%4d: chose: ", time); + v3d_qpu_dump(devinfo, inst); + fprintf(stderr, "\n"); + } + + /* Schedule this instruction onto the QPU list. Also try to + * find an instruction to pair with it. + */ + if (chosen) { + time = MAX2(chosen->unblocked_time, time); + list_del(&chosen->link); + mark_instruction_scheduled(schedule_list, time, + chosen, true); + + merge = choose_instruction_to_schedule(devinfo, + scoreboard, + schedule_list, + chosen); + if (merge) { + time = MAX2(merge->unblocked_time, time); + list_del(&merge->link); + (void)qpu_merge_inst(devinfo, inst, + inst, &merge->inst->qpu); + if (merge->inst->uniform != -1) { + chosen->inst->uniform = + merge->inst->uniform; + } + + if (debug) { + fprintf(stderr, "t=%4d: merging: ", + time); + v3d_qpu_dump(devinfo, &merge->inst->qpu); + fprintf(stderr, "\n"); + fprintf(stderr, " result: "); + v3d_qpu_dump(devinfo, inst); + fprintf(stderr, "\n"); + } + } + } + + /* Update the uniform index for the rewritten location -- + * branch target updating will still need to change + * c->uniform_data[] using this index. + */ + if (qinst->uniform != -1) { + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) + block->branch_uniform = *next_uniform; + + c->uniform_data[*next_uniform] = + orig_uniform_data[qinst->uniform]; + c->uniform_contents[*next_uniform] = + orig_uniform_contents[qinst->uniform]; + qinst->uniform = *next_uniform; + (*next_uniform)++; + } + + if (debug) { + fprintf(stderr, "\n"); + } + + /* Now that we've scheduled a new instruction, some of its + * children can be promoted to the list of instructions ready to + * be scheduled. Update the children's unblocked time for this + * DAG edge as we do so. + */ + mark_instruction_scheduled(schedule_list, time, chosen, false); + + if (merge) { + mark_instruction_scheduled(schedule_list, time, merge, + false); + + /* The merged VIR instruction doesn't get re-added to the + * block, so free it now. 
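+ * (qpu_merge_inst() already copied its op, flags and signals into the
+ * chosen instruction, and its uniform index was carried over above.)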
+ */ + free(merge->inst); + } + + if (0 && inst->sig.thrsw) { + /* XXX emit_thrsw(c, scoreboard, qinst); */ + } else { + c->qpu_inst_count++; + list_addtail(&qinst->link, &block->instructions); + update_scoreboard_for_chosen(scoreboard, inst); + } + + scoreboard->tick++; + time++; + + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH || + inst->sig.thrsw /* XXX */) { + block->branch_qpu_ip = c->qpu_inst_count - 1; + /* Fill the delay slots. + * + * We should fill these with actual instructions, + * instead, but that will probably need to be done + * after this, once we know what the leading + * instructions of the successors are (so we can + * handle A/B register file write latency) + */ + /* XXX: scoreboard */ + int slots = (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ? + 3 : 2); + for (int i = 0; i < slots; i++) { + struct qinst *nop = vir_nop(); + list_addtail(&nop->link, &block->instructions); + + update_scoreboard_for_chosen(scoreboard, + &nop->qpu); + c->qpu_inst_count++; + scoreboard->tick++; + time++; + } + } + } + + return time; +} + +static uint32_t +qpu_schedule_instructions_block(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, + struct qblock *block, + enum quniform_contents *orig_uniform_contents, + uint32_t *orig_uniform_data, + uint32_t *next_uniform) +{ + void *mem_ctx = ralloc_context(NULL); + struct list_head schedule_list; + + list_inithead(&schedule_list); + + /* Wrap each instruction in a scheduler structure. */ + while (!list_empty(&block->instructions)) { + struct qinst *qinst = (struct qinst *)block->instructions.next; + struct schedule_node *n = + rzalloc(mem_ctx, struct schedule_node); + + n->inst = qinst; + + list_del(&qinst->link); + list_addtail(&n->link, &schedule_list); + } + + calculate_forward_deps(c, &schedule_list); + calculate_reverse_deps(c, &schedule_list); + + list_for_each_entry(struct schedule_node, n, &schedule_list, link) { + compute_delay(n); + } + + uint32_t cycles = schedule_instructions(c, scoreboard, block, + &schedule_list, + orig_uniform_contents, + orig_uniform_data, + next_uniform); + + ralloc_free(mem_ctx); + + return cycles; +} + +static void +qpu_set_branch_targets(struct v3d_compile *c) +{ + vir_for_each_block(block, c) { + /* The end block of the program has no branch. */ + if (!block->successors[0]) + continue; + + /* If there was no branch instruction, then the successor + * block must follow immediately after this one. + */ + if (block->branch_qpu_ip == ~0) { + assert(block->end_qpu_ip + 1 == + block->successors[0]->start_qpu_ip); + continue; + } + + /* Walk back through the delay slots to find the branch + * instr. + */ + struct list_head *entry = block->instructions.prev; + for (int i = 0; i < 3; i++) + entry = entry->prev; + struct qinst *branch = container_of(entry, branch, link); + assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); + + /* Make sure that the if-we-don't-jump + * successor was scheduled just after the + * delay slots. + */ + assert(!block->successors[1] || + block->successors[1]->start_qpu_ip == + block->branch_qpu_ip + 4); + + branch->qpu.branch.offset = + ((block->successors[0]->start_qpu_ip - + (block->branch_qpu_ip + 4)) * + sizeof(uint64_t)); + + /* Set up the relative offset to jump in the + * uniform stream. + * + * Use a temporary here, because + * uniform_data[inst->uniform] may be shared + * between multiple instructions. 
+ */ + assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT); + c->uniform_data[branch->uniform] = + (block->successors[0]->start_uniform - + (block->branch_uniform + 1)) * 4; + } +} + +uint32_t +v3d_qpu_schedule_instructions(struct v3d_compile *c) +{ + const struct v3d_device_info *devinfo = c->devinfo; + + /* We reorder the uniforms as we schedule instructions, so save the + * old data off and replace it. + */ + uint32_t *uniform_data = c->uniform_data; + enum quniform_contents *uniform_contents = c->uniform_contents; + c->uniform_contents = ralloc_array(c, enum quniform_contents, + c->num_uniforms); + c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms); + c->uniform_array_size = c->num_uniforms; + uint32_t next_uniform = 0; + + struct choose_scoreboard scoreboard; + memset(&scoreboard, 0, sizeof(scoreboard)); + scoreboard.last_waddr_add = ~0; + scoreboard.last_waddr_mul = ~0; + scoreboard.last_ldvary_tick = -10; + scoreboard.last_sfu_write_tick = -10; + scoreboard.last_uniforms_reset_tick = -10; + + if (debug) { + fprintf(stderr, "Pre-schedule instructions\n"); + vir_for_each_block(block, c) { + fprintf(stderr, "BLOCK %d\n", block->index); + list_for_each_entry(struct qinst, qinst, + &block->instructions, link) { + v3d_qpu_dump(devinfo, &qinst->qpu); + fprintf(stderr, "\n"); + } + } + fprintf(stderr, "\n"); + } + + uint32_t cycles = 0; + vir_for_each_block(block, c) { + block->start_qpu_ip = c->qpu_inst_count; + block->branch_qpu_ip = ~0; + block->start_uniform = next_uniform; + + cycles += qpu_schedule_instructions_block(c, + &scoreboard, + block, + uniform_contents, + uniform_data, + &next_uniform); + + block->end_qpu_ip = c->qpu_inst_count - 1; + } + + qpu_set_branch_targets(c); + + assert(next_uniform == c->num_uniforms); + + return cycles; +} diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c new file mode 100644 index 00000000000..d99d76a8beb --- /dev/null +++ b/src/broadcom/compiler/qpu_validate.c @@ -0,0 +1,208 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file + * + * Validates the QPU instruction sequence after register allocation and + * scheduling. 
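+ *
+ * The checks below cover the LDVARY/LDUNIF r5 hazard, r4 reads and
+ * writes (or further SFU requests) issued too soon after an SFU write,
+ * and packing more than one peripheral (TMU/SFU/VPM/TLB/TSY) access
+ * into a single instruction.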
+ */ + +#include +#include +#include +#include "v3d_compiler.h" +#include "qpu/qpu_disasm.h" + +struct v3d_qpu_validate_state { + struct v3d_compile *c; + const struct v3d_qpu_instr *last; + int ip; + int last_sfu_write; +}; + +static void +fail_instr(struct v3d_qpu_validate_state *state, const char *msg) +{ + struct v3d_compile *c = state->c; + + fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg); + + int dump_ip = 0; + vir_for_each_inst_inorder(inst, c) { + v3d_qpu_dump(c->devinfo, &inst->qpu); + + if (dump_ip++ == state->ip) + fprintf(stderr, " *** ERROR ***"); + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + abort(); +} + +static bool +qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst, + bool (*predicate)(enum v3d_qpu_waddr waddr)) +{ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) + return false; + + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && + predicate(inst->alu.add.waddr)) + return true; + + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && + predicate(inst->alu.mul.waddr)) + return true; + + return false; +} + +static void +qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) +{ + const struct v3d_qpu_instr *inst = &qinst->qpu; + + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return; + + /* LDVARY writes r5 two instructions later and LDUNIF writes + * r5 one instruction later, which is illegal to have + * together. + */ + if (state->last && state->last->sig.ldvary && inst->sig.ldunif) { + fail_instr(state, "LDUNIF after a LDVARY"); + } + + int tmu_writes = 0; + int sfu_writes = 0; + int vpm_writes = 0; + int tlb_writes = 0; + int tsy_writes = 0; + + if (inst->alu.add.op != V3D_QPU_A_NOP) { + if (inst->alu.add.magic_write) { + if (v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) + tmu_writes++; + if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) + sfu_writes++; + if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr)) + vpm_writes++; + if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr)) + tlb_writes++; + if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) + tsy_writes++; + } + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP) { + if (inst->alu.mul.magic_write) { + if (v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)) + tmu_writes++; + if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr)) + sfu_writes++; + if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr)) + vpm_writes++; + if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr)) + tlb_writes++; + if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr)) + tsy_writes++; + } + } + + (void)qpu_magic_waddr_matches; /* XXX */ + + /* SFU r4 results come back two instructions later. No doing + * r4 read/writes or other SFU lookups until it's done. + */ + if (state->ip - state->last_sfu_write < 2) { + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4)) + fail_instr(state, "R4 read too soon after SFU"); + + if (v3d_qpu_writes_r4(inst)) + fail_instr(state, "R4 write too soon after SFU"); + + if (sfu_writes) + fail_instr(state, "SFU write too soon after SFU"); + } + + /* XXX: The docs say VPM can happen with the others, but the simulator + * disagrees. 
+ */ + if (tmu_writes + + sfu_writes + + vpm_writes + + tlb_writes + + tsy_writes + + inst->sig.ldtmu + + inst->sig.ldtlb + + inst->sig.ldvpm + + inst->sig.ldtlbu > 1) { + fail_instr(state, + "Only one of [TMU, SFU, TSY, TLB read, VPM] allowed"); + } + + if (sfu_writes) + state->last_sfu_write = state->ip; +} + +static void +qpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block) +{ + vir_for_each_inst(qinst, block) { + qpu_validate_inst(state, qinst); + + state->last = &qinst->qpu; + state->ip++; + } +} + +/** + * Checks for the instruction restrictions from page 37 ("Summary of + * Instruction Restrictions"). + */ +void +qpu_validate(struct v3d_compile *c) +{ + /* We don't want to do validation in release builds, but we want to + * keep compiling the validation code to make sure it doesn't get + * broken. + */ +#ifndef DEBUG + return; +#endif + + struct v3d_qpu_validate_state state = { + .c = c, + .last_sfu_write = -10, + .ip = 0, + }; + + vir_for_each_block(block, c) { + qpu_validate_block(&state, block); + } +} diff --git a/src/broadcom/compiler/v3d_compiler.c b/src/broadcom/compiler/v3d_compiler.c new file mode 100644 index 00000000000..acce09db3fa --- /dev/null +++ b/src/broadcom/compiler/v3d_compiler.c @@ -0,0 +1,43 @@ +/* + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +struct v3d_compiler * +v3d_compiler_init(void) +{ + struct v3d_compile *c = rzalloc(struct v3d_compile); + + return c; +} + +void +v3d_add_qpu_inst(struct v3d_compiler *c, uint64_t inst) +{ + if (c->qpu_inst_count >= c->qpu_inst_size) { + c->qpu_inst_size = MAX2(c->qpu_inst_size * 2, 16); + c->qpu_insts = reralloc(c, c->qpu_insts, uint64_t, + c->qpu_inst_size_array_size); + + } + + c->qpu_insts[c->qpu_inst_count++] = inst; +} diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h new file mode 100644 index 00000000000..e0eeefe245a --- /dev/null +++ b/src/broadcom/compiler/v3d_compiler.h @@ -0,0 +1,927 @@ +/* + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef V3D_COMPILER_H +#define V3D_COMPILER_H + +#include +#include +#include +#include +#include +#include + +#include "util/macros.h" +#include "common/v3d_debug.h" +#include "compiler/nir/nir.h" +#include "util/list.h" +#include "util/u_math.h" + +#include "qpu/qpu_instr.h" +#include "pipe/p_state.h" + +#define V3D_MAX_TEXTURE_SAMPLERS 32 +#define V3D_MAX_SAMPLES 4 +#define V3D_MAX_FS_INPUTS 64 +#define V3D_MAX_VS_INPUTS 64 + +struct nir_builder; + +struct v3d_fs_inputs { + /** + * Array of the meanings of the VPM inputs this shader needs. + * + * It doesn't include those that aren't part of the VPM, like + * point/line coordinates. + */ + struct v3d_varying_slot *input_slots; + uint32_t num_inputs; +}; + +enum qfile { + /** An unused source or destination register. */ + QFILE_NULL, + + /** A physical register, such as the W coordinate payload. */ + QFILE_REG, + /** One of the regsiters for fixed function interactions. */ + QFILE_MAGIC, + + /** + * A virtual register, that will be allocated to actual accumulator + * or physical registers later. + */ + QFILE_TEMP, + QFILE_VARY, + QFILE_UNIF, + QFILE_TLB, + QFILE_TLBU, + + /** + * VPM reads use this with an index value to say what part of the VPM + * is being read. + */ + QFILE_VPM, + + /** + * Stores an immediate value in the index field that will be used + * directly by qpu_load_imm(). + */ + QFILE_LOAD_IMM, + + /** + * Stores an immediate value in the index field that can be turned + * into a small immediate field by qpu_encode_small_immediate(). + */ + QFILE_SMALL_IMM, +}; + +/** + * A reference to a QPU register or a virtual temp register. 
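+ *
+ * Most of the compiler works in terms of these; QFILE_TEMP values are
+ * only mapped onto accumulators or physical registers (struct qpu_reg)
+ * at register allocation time.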
+ */ +struct qreg { + enum qfile file; + uint32_t index; +}; + +static inline struct qreg vir_reg(enum qfile file, uint32_t index) +{ + return (struct qreg){file, index}; +} + +/** + * A reference to an actual register at the QPU level, for register + * allocation. + */ +struct qpu_reg { + bool magic; + int index; +}; + +struct qinst { + /** Entry in qblock->instructions */ + struct list_head link; + + /** + * The instruction being wrapped. Its condition codes, pack flags, + * signals, etc. will all be used, with just the register references + * being replaced by the contents of qinst->dst and qinst->src[]. + */ + struct v3d_qpu_instr qpu; + + /* Pre-register-allocation references to src/dst registers */ + struct qreg dst; + struct qreg src[3]; + bool cond_is_exec_mask; + bool has_implicit_uniform; + + /* After vir_to_qpu.c: If instr reads a uniform, which uniform from + * the uncompiled stream it is. + */ + int uniform; +}; + +enum quniform_contents { + /** + * Indicates that a constant 32-bit value is copied from the program's + * uniform contents. + */ + QUNIFORM_CONSTANT, + /** + * Indicates that the program's uniform contents are used as an index + * into the GL uniform storage. + */ + QUNIFORM_UNIFORM, + + /** @{ + * Scaling factors from clip coordinates to relative to the viewport + * center. + * + * This is used by the coordinate and vertex shaders to produce the + * 32-bit entry consisting of 2 16-bit fields with 12.4 signed fixed + * point offsets from the viewport ccenter. + */ + QUNIFORM_VIEWPORT_X_SCALE, + QUNIFORM_VIEWPORT_Y_SCALE, + /** @} */ + + QUNIFORM_VIEWPORT_Z_OFFSET, + QUNIFORM_VIEWPORT_Z_SCALE, + + QUNIFORM_USER_CLIP_PLANE, + + /** + * A reference to a texture config parameter 0 uniform. + * + * This is a uniform implicitly loaded with a QPU_W_TMU* write, which + * defines texture type, miplevels, and such. It will be found as a + * parameter to the first QOP_TEX_[STRB] instruction in a sequence. + */ + QUNIFORM_TEXTURE_CONFIG_P0_0, + QUNIFORM_TEXTURE_CONFIG_P0_1, + QUNIFORM_TEXTURE_CONFIG_P0_2, + QUNIFORM_TEXTURE_CONFIG_P0_3, + QUNIFORM_TEXTURE_CONFIG_P0_4, + QUNIFORM_TEXTURE_CONFIG_P0_5, + QUNIFORM_TEXTURE_CONFIG_P0_6, + QUNIFORM_TEXTURE_CONFIG_P0_7, + QUNIFORM_TEXTURE_CONFIG_P0_8, + QUNIFORM_TEXTURE_CONFIG_P0_9, + QUNIFORM_TEXTURE_CONFIG_P0_10, + QUNIFORM_TEXTURE_CONFIG_P0_11, + QUNIFORM_TEXTURE_CONFIG_P0_12, + QUNIFORM_TEXTURE_CONFIG_P0_13, + QUNIFORM_TEXTURE_CONFIG_P0_14, + QUNIFORM_TEXTURE_CONFIG_P0_15, + QUNIFORM_TEXTURE_CONFIG_P0_16, + QUNIFORM_TEXTURE_CONFIG_P0_17, + QUNIFORM_TEXTURE_CONFIG_P0_18, + QUNIFORM_TEXTURE_CONFIG_P0_19, + QUNIFORM_TEXTURE_CONFIG_P0_20, + QUNIFORM_TEXTURE_CONFIG_P0_21, + QUNIFORM_TEXTURE_CONFIG_P0_22, + QUNIFORM_TEXTURE_CONFIG_P0_23, + QUNIFORM_TEXTURE_CONFIG_P0_24, + QUNIFORM_TEXTURE_CONFIG_P0_25, + QUNIFORM_TEXTURE_CONFIG_P0_26, + QUNIFORM_TEXTURE_CONFIG_P0_27, + QUNIFORM_TEXTURE_CONFIG_P0_28, + QUNIFORM_TEXTURE_CONFIG_P0_29, + QUNIFORM_TEXTURE_CONFIG_P0_30, + QUNIFORM_TEXTURE_CONFIG_P0_31, + QUNIFORM_TEXTURE_CONFIG_P0_32, + + /** + * A reference to a texture config parameter 1 uniform. + * + * This is a uniform implicitly loaded with a QPU_W_TMU* write, which + * defines texture width, height, filters, and wrap modes. It will be + * found as a parameter to the second QOP_TEX_[STRB] instruction in a + * sequence. 
+ */ + QUNIFORM_TEXTURE_CONFIG_P1, + + QUNIFORM_TEXTURE_FIRST_LEVEL, + + QUNIFORM_TEXTURE_WIDTH, + QUNIFORM_TEXTURE_HEIGHT, + QUNIFORM_TEXTURE_DEPTH, + QUNIFORM_TEXTURE_ARRAY_SIZE, + QUNIFORM_TEXTURE_LEVELS, + + QUNIFORM_TEXTURE_MSAA_ADDR, + + QUNIFORM_UBO_ADDR, + + QUNIFORM_TEXRECT_SCALE_X, + QUNIFORM_TEXRECT_SCALE_Y, + + QUNIFORM_TEXTURE_BORDER_COLOR, + + QUNIFORM_STENCIL, + + QUNIFORM_ALPHA_REF, + QUNIFORM_SAMPLE_MASK, +}; + +struct v3d_varying_slot { + uint8_t slot_and_component; +}; + +static inline struct v3d_varying_slot +v3d_slot_from_slot_and_component(uint8_t slot, uint8_t component) +{ + assert(slot < 255 / 4); + return (struct v3d_varying_slot){ (slot << 2) + component }; +} + +static inline uint8_t v3d_slot_get_slot(struct v3d_varying_slot slot) +{ + return slot.slot_and_component >> 2; +} + +static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot) +{ + return slot.slot_and_component & 3; +} + +struct v3d_ubo_range { + /** + * offset in bytes from the start of the ubo where this range is + * uploaded. + * + * Only set once used is set. + */ + uint32_t dst_offset; + + /** + * offset in bytes from the start of the gallium uniforms where the + * data comes from. + */ + uint32_t src_offset; + + /** size in bytes of this ubo range */ + uint32_t size; +}; + +struct v3d_key { + void *shader_state; + struct { + uint8_t swizzle[4]; + uint8_t return_size; + uint8_t return_channels; + union { + struct { + unsigned compare_mode:1; + unsigned compare_func:3; + unsigned wrap_s:3; + unsigned wrap_t:3; + }; + struct { + uint16_t msaa_width, msaa_height; + }; + }; + } tex[V3D_MAX_TEXTURE_SAMPLERS]; + uint8_t ucp_enables; +}; + +struct v3d_fs_key { + struct v3d_key base; + bool depth_enabled; + bool is_points; + bool is_lines; + bool alpha_test; + bool point_coord_upper_left; + bool light_twoside; + bool msaa; + bool sample_coverage; + bool sample_alpha_to_coverage; + bool sample_alpha_to_one; + bool clamp_color; + bool swap_color_rb; + uint8_t alpha_test_func; + uint8_t logicop_func; + uint32_t point_sprite_mask; + + struct pipe_rt_blend_state blend; +}; + +struct v3d_vs_key { + struct v3d_key base; + + struct v3d_varying_slot fs_inputs[V3D_MAX_FS_INPUTS]; + uint8_t num_fs_inputs; + + bool is_coord; + bool per_vertex_point_size; + bool clamp_color; +}; + +/** A basic block of VIR intructions. */ +struct qblock { + struct list_head link; + + struct list_head instructions; + + struct set *predecessors; + struct qblock *successors[2]; + + int index; + + /* Instruction IPs for the first and last instruction of the block. + * Set by qpu_schedule.c. + */ + uint32_t start_qpu_ip; + uint32_t end_qpu_ip; + + /* Instruction IP for the branch instruction of the block. Set by + * qpu_schedule.c. + */ + uint32_t branch_qpu_ip; + + /** Offset within the uniform stream at the start of the block. */ + uint32_t start_uniform; + /** Offset within the uniform stream of the branch instruction */ + uint32_t branch_uniform; + + /** @{ used by v3d_vir_live_variables.c */ + BITSET_WORD *def; + BITSET_WORD *use; + BITSET_WORD *live_in; + BITSET_WORD *live_out; + int start_ip, end_ip; + /** @} */ +}; + +/** + * Compiler state saved across compiler invocations, for any expensive global + * setup. 
+ */ +struct v3d_compiler { + const struct v3d_device_info *devinfo; + struct ra_regs *regs; + unsigned int reg_class[3]; +}; + +struct v3d_compile { + const struct v3d_device_info *devinfo; + nir_shader *s; + nir_function_impl *impl; + struct exec_list *cf_node_list; + const struct v3d_compiler *compiler; + + /** + * Mapping from nir_register * or nir_ssa_def * to array of struct + * qreg for the values. + */ + struct hash_table *def_ht; + + /* For each temp, the instruction generating its value. */ + struct qinst **defs; + uint32_t defs_array_size; + + /** + * Inputs to the shader, arranged by TGSI declaration order. + * + * Not all fragment shader QFILE_VARY reads are present in this array. + */ + struct qreg *inputs; + struct qreg *outputs; + bool msaa_per_sample_output; + struct qreg color_reads[V3D_MAX_SAMPLES]; + struct qreg sample_colors[V3D_MAX_SAMPLES]; + uint32_t inputs_array_size; + uint32_t outputs_array_size; + uint32_t uniforms_array_size; + + /* Booleans for whether the corresponding QFILE_VARY[i] is + * flat-shaded. This doesn't count gl_FragColor flat-shading, which is + * controlled by shader->color_inputs and rasterizer->flatshade in the + * gallium driver. + */ + BITSET_WORD flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + + struct v3d_ubo_range *ubo_ranges; + bool *ubo_range_used; + uint32_t ubo_ranges_array_size; + /** Number of uniform areas tracked in ubo_ranges. */ + uint32_t num_ubo_ranges; + uint32_t next_ubo_dst_offset; + + /* State for whether we're executing on each channel currently. 0 if + * yes, otherwise a block number + 1 that the channel jumped to. + */ + struct qreg execute; + + struct qreg line_x, point_x, point_y; + + /** + * Instance ID, which comes in before the vertex attribute payload if + * the shader record requests it. + */ + struct qreg iid; + + /** + * Vertex ID, which comes in before the vertex attribute payload + * (after Instance ID) if the shader record requests it. + */ + struct qreg vid; + + /* Fragment shader payload regs. */ + struct qreg payload_w, payload_w_centroid, payload_z; + + /** boolean (~0 -> true) if the fragment has been discarded. */ + struct qreg discard; + + uint8_t vattr_sizes[V3D_MAX_VS_INPUTS]; + uint32_t num_vpm_writes; + + /** + * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads. + * + * This includes those that aren't part of the VPM varyings, like + * point/line coordinates. + */ + struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS]; + + /** + * An entry per outputs[] in the VS indicating what the VARYING_SLOT_* + * of the output is. Used to emit from the VS in the order that the + * FS needs. + */ + struct v3d_varying_slot *output_slots; + + struct pipe_shader_state *shader_state; + struct v3d_key *key; + struct v3d_fs_key *fs_key; + struct v3d_vs_key *vs_key; + + /* Live ranges of temps. 
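+ * Indexed by temp number and filled in by
+ * vir_calculate_live_intervals().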
*/ + int *temp_start, *temp_end; + + uint32_t *uniform_data; + enum quniform_contents *uniform_contents; + uint32_t uniform_array_size; + uint32_t num_uniforms; + uint32_t num_outputs; + uint32_t output_position_index; + nir_variable *output_color_var; + uint32_t output_point_size_index; + uint32_t output_sample_mask_index; + + struct qreg undef; + uint32_t num_temps; + + struct list_head blocks; + int next_block_index; + struct qblock *cur_block; + struct qblock *loop_cont_block; + struct qblock *loop_break_block; + + uint64_t *qpu_insts; + uint32_t qpu_inst_count; + uint32_t qpu_inst_size; + + /* For the FS, the number of varying inputs not counting the + * point/line varyings payload + */ + uint32_t num_inputs; + + /** + * Number of inputs from num_inputs remaining to be queued to the read + * FIFO in the VS/CS. + */ + uint32_t num_inputs_remaining; + + /* Number of inputs currently in the read FIFO for the VS/CS */ + uint32_t num_inputs_in_fifo; + + /** Next offset in the VPM to read from in the VS/CS */ + uint32_t vpm_read_offset; + + uint32_t program_id; + uint32_t variant_id; + + /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH + * is used to hide texturing latency at the cost of limiting ourselves + * to the bottom half of physical reg space. + */ + bool fs_threaded; + + bool last_thrsw_at_top_level; + + bool failed; +}; + +struct v3d_uniform_list { + enum quniform_contents *contents; + uint32_t *data; + uint32_t count; +}; + +struct v3d_prog_data { + struct v3d_uniform_list uniforms; + + struct v3d_ubo_range *ubo_ranges; + uint32_t num_ubo_ranges; + uint32_t ubo_size; + + uint8_t num_inputs; + +}; + +struct v3d_vs_prog_data { + struct v3d_prog_data base; + + bool uses_iid, uses_vid; + + /* Number of components read from each vertex attribute. */ + uint8_t vattr_sizes[32]; + + /* Total number of components read, for the shader state record. */ + uint32_t vpm_input_size; + + /* Total number of components written, for the shader state record. */ + uint32_t vpm_output_size; +}; + +struct v3d_fs_prog_data { + struct v3d_prog_data base; + + struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS]; + + /** bitmask of which inputs are color inputs, for flat shade handling. */ + uint32_t color_inputs[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + + /* Bitmask for whether the corresponding input is flat-shaded, + * independent of rasterizer (gl_FragColor) flat-shading. + */ + BITSET_WORD flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + + bool writes_z; +}; + +/* Special nir_load_input intrinsic index for loading the current TLB + * destination color. 
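+ *
+ * (The value only needs to be larger than any driver_location a real
+ * input could use.)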
+ */ +#define V3D_NIR_TLB_COLOR_READ_INPUT 2000000000 + +#define V3D_NIR_MS_MASK_OUTPUT 2000000000 + +extern const nir_shader_compiler_options v3d_nir_options; + +const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo); +void v3d_compiler_free(const struct v3d_compiler *compiler); +void v3d_optimize_nir(struct nir_shader *s); + +uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler, + struct v3d_vs_key *key, + struct v3d_vs_prog_data *prog_data, + nir_shader *s, + int program_id, int variant_id, + uint32_t *final_assembly_size); + +uint64_t *v3d_compile_fs(const struct v3d_compiler *compiler, + struct v3d_fs_key *key, + struct v3d_fs_prog_data *prog_data, + nir_shader *s, + int program_id, int variant_id, + uint32_t *final_assembly_size); + +void v3d_nir_to_vir(struct v3d_compile *c); + +void vir_compile_destroy(struct v3d_compile *c); +const char *vir_get_stage_name(struct v3d_compile *c); +struct qblock *vir_new_block(struct v3d_compile *c); +void vir_set_emit_block(struct v3d_compile *c, struct qblock *block); +void vir_link_blocks(struct qblock *predecessor, struct qblock *successor); +struct qblock *vir_entry_block(struct v3d_compile *c); +struct qblock *vir_exit_block(struct v3d_compile *c); +struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, + struct qreg src0, struct qreg src1); +struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, + struct qreg src0, struct qreg src1); +struct qinst *vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src0); +void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst); +struct qreg vir_uniform(struct v3d_compile *c, + enum quniform_contents contents, + uint32_t data); +void vir_schedule_instructions(struct v3d_compile *c); +struct v3d_qpu_instr v3d_qpu_nop(void); + +struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst); +struct qinst *vir_emit_nondef(struct v3d_compile *c, struct qinst *inst); +void vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond); +void vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf); +void vir_set_unpack(struct qinst *inst, int src, + enum v3d_qpu_input_unpack unpack); + +struct qreg vir_get_temp(struct v3d_compile *c); +void vir_calculate_live_intervals(struct v3d_compile *c); +bool vir_has_implicit_uniform(struct qinst *inst); +int vir_get_implicit_uniform_src(struct qinst *inst); +int vir_get_non_sideband_nsrc(struct qinst *inst); +int vir_get_nsrc(struct qinst *inst); +bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst); +bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op); +bool vir_get_mul_op(struct qinst *inst, enum v3d_qpu_mul_op *op); +bool vir_is_raw_mov(struct qinst *inst); +bool vir_is_tex(struct qinst *inst); +bool vir_is_add(struct qinst *inst); +bool vir_is_mul(struct qinst *inst); +bool vir_is_float_input(struct qinst *inst); +bool vir_depends_on_flags(struct qinst *inst); +bool vir_writes_r3(struct qinst *inst); +bool vir_writes_r4(struct qinst *inst); +struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg); +uint8_t vir_channels_written(struct qinst *inst); + +void vir_dump(struct v3d_compile *c); +void vir_dump_inst(struct v3d_compile *c, struct qinst *inst); + +void vir_validate(struct v3d_compile *c); + +void vir_optimize(struct v3d_compile *c); +bool vir_opt_algebraic(struct v3d_compile *c); +bool vir_opt_constant_folding(struct v3d_compile *c); +bool vir_opt_copy_propagate(struct v3d_compile *c); +bool vir_opt_dead_code(struct v3d_compile *c); +bool 
vir_opt_peephole_sf(struct v3d_compile *c); +bool vir_opt_small_immediates(struct v3d_compile *c); +bool vir_opt_vpm(struct v3d_compile *c); +void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c); +void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); +void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c); +void vir_lower_uniforms(struct v3d_compile *c); + +void v3d_vir_to_qpu(struct v3d_compile *c); +uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c); +void qpu_validate(struct v3d_compile *c); +struct qpu_reg *v3d_register_allocate(struct v3d_compile *c); +bool vir_init_reg_sets(struct v3d_compiler *compiler); + +void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf); + +static inline bool +quniform_contents_is_texture_p0(enum quniform_contents contents) +{ + return (contents >= QUNIFORM_TEXTURE_CONFIG_P0_0 && + contents < (QUNIFORM_TEXTURE_CONFIG_P0_0 + + V3D_MAX_TEXTURE_SAMPLERS)); +} + +static inline struct qreg +vir_uniform_ui(struct v3d_compile *c, uint32_t ui) +{ + return vir_uniform(c, QUNIFORM_CONSTANT, ui); +} + +static inline struct qreg +vir_uniform_f(struct v3d_compile *c, float f) +{ + return vir_uniform(c, QUNIFORM_CONSTANT, fui(f)); +} + +#define VIR_ALU0(name, vir_inst, op) \ +static inline struct qreg \ +vir_##name(struct v3d_compile *c) \ +{ \ + return vir_emit_def(c, vir_inst(op, c->undef, \ + c->undef, c->undef)); \ +} \ +static inline struct qinst * \ +vir_##name##_dest(struct v3d_compile *c, struct qreg dest) \ +{ \ + return vir_emit_nondef(c, vir_inst(op, dest, \ + c->undef, c->undef)); \ +} + +#define VIR_ALU1(name, vir_inst, op) \ +static inline struct qreg \ +vir_##name(struct v3d_compile *c, struct qreg a) \ +{ \ + return vir_emit_def(c, vir_inst(op, c->undef, \ + a, c->undef)); \ +} \ +static inline struct qinst * \ +vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ + struct qreg a) \ +{ \ + return vir_emit_nondef(c, vir_inst(op, dest, a, \ + c->undef)); \ +} + +#define VIR_ALU2(name, vir_inst, op) \ +static inline struct qreg \ +vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ +{ \ + return vir_emit_def(c, vir_inst(op, c->undef, a, b)); \ +} \ +static inline struct qinst * \ +vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ + struct qreg a, struct qreg b) \ +{ \ + return vir_emit_nondef(c, vir_inst(op, dest, a, b)); \ +} + +#define VIR_NODST_1(name, vir_inst, op) \ +static inline struct qinst * \ +vir_##name(struct v3d_compile *c, struct qreg a) \ +{ \ + return vir_emit_nondef(c, vir_inst(op, c->undef, \ + a, c->undef)); \ +} + +#define VIR_NODST_2(name, vir_inst, op) \ +static inline struct qinst * \ +vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ +{ \ + return vir_emit_nondef(c, vir_inst(op, c->undef, \ + a, b)); \ +} + +#define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name) +#define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name) +#define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name) +#define VIR_M_ALU1(name) VIR_ALU1(name, vir_mul_inst, V3D_QPU_M_##name) +#define VIR_A_ALU0(name) VIR_ALU0(name, vir_add_inst, V3D_QPU_A_##name) +#define VIR_M_ALU0(name) VIR_ALU0(name, vir_mul_inst, V3D_QPU_M_##name) +#define VIR_A_NODST_2(name) VIR_NODST_2(name, vir_add_inst, V3D_QPU_A_##name) +#define VIR_M_NODST_2(name) VIR_NODST_2(name, vir_mul_inst, V3D_QPU_M_##name) +#define VIR_A_NODST_1(name) VIR_NODST_1(name, vir_add_inst, V3D_QPU_A_##name) +#define VIR_M_NODST_1(name) VIR_NODST_1(name, vir_mul_inst, 
V3D_QPU_M_##name) + +VIR_A_ALU2(FADD) +VIR_A_ALU2(VFPACK) +VIR_A_ALU2(FSUB) +VIR_A_ALU2(FMIN) +VIR_A_ALU2(FMAX) + +VIR_A_ALU2(ADD) +VIR_A_ALU2(SUB) +VIR_A_ALU2(SHL) +VIR_A_ALU2(SHR) +VIR_A_ALU2(ASR) +VIR_A_ALU2(ROR) +VIR_A_ALU2(MIN) +VIR_A_ALU2(MAX) +VIR_A_ALU2(UMIN) +VIR_A_ALU2(UMAX) +VIR_A_ALU2(AND) +VIR_A_ALU2(OR) +VIR_A_ALU2(XOR) +VIR_A_ALU2(VADD) +VIR_A_ALU2(VSUB) +VIR_A_ALU1(NOT) +VIR_A_ALU1(NEG) +VIR_A_ALU1(FLAPUSH) +VIR_A_ALU1(FLBPUSH) +VIR_A_ALU1(FLBPOP) +VIR_A_ALU1(SETMSF) +VIR_A_ALU1(SETREVF) +VIR_A_ALU1(TIDX) +VIR_A_ALU1(EIDX) + +VIR_A_ALU0(FXCD) +VIR_A_ALU0(XCD) +VIR_A_ALU0(FYCD) +VIR_A_ALU0(YCD) +VIR_A_ALU0(MSF) +VIR_A_ALU0(REVF) +VIR_A_NODST_1(VPMSETUP) +VIR_A_ALU2(FCMP) +VIR_A_ALU2(VFMAX) + +VIR_A_ALU1(FROUND) +VIR_A_ALU1(FTOIN) +VIR_A_ALU1(FTRUNC) +VIR_A_ALU1(FTOIZ) +VIR_A_ALU1(FFLOOR) +VIR_A_ALU1(FTOUZ) +VIR_A_ALU1(FCEIL) +VIR_A_ALU1(FTOC) + +VIR_A_ALU1(FDX) +VIR_A_ALU1(FDY) + +VIR_A_ALU1(ITOF) +VIR_A_ALU1(CLZ) +VIR_A_ALU1(UTOF) + +VIR_M_ALU2(UMUL24) +VIR_M_ALU2(FMUL) +VIR_M_ALU2(SMUL24) +VIR_M_NODST_2(MULTOP) + +VIR_M_ALU1(MOV) +VIR_M_ALU1(FMOV) + +static inline struct qinst * +vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond, + struct qreg dest, struct qreg src) +{ + struct qinst *mov = vir_MOV_dest(c, dest, src); + vir_set_cond(mov, cond); + return mov; +} + +static inline struct qreg +vir_SEL(struct v3d_compile *c, enum v3d_qpu_cond cond, + struct qreg src0, struct qreg src1) +{ + struct qreg t = vir_get_temp(c); + vir_MOV_dest(c, t, src1); + vir_MOV_cond(c, cond, t, src0); + return t; +} + +static inline void +vir_VPM_WRITE(struct v3d_compile *c, struct qreg val) +{ + vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); +} + +static inline struct qinst * +vir_NOP(struct v3d_compile *c) +{ + return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_NOP, + c->undef, c->undef, c->undef)); +} +/* +static inline struct qreg +vir_LOAD_IMM(struct v3d_compile *c, uint32_t val) +{ + return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef, + vir_reg(QFILE_LOAD_IMM, val), c->undef)); +} + +static inline struct qreg +vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val) +{ + return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef, + vir_reg(QFILE_LOAD_IMM, val), + c->undef)); +} +static inline struct qreg +vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val) +{ + return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef, + vir_reg(QFILE_LOAD_IMM, val), + c->undef)); +} +*/ + +static inline struct qinst * +vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_cond cond) +{ + /* The actual uniform_data value will be set at scheduling time */ + return vir_emit_nondef(c, vir_branch_inst(cond, vir_uniform_ui(c, 0))); +} + +#define vir_for_each_block(block, c) \ + list_for_each_entry(struct qblock, block, &c->blocks, link) + +#define vir_for_each_block_rev(block, c) \ + list_for_each_entry_rev(struct qblock, block, &c->blocks, link) + +/* Loop over the non-NULL members of the successors array. */ +#define vir_for_each_successor(succ, block) \ + for (struct qblock *succ = block->successors[0]; \ + succ != NULL; \ + succ = (succ == block->successors[1] ? 
NULL : \ + block->successors[1])) + +#define vir_for_each_inst(inst, block) \ + list_for_each_entry(struct qinst, inst, &block->instructions, link) + +#define vir_for_each_inst_rev(inst, block) \ + list_for_each_entry_rev(struct qinst, inst, &block->instructions, link) + +#define vir_for_each_inst_safe(inst, block) \ + list_for_each_entry_safe(struct qinst, inst, &block->instructions, link) + +#define vir_for_each_inst_inorder(inst, c) \ + vir_for_each_block(_block, c) \ + vir_for_each_inst(inst, _block) + +#endif /* V3D_COMPILER_H */ diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c new file mode 100644 index 00000000000..9cdcc02195c --- /dev/null +++ b/src/broadcom/compiler/v3d_nir_lower_io.c @@ -0,0 +1,176 @@ +/* + * Copyright © 2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "compiler/v3d_compiler.h" +#include "compiler/nir/nir_builder.h" + +/** + * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io + * intrinsics into something amenable to the V3D architecture. + * + * Currently, it splits VS inputs and uniforms into scalars, drops any + * non-position outputs in coordinate shaders, and fixes up the addressing on + * indirect uniform loads. FS input and VS output scalarization is handled by + * nir_lower_io_to_scalar(). + */ + +static void +replace_intrinsic_with_vec(nir_builder *b, nir_intrinsic_instr *intr, + nir_ssa_def **comps) +{ + + /* Batch things back together into a vector. This will get split by + * the later ALU scalarization pass. + */ + nir_ssa_def *vec = nir_vec(b, comps, intr->num_components); + + /* Replace the old intrinsic with a reference to our reconstructed + * vector. 
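+ *
+ * For example, once v3d_nir_lower_uniform() below has split a vec4
+ * load_uniform into four single-component loads, the nir_vec() built
+ * here stands in for the original destination, so existing uses still
+ * see a vec4 and only the loads themselves become scalar.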
+ */ + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec)); + nir_instr_remove(&intr->instr); +} + +static void +v3d_nir_lower_output(struct v3d_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + nir_variable *output_var = NULL; + nir_foreach_variable(var, &c->s->outputs) { + if (var->data.driver_location == nir_intrinsic_base(intr)) { + output_var = var; + break; + } + } + assert(output_var); + + if (c->vs_key) { + int slot = output_var->data.location; + bool used = false; + + switch (slot) { + case VARYING_SLOT_PSIZ: + case VARYING_SLOT_POS: + used = true; + break; + + default: + for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { + if (v3d_slot_get_slot(c->vs_key->fs_inputs[i]) == slot) { + used = true; + break; + } + } + break; + } + + if (!used) + nir_instr_remove(&intr->instr); + } +} + +static void +v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + b->cursor = nir_before_instr(&intr->instr); + + /* Generate scalar loads equivalent to the original vector. */ + nir_ssa_def *dests[4]; + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, intr->intrinsic); + intr_comp->num_components = 1; + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL); + + /* Convert the uniform offset to bytes. If it happens + * to be a constant, constant-folding will clean up + * the shift for us. + */ + nir_intrinsic_set_base(intr_comp, + nir_intrinsic_base(intr) * 16 + + i * 4); + + intr_comp->src[0] = + nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa, + nir_imm_int(b, 4))); + + dests[i] = &intr_comp->dest.ssa; + + nir_builder_instr_insert(b, &intr_comp->instr); + } + + replace_intrinsic_with_vec(b, intr, dests); +} + +static void +v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, + struct nir_instr *instr) +{ + if (instr->type != nir_instr_type_intrinsic) + return; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_input: + break; + + case nir_intrinsic_store_output: + v3d_nir_lower_output(c, b, intr); + break; + + case nir_intrinsic_load_uniform: + v3d_nir_lower_uniform(c, b, intr); + break; + + case nir_intrinsic_load_user_clip_plane: + default: + break; + } +} + +static bool +v3d_nir_lower_io_impl(struct v3d_compile *c, nir_function_impl *impl) +{ + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) + v3d_nir_lower_io_instr(c, &b, instr); + } + + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + + return true; +} + +void +v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c) +{ + nir_foreach_function(function, s) { + if (function->impl) + v3d_nir_lower_io_impl(c, function->impl); + } +} diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c new file mode 100644 index 00000000000..35df757a208 --- /dev/null +++ b/src/broadcom/compiler/vir.c @@ -0,0 +1,907 @@ +/* + * Copyright © 2016-2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The 
above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_compiler.h" + +int +vir_get_non_sideband_nsrc(struct qinst *inst) +{ + switch (inst->qpu.type) { + case V3D_QPU_INSTR_TYPE_BRANCH: + return 0; + case V3D_QPU_INSTR_TYPE_ALU: + if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) + return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op); + else + return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op); + } + + return 0; +} + +int +vir_get_nsrc(struct qinst *inst) +{ + int nsrc = vir_get_non_sideband_nsrc(inst); + + if (vir_has_implicit_uniform(inst)) + nsrc++; + + return nsrc; +} + +bool +vir_has_implicit_uniform(struct qinst *inst) +{ + switch (inst->qpu.type) { + case V3D_QPU_INSTR_TYPE_BRANCH: + return true; + case V3D_QPU_INSTR_TYPE_ALU: + switch (inst->dst.file) { + case QFILE_TLBU: + return true; + default: + return inst->has_implicit_uniform; + } + } + return false; +} + +/* The sideband uniform for textures gets stored after the normal ALU + * arguments. + */ +int +vir_get_implicit_uniform_src(struct qinst *inst) +{ + return vir_get_nsrc(inst) - 1; +} + +/** + * Returns whether the instruction has any side effects that must be + * preserved. + */ +bool +vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) +{ + switch (inst->qpu.type) { + case V3D_QPU_INSTR_TYPE_BRANCH: + return true; + case V3D_QPU_INSTR_TYPE_ALU: + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_SETREVF: + case V3D_QPU_A_SETMSF: + case V3D_QPU_A_VPMSETUP: + return true; + default: + break; + } + + switch (inst->qpu.alu.mul.op) { + case V3D_QPU_M_MULTOP: + return true; + default: + break; + } + } + + if (inst->qpu.sig.ldtmu) + return true; + + return false; +} + +bool +vir_is_float_input(struct qinst *inst) +{ + /* XXX: More instrs */ + switch (inst->qpu.type) { + case V3D_QPU_INSTR_TYPE_BRANCH: + return false; + case V3D_QPU_INSTR_TYPE_ALU: + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_FADD: + case V3D_QPU_A_FSUB: + case V3D_QPU_A_FMIN: + case V3D_QPU_A_FMAX: + case V3D_QPU_A_FTOIN: + return true; + default: + break; + } + + switch (inst->qpu.alu.mul.op) { + case V3D_QPU_M_FMOV: + case V3D_QPU_M_VFMUL: + case V3D_QPU_M_FMUL: + return true; + default: + break; + } + } + + return false; +} + +bool +vir_is_raw_mov(struct qinst *inst) +{ + if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || + (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV && + inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) { + return false; + } + + if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE || + inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) { + return false; + } + + if (inst->qpu.flags.ac != V3D_QPU_COND_NONE || + inst->qpu.flags.mc != V3D_QPU_COND_NONE) + return false; + + return true; +} + +bool +vir_is_add(struct qinst *inst) +{ + return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + inst->qpu.alu.add.op != V3D_QPU_A_NOP); +} + +bool +vir_is_mul(struct qinst *inst) +{ + return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + 
inst->qpu.alu.mul.op != V3D_QPU_M_NOP); +} + +bool +vir_is_tex(struct qinst *inst) +{ + if (inst->dst.file == QFILE_MAGIC) + return v3d_qpu_magic_waddr_is_tmu(inst->dst.index); + + return false; +} + +bool +vir_depends_on_flags(struct qinst *inst) +{ + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) { + return (inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS); + } else { + return (inst->qpu.flags.ac != V3D_QPU_COND_NONE && + inst->qpu.flags.mc != V3D_QPU_COND_NONE); + } +} + +bool +vir_writes_r3(struct qinst *inst) +{ + for (int i = 0; i < vir_get_nsrc(inst); i++) { + switch (inst->src[i].file) { + case QFILE_VARY: + case QFILE_VPM: + return true; + default: + break; + } + } + + return false; +} + +bool +vir_writes_r4(struct qinst *inst) +{ + switch (inst->dst.file) { + case QFILE_MAGIC: + switch (inst->dst.index) { + case V3D_QPU_WADDR_RECIP: + case V3D_QPU_WADDR_RSQRT: + case V3D_QPU_WADDR_EXP: + case V3D_QPU_WADDR_LOG: + case V3D_QPU_WADDR_SIN: + return true; + } + break; + default: + break; + } + + if (inst->qpu.sig.ldtmu) + return true; + + return false; +} + +void +vir_set_unpack(struct qinst *inst, int src, + enum v3d_qpu_input_unpack unpack) +{ + assert(src == 0 || src == 1); + + if (vir_is_add(inst)) { + if (src == 0) + inst->qpu.alu.add.a_unpack = unpack; + else + inst->qpu.alu.add.b_unpack = unpack; + } else { + assert(vir_is_mul(inst)); + if (src == 0) + inst->qpu.alu.mul.a_unpack = unpack; + else + inst->qpu.alu.mul.b_unpack = unpack; + } +} + +void +vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond) +{ + if (vir_is_add(inst)) { + inst->qpu.flags.ac = cond; + } else { + assert(vir_is_mul(inst)); + inst->qpu.flags.mc = cond; + } +} + +void +vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf) +{ + if (vir_is_add(inst)) { + inst->qpu.flags.apf = pf; + } else { + assert(vir_is_mul(inst)); + inst->qpu.flags.mpf = pf; + } +} + +#if 0 +uint8_t +vir_channels_written(struct qinst *inst) +{ + if (vir_is_mul(inst)) { + switch (inst->dst.pack) { + case QPU_PACK_MUL_NOP: + case QPU_PACK_MUL_8888: + return 0xf; + case QPU_PACK_MUL_8A: + return 0x1; + case QPU_PACK_MUL_8B: + return 0x2; + case QPU_PACK_MUL_8C: + return 0x4; + case QPU_PACK_MUL_8D: + return 0x8; + } + } else { + switch (inst->dst.pack) { + case QPU_PACK_A_NOP: + case QPU_PACK_A_8888: + case QPU_PACK_A_8888_SAT: + case QPU_PACK_A_32_SAT: + return 0xf; + case QPU_PACK_A_8A: + case QPU_PACK_A_8A_SAT: + return 0x1; + case QPU_PACK_A_8B: + case QPU_PACK_A_8B_SAT: + return 0x2; + case QPU_PACK_A_8C: + case QPU_PACK_A_8C_SAT: + return 0x4; + case QPU_PACK_A_8D: + case QPU_PACK_A_8D_SAT: + return 0x8; + case QPU_PACK_A_16A: + case QPU_PACK_A_16A_SAT: + return 0x3; + case QPU_PACK_A_16B: + case QPU_PACK_A_16B_SAT: + return 0xc; + } + } + unreachable("Bad pack field"); +} +#endif + +struct qreg +vir_get_temp(struct v3d_compile *c) +{ + struct qreg reg; + + reg.file = QFILE_TEMP; + reg.index = c->num_temps++; + + if (c->num_temps > c->defs_array_size) { + uint32_t old_size = c->defs_array_size; + c->defs_array_size = MAX2(old_size * 2, 16); + c->defs = reralloc(c, c->defs, struct qinst *, + c->defs_array_size); + memset(&c->defs[old_size], 0, + sizeof(c->defs[0]) * (c->defs_array_size - old_size)); + } + + return reg; +} + +struct qinst * +vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1) +{ + struct qinst *inst = calloc(1, sizeof(*inst)); + + inst->qpu = v3d_qpu_nop(); + inst->qpu.alu.add.op = op; + + inst->dst = dst; + inst->src[0] = src0; + inst->src[1] = src1; + inst->uniform = ~0; + 
+ return inst; +} + +struct qinst * +vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1) +{ + struct qinst *inst = calloc(1, sizeof(*inst)); + + inst->qpu = v3d_qpu_nop(); + inst->qpu.alu.mul.op = op; + + inst->dst = dst; + inst->src[0] = src0; + inst->src[1] = src1; + inst->uniform = ~0; + + return inst; +} + +struct qinst * +vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src) +{ + struct qinst *inst = calloc(1, sizeof(*inst)); + + inst->qpu = v3d_qpu_nop(); + inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH; + inst->qpu.branch.cond = cond; + inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE; + inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL; + inst->qpu.branch.ub = true; + inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL; + + inst->dst = vir_reg(QFILE_NULL, 0); + inst->src[0] = src; + inst->uniform = ~0; + + return inst; +} + +static void +vir_emit(struct v3d_compile *c, struct qinst *inst) +{ + list_addtail(&inst->link, &c->cur_block->instructions); + + if (inst->dst.file == QFILE_MAGIC && + inst->dst.index == V3D_QPU_WADDR_VPM) + c->num_vpm_writes++; +} + +/* Updates inst to write to a new temporary, emits it, and notes the def. */ +struct qreg +vir_emit_def(struct v3d_compile *c, struct qinst *inst) +{ + assert(inst->dst.file == QFILE_NULL); + + inst->dst = vir_get_temp(c); + + if (inst->dst.file == QFILE_TEMP) + c->defs[inst->dst.index] = inst; + + vir_emit(c, inst); + + return inst->dst; +} + +struct qinst * +vir_emit_nondef(struct v3d_compile *c, struct qinst *inst) +{ + if (inst->dst.file == QFILE_TEMP) + c->defs[inst->dst.index] = NULL; + + vir_emit(c, inst); + + return inst; +} + +struct qblock * +vir_new_block(struct v3d_compile *c) +{ + struct qblock *block = rzalloc(c, struct qblock); + + list_inithead(&block->instructions); + + block->predecessors = _mesa_set_create(block, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + + block->index = c->next_block_index++; + + return block; +} + +void +vir_set_emit_block(struct v3d_compile *c, struct qblock *block) +{ + c->cur_block = block; + list_addtail(&block->link, &c->blocks); +} + +struct qblock * +vir_entry_block(struct v3d_compile *c) +{ + return list_first_entry(&c->blocks, struct qblock, link); +} + +struct qblock * +vir_exit_block(struct v3d_compile *c) +{ + return list_last_entry(&c->blocks, struct qblock, link); +} + +void +vir_link_blocks(struct qblock *predecessor, struct qblock *successor) +{ + _mesa_set_add(successor->predecessors, predecessor); + if (predecessor->successors[0]) { + assert(!predecessor->successors[1]); + predecessor->successors[1] = successor; + } else { + predecessor->successors[0] = successor; + } +} + +const struct v3d_compiler * +v3d_compiler_init(const struct v3d_device_info *devinfo) +{ + struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler); + if (!compiler) + return NULL; + + compiler->devinfo = devinfo; + + if (!vir_init_reg_sets(compiler)) { + ralloc_free(compiler); + return NULL; + } + + return compiler; +} + +void +v3d_compiler_free(const struct v3d_compiler *compiler) +{ + ralloc_free((void *)compiler); +} + +static struct v3d_compile * +vir_compile_init(const struct v3d_compiler *compiler, + struct v3d_key *key, + nir_shader *s, + int program_id, int variant_id) +{ + struct v3d_compile *c = rzalloc(NULL, struct v3d_compile); + + c->compiler = compiler; + c->devinfo = compiler->devinfo; + c->key = key; + c->program_id = program_id; + c->variant_id = variant_id; + + s = nir_shader_clone(c, s); + c->s = s; + + list_inithead(&c->blocks); + 
vir_set_emit_block(c, vir_new_block(c)); + + c->output_position_index = -1; + c->output_point_size_index = -1; + c->output_sample_mask_index = -1; + + c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + return c; +} + +static void +v3d_lower_nir(struct v3d_compile *c) +{ + struct nir_lower_tex_options tex_options = { + .lower_rect = false, /* XXX */ + .lower_txp = ~0, + /* Apply swizzles to all samplers. */ + .swizzle_result = ~0, + }; + + /* Lower the format swizzle and (for 32-bit returns) + * ARB_texture_swizzle-style swizzle. + */ + for (int i = 0; i < ARRAY_SIZE(c->key->tex); i++) { + for (int j = 0; j < 4; j++) + tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j]; + } + + NIR_PASS_V(c->s, nir_lower_tex, &tex_options); +} + +static void +v3d_lower_nir_late(struct v3d_compile *c) +{ + NIR_PASS_V(c->s, v3d_nir_lower_io, c); + NIR_PASS_V(c->s, nir_lower_idiv); +} + +static void +v3d_set_prog_data_uniforms(struct v3d_compile *c, + struct v3d_prog_data *prog_data) +{ + int count = c->num_uniforms; + struct v3d_uniform_list *ulist = &prog_data->uniforms; + + ulist->count = count; + ulist->data = ralloc_array(prog_data, uint32_t, count); + memcpy(ulist->data, c->uniform_data, + count * sizeof(*ulist->data)); + ulist->contents = ralloc_array(prog_data, enum quniform_contents, count); + memcpy(ulist->contents, c->uniform_contents, + count * sizeof(*ulist->contents)); +} + +/* Copy the compiler UBO range state to the compiled shader, dropping out + * arrays that were never referenced by an indirect load. + * + * (Note that QIR dead code elimination of an array access still leaves that + * array alive, though) + */ +static void +v3d_set_prog_data_ubo(struct v3d_compile *c, + struct v3d_prog_data *prog_data) +{ + if (!c->num_ubo_ranges) + return; + + prog_data->num_ubo_ranges = 0; + prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range, + c->num_ubo_ranges); + for (int i = 0; i < c->num_ubo_ranges; i++) { + if (!c->ubo_range_used[i]) + continue; + + struct v3d_ubo_range *range = &c->ubo_ranges[i]; + prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range; + prog_data->ubo_size += range->size; + } + + if (prog_data->ubo_size) { + if (V3D_DEBUG & V3D_DEBUG_SHADERDB) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, + prog_data->ubo_size / 4); + } + } +} + +static void +v3d_set_prog_data(struct v3d_compile *c, + struct v3d_prog_data *prog_data) +{ + v3d_set_prog_data_uniforms(c, prog_data); + v3d_set_prog_data_ubo(c, prog_data); +} + +static uint64_t * +v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size) +{ + *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t); + + uint64_t *qpu_insts = malloc(*final_assembly_size); + if (!qpu_insts) + return NULL; + + memcpy(qpu_insts, c->qpu_insts, *final_assembly_size); + + vir_compile_destroy(c); + + return qpu_insts; +} + +uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler, + struct v3d_vs_key *key, + struct v3d_vs_prog_data *prog_data, + nir_shader *s, + int program_id, int variant_id, + uint32_t *final_assembly_size) +{ + struct v3d_compile *c = vir_compile_init(compiler, &key->base, s, + program_id, variant_id); + + c->vs_key = key; + + v3d_lower_nir(c); + + if (key->clamp_color) + NIR_PASS_V(c->s, nir_lower_clamp_color_outputs); + + if (key->base.ucp_enables) { + NIR_PASS_V(c->s, nir_lower_clip_vs, key->base.ucp_enables); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, + 
nir_var_shader_out); + } + + /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */ + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); + + v3d_lower_nir_late(c); + v3d_optimize_nir(c->s); + NIR_PASS_V(c->s, nir_convert_from_ssa, true); + + v3d_nir_to_vir(c); + + v3d_set_prog_data(c, &prog_data->base); + + prog_data->base.num_inputs = c->num_inputs; + + /* The vertex data gets format converted by the VPM so that + * each attribute channel takes up a VPM column. Precompute + * the sizes for the shader record. + */ + for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) { + prog_data->vattr_sizes[i] = c->vattr_sizes[i]; + prog_data->vpm_input_size += c->vattr_sizes[i]; + } + + /* Input/output segment size are in 8x32-bit multiples. */ + prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8; + prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8; + + prog_data->uses_vid = (s->info.system_values_read & + (1ull << SYSTEM_VALUE_VERTEX_ID)); + prog_data->uses_iid = (s->info.system_values_read & + (1ull << SYSTEM_VALUE_INSTANCE_ID)); + + return v3d_return_qpu_insts(c, final_assembly_size); +} + +static void +v3d_set_fs_prog_data_inputs(struct v3d_compile *c, + struct v3d_fs_prog_data *prog_data) +{ + prog_data->base.num_inputs = c->num_inputs; + memcpy(prog_data->input_slots, c->input_slots, + c->num_inputs * sizeof(*c->input_slots)); + + for (int i = 0; i < c->num_inputs; i++) { + struct v3d_varying_slot v3d_slot = c->input_slots[i]; + uint8_t slot = v3d_slot_get_slot(v3d_slot); + + if (slot == VARYING_SLOT_COL0 || + slot == VARYING_SLOT_COL1 || + slot == VARYING_SLOT_BFC0 || + slot == VARYING_SLOT_BFC1) { + BITSET_SET(prog_data->color_inputs, i); + } + + if (BITSET_TEST(c->flat_shade_flags, i)) + BITSET_SET(prog_data->flat_shade_flags, i); + } +} + +uint64_t *v3d_compile_fs(const struct v3d_compiler *compiler, + struct v3d_fs_key *key, + struct v3d_fs_prog_data *prog_data, + nir_shader *s, + int program_id, int variant_id, + uint32_t *final_assembly_size) +{ + struct v3d_compile *c = vir_compile_init(compiler, &key->base, s, + program_id, variant_id); + + c->fs_key = key; + + v3d_lower_nir(c); + + if (key->light_twoside) + NIR_PASS_V(c->s, nir_lower_two_sided_color); + + if (key->clamp_color) + NIR_PASS_V(c->s, nir_lower_clamp_color_outputs); + + if (key->alpha_test) { + NIR_PASS_V(c->s, nir_lower_alpha_test, key->alpha_test_func, + false); + } + + if (key->base.ucp_enables) + NIR_PASS_V(c->s, nir_lower_clip_fs, key->base.ucp_enables); + + /* Note: FS input scalarizing must happen after + * nir_lower_two_sided_color, which only handles a vec4 at a time. 
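+ *
+ * (Two-sided lowering selects between the whole COLn and BFCn input
+ * variables based on the facing direction; once the inputs were split
+ * into scalars it could no longer pair them up.)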
+ */ + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); + + v3d_lower_nir_late(c); + v3d_optimize_nir(c->s); + NIR_PASS_V(c->s, nir_convert_from_ssa, true); + + v3d_nir_to_vir(c); + + v3d_set_prog_data(c, &prog_data->base); + v3d_set_fs_prog_data_inputs(c, prog_data); + if (c->s->info.outputs_written & (1 << FRAG_RESULT_DEPTH)) + prog_data->writes_z = true; + + return v3d_return_qpu_insts(c, final_assembly_size); +} + +void +vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst) +{ + if (qinst->dst.file == QFILE_TEMP) + c->defs[qinst->dst.index] = NULL; + + list_del(&qinst->link); + free(qinst); +} + +struct qreg +vir_follow_movs(struct v3d_compile *c, struct qreg reg) +{ + /* XXX + int pack = reg.pack; + + while (reg.file == QFILE_TEMP && + c->defs[reg.index] && + (c->defs[reg.index]->op == QOP_MOV || + c->defs[reg.index]->op == QOP_FMOV) && + !c->defs[reg.index]->dst.pack && + !c->defs[reg.index]->src[0].pack) { + reg = c->defs[reg.index]->src[0]; + } + + reg.pack = pack; + */ + return reg; +} + +void +vir_compile_destroy(struct v3d_compile *c) +{ + vir_for_each_block(block, c) { + while (!list_empty(&block->instructions)) { + struct qinst *qinst = + list_first_entry(&block->instructions, + struct qinst, link); + vir_remove_instruction(c, qinst); + } + } + + ralloc_free(c); +} + +struct qreg +vir_uniform(struct v3d_compile *c, + enum quniform_contents contents, + uint32_t data) +{ + for (int i = 0; i < c->num_uniforms; i++) { + if (c->uniform_contents[i] == contents && + c->uniform_data[i] == data) { + return vir_reg(QFILE_UNIF, i); + } + } + + uint32_t uniform = c->num_uniforms++; + + if (uniform >= c->uniform_array_size) { + c->uniform_array_size = MAX2(MAX2(16, uniform + 1), + c->uniform_array_size * 2); + + c->uniform_data = reralloc(c, c->uniform_data, + uint32_t, + c->uniform_array_size); + c->uniform_contents = reralloc(c, c->uniform_contents, + enum quniform_contents, + c->uniform_array_size); + } + + c->uniform_contents[uniform] = contents; + c->uniform_data[uniform] = data; + + return vir_reg(QFILE_UNIF, uniform); +} + +void +vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf) +{ + struct qinst *last_inst = NULL; + + if (!list_empty(&c->cur_block->instructions)) + last_inst = (struct qinst *)c->cur_block->instructions.prev; + + if (src.file != QFILE_TEMP || + !c->defs[src.index] || + last_inst != c->defs[src.index]) { + /* XXX: Make the MOV be the appropriate type */ + last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src); + last_inst = (struct qinst *)c->cur_block->instructions.prev; + } + + vir_set_pf(last_inst, pf); +} + +#define OPTPASS(func) \ + do { \ + bool stage_progress = func(c); \ + if (stage_progress) { \ + progress = true; \ + if (print_opt_debug) { \ + fprintf(stderr, \ + "VIR opt pass %2d: %s progress\n", \ + pass, #func); \ + } \ + /*XXX vir_validate(c);*/ \ + } \ + } while (0) + +void +vir_optimize(struct v3d_compile *c) +{ + bool print_opt_debug = false; + int pass = 1; + + while (true) { + bool progress = false; + + OPTPASS(vir_opt_copy_propagate); + OPTPASS(vir_opt_dead_code); + + if (!progress) + break; + + pass++; + } +} + +const char * +vir_get_stage_name(struct v3d_compile *c) +{ + if (c->vs_key && c->vs_key->is_coord) + return "MESA_SHADER_COORD"; + else + return gl_shader_stage_name(c->s->stage); +} diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c new file mode 100644 index 00000000000..ad5c061a138 --- /dev/null +++ b/src/broadcom/compiler/vir_dump.c @@ -0,0 +1,339 @@ +/* + * 
Copyright © 2016-2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_compiler.h" + +static void +vir_print_reg(struct v3d_compile *c, struct qreg reg) +{ + static const char *files[] = { + [QFILE_TEMP] = "t", + [QFILE_VARY] = "v", + [QFILE_UNIF] = "u", + [QFILE_TLB] = "tlb", + [QFILE_TLBU] = "tlbu", + }; + static const char *quniform_names[] = { + [QUNIFORM_VIEWPORT_X_SCALE] = "vp_x_scale", + [QUNIFORM_VIEWPORT_Y_SCALE] = "vp_y_scale", + [QUNIFORM_VIEWPORT_Z_OFFSET] = "vp_z_offset", + [QUNIFORM_VIEWPORT_Z_SCALE] = "vp_z_scale", + }; + + switch (reg.file) { + + case QFILE_NULL: + fprintf(stderr, "null"); + break; + + case QFILE_LOAD_IMM: + fprintf(stderr, "0x%08x (%f)", reg.index, uif(reg.index)); + break; + + case QFILE_REG: + fprintf(stderr, "rf%d", reg.index); + break; + + case QFILE_MAGIC: + fprintf(stderr, "%s", v3d_qpu_magic_waddr_name(reg.index)); + break; + + case QFILE_SMALL_IMM: + if ((int)reg.index >= -16 && (int)reg.index <= 15) + fprintf(stderr, "%d", reg.index); + else + fprintf(stderr, "%f", uif(reg.index)); + break; + + case QFILE_VPM: + fprintf(stderr, "vpm%d.%d", + reg.index / 4, reg.index % 4); + break; + + case QFILE_TLB: + fprintf(stderr, "%s", files[reg.file]); + break; + + case QFILE_UNIF: { + enum quniform_contents contents = c->uniform_contents[reg.index]; + + fprintf(stderr, "%s%d", files[reg.file], reg.index); + + switch (contents) { + case QUNIFORM_CONSTANT: + fprintf(stderr, " (0x%08x / %f)", + c->uniform_data[reg.index], + uif(c->uniform_data[reg.index])); + break; + + case QUNIFORM_UNIFORM: + fprintf(stderr, " (push[%d])", + c->uniform_data[reg.index]); + break; + + case QUNIFORM_TEXTURE_CONFIG_P1: + fprintf(stderr, " (tex[%d].p1)", + c->uniform_data[reg.index]); + break; + + case QUNIFORM_TEXTURE_WIDTH: + fprintf(stderr, " (tex[%d].width)", + c->uniform_data[reg.index]); + break; + case QUNIFORM_TEXTURE_HEIGHT: + fprintf(stderr, " (tex[%d].height)", + c->uniform_data[reg.index]); + break; + case QUNIFORM_TEXTURE_DEPTH: + fprintf(stderr, " (tex[%d].depth)", + c->uniform_data[reg.index]); + break; + case QUNIFORM_TEXTURE_ARRAY_SIZE: + fprintf(stderr, " (tex[%d].array_size)", + c->uniform_data[reg.index]); + break; + case QUNIFORM_TEXTURE_LEVELS: + fprintf(stderr, " (tex[%d].levels)", + c->uniform_data[reg.index]); + break; + + case QUNIFORM_UBO_ADDR: + fprintf(stderr, " (ubo[%d])", + c->uniform_data[reg.index]); + break; + + default: + if 
(quniform_contents_is_texture_p0(contents)) { + fprintf(stderr, " (tex[%d].p0: 0x%08x)", + contents - QUNIFORM_TEXTURE_CONFIG_P0_0, + c->uniform_data[reg.index]); + } else if (contents < ARRAY_SIZE(quniform_names)) { + fprintf(stderr, " (%s)", + quniform_names[contents]); + } else { + fprintf(stderr, " (%d / 0x%08x)", contents, + c->uniform_data[reg.index]); + } + } + + break; + } + + default: + fprintf(stderr, "%s%d", files[reg.file], reg.index); + break; + } +} + +static void +vir_dump_sig(struct v3d_compile *c, struct qinst *inst) +{ + struct v3d_qpu_sig *sig = &inst->qpu.sig; + + if (sig->thrsw) + fprintf(stderr, "; thrsw"); + if (sig->ldvary) + fprintf(stderr, "; ldvary"); + if (sig->ldvpm) + fprintf(stderr, "; ldvpm"); + if (sig->ldtmu) + fprintf(stderr, "; ldtmu"); + if (sig->ldunif) + fprintf(stderr, "; ldunif"); + if (sig->wrtmuc) + fprintf(stderr, "; wrtmuc"); +} + +static void +vir_dump_alu(struct v3d_compile *c, struct qinst *inst) +{ + struct v3d_qpu_instr *instr = &inst->qpu; + int nsrc = vir_get_non_sideband_nsrc(inst); + int sideband_nsrc = vir_get_nsrc(inst); + enum v3d_qpu_input_unpack unpack[2]; + + if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) { + fprintf(stderr, "%s", v3d_qpu_add_op_name(instr->alu.add.op)); + fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.ac)); + fprintf(stderr, "%s", v3d_qpu_pf_name(instr->flags.apf)); + fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.auf)); + fprintf(stderr, " "); + + vir_print_reg(c, inst->dst); + fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack)); + + unpack[0] = instr->alu.add.a_unpack; + unpack[1] = instr->alu.add.b_unpack; + } else { + fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); + fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc)); + fprintf(stderr, "%s", v3d_qpu_pf_name(instr->flags.mpf)); + fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.muf)); + fprintf(stderr, " "); + + vir_print_reg(c, inst->dst); + fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack)); + + unpack[0] = instr->alu.mul.a_unpack; + unpack[1] = instr->alu.mul.b_unpack; + } + + for (int i = 0; i < sideband_nsrc; i++) { + fprintf(stderr, ", "); + vir_print_reg(c, inst->src[i]); + if (i < nsrc) + fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i])); + } + + vir_dump_sig(c, inst); +} + +void +vir_dump_inst(struct v3d_compile *c, struct qinst *inst) +{ + struct v3d_qpu_instr *instr = &inst->qpu; + + switch (inst->qpu.type) { + case V3D_QPU_INSTR_TYPE_ALU: + vir_dump_alu(c, inst); + break; + case V3D_QPU_INSTR_TYPE_BRANCH: + fprintf(stderr, "b"); + if (instr->branch.ub) + fprintf(stderr, "u"); + + fprintf(stderr, "%s", + v3d_qpu_branch_cond_name(instr->branch.cond)); + fprintf(stderr, "%s", v3d_qpu_msfign_name(instr->branch.msfign)); + + switch (instr->branch.bdi) { + case V3D_QPU_BRANCH_DEST_ABS: + fprintf(stderr, " zero_addr+0x%08x", instr->branch.offset); + break; + + case V3D_QPU_BRANCH_DEST_REL: + fprintf(stderr, " %d", instr->branch.offset); + break; + + case V3D_QPU_BRANCH_DEST_LINK_REG: + fprintf(stderr, " lri"); + break; + + case V3D_QPU_BRANCH_DEST_REGFILE: + fprintf(stderr, " rf%d", instr->branch.raddr_a); + break; + } + + if (instr->branch.ub) { + switch (instr->branch.bdu) { + case V3D_QPU_BRANCH_DEST_ABS: + fprintf(stderr, ", a:unif"); + break; + + case V3D_QPU_BRANCH_DEST_REL: + fprintf(stderr, ", r:unif"); + break; + + case V3D_QPU_BRANCH_DEST_LINK_REG: + fprintf(stderr, ", lri"); + break; + + case V3D_QPU_BRANCH_DEST_REGFILE: + fprintf(stderr, ", rf%d", instr->branch.raddr_a); + 
break; + } + } + + if (vir_has_implicit_uniform(inst)) { + fprintf(stderr, " "); + vir_print_reg(c, inst->src[vir_get_implicit_uniform_src(inst)]); + } + + break; + } +} + +void +vir_dump(struct v3d_compile *c) +{ + int ip = 0; + + vir_for_each_block(block, c) { + fprintf(stderr, "BLOCK %d:\n", block->index); + vir_for_each_inst(inst, block) { + if (c->temp_start) { + bool first = true; + + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] != ip) + continue; + + if (first) { + first = false; + } else { + fprintf(stderr, ", "); + } + fprintf(stderr, "S%4d", i); + } + + if (first) + fprintf(stderr, " "); + else + fprintf(stderr, " "); + } + + if (c->temp_end) { + bool first = true; + + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_end[i] != ip) + continue; + + if (first) { + first = false; + } else { + fprintf(stderr, ", "); + } + fprintf(stderr, "E%4d", i); + } + + if (first) + fprintf(stderr, " "); + else + fprintf(stderr, " "); + } + + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + ip++; + } + if (block->successors[1]) { + fprintf(stderr, "-> BLOCK %d, %d\n", + block->successors[0]->index, + block->successors[1]->index); + } else if (block->successors[0]) { + fprintf(stderr, "-> BLOCK %d\n", + block->successors[0]->index); + } + } +} diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c new file mode 100644 index 00000000000..217b716fd9f --- /dev/null +++ b/src/broadcom/compiler/vir_live_variables.c @@ -0,0 +1,340 @@ +/* + * Copyright © 2012 Intel Corporation + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define MAX_INSTRUCTION (1 << 30) + +#include "util/ralloc.h" +#include "util/register_allocate.h" +#include "v3d_compiler.h" + +struct partial_update_state { + struct qinst *insts[4]; + uint8_t channels; +}; + +static uint32_t +int_hash(const void *key) +{ + return _mesa_hash_data(key, sizeof(int)); +} + +static bool +int_compare(const void *key1, const void *key2) +{ + return *(const int *)key1 == *(const int *)key2; +} + +static int +vir_reg_to_var(struct qreg reg) +{ + if (reg.file == QFILE_TEMP) + return reg.index; + + return -1; +} + +static void +vir_setup_use(struct v3d_compile *c, struct qblock *block, int ip, + struct qreg src) +{ + int var = vir_reg_to_var(src); + if (var == -1) + return; + + c->temp_start[var] = MIN2(c->temp_start[var], ip); + c->temp_end[var] = MAX2(c->temp_end[var], ip); + + /* The use[] bitset marks when the block makes + * use of a variable without having completely + * defined that variable within the block. + */ + if (!BITSET_TEST(block->def, var)) + BITSET_SET(block->use, var); +} + +static struct partial_update_state * +get_partial_update_state(struct hash_table *partial_update_ht, + struct qinst *inst) +{ + struct hash_entry *entry = + _mesa_hash_table_search(partial_update_ht, + &inst->dst.index); + if (entry) + return entry->data; + + struct partial_update_state *state = + rzalloc(partial_update_ht, struct partial_update_state); + + _mesa_hash_table_insert(partial_update_ht, &inst->dst.index, state); + + return state; +} + +static void +vir_setup_def(struct v3d_compile *c, struct qblock *block, int ip, + struct hash_table *partial_update_ht, struct qinst *inst) +{ + if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU) + return; + + /* The def[] bitset marks when an initialization in a + * block completely screens off previous updates of + * that variable. + */ + int var = vir_reg_to_var(inst->dst); + if (var == -1) + return; + + c->temp_start[var] = MIN2(c->temp_start[var], ip); + c->temp_end[var] = MAX2(c->temp_end[var], ip); + + /* If we've already tracked this as a def, or already used it within + * the block, there's nothing to do. + */ + if (BITSET_TEST(block->use, var) || BITSET_TEST(block->def, var)) + return; + + /* Easy, common case: unconditional full register update. + * + * We treat conditioning on the exec mask as the same as not being + * conditional. This makes sure that if the register gets set on + * either side of an if, it is treated as being screened off before + * the if. Otherwise, if there was no intervening def, its live + * interval doesn't extend back to the start of he program, and if too + * many registers did that we'd fail to register allocate. + */ + if (((inst->qpu.flags.ac == V3D_QPU_COND_NONE && + inst->qpu.flags.mc == V3D_QPU_COND_NONE) || + inst->cond_is_exec_mask) && + inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE && + inst->qpu.alu.mul.output_pack == V3D_QPU_PACK_NONE) { + BITSET_SET(block->def, var); + return; + } + + /* Finally, look at the condition code and packing and mark it as a + * def. We need to make sure that we understand sequences + * instructions like: + * + * mov.zs t0, t1 + * mov.zc t0, t2 + * + * or: + * + * mmov t0.8a, t1 + * mmov t0.8b, t2 + * mmov t0.8c, t3 + * mmov t0.8d, t4 + * + * as defining the temp within the block, because otherwise dst's live + * range will get extended up the control flow to the top of the + * program. 
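+ *
+ * To do that, the partial_update_state below accumulates which channels
+ * of the destination temp have been written within the block, and the
+ * temp only lands in the block's def set once the writes are known to
+ * cover the whole register.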
+ */ + struct partial_update_state *state = + get_partial_update_state(partial_update_ht, inst); + uint8_t mask = 0xf; /* XXX vir_channels_written(inst); */ + + if (inst->qpu.flags.ac == V3D_QPU_COND_NONE && + inst->qpu.flags.mc == V3D_QPU_COND_NONE) { + state->channels |= mask; + } else { + for (int i = 0; i < 4; i++) { + if (!(mask & (1 << i))) + continue; + + /* XXXif (state->insts[i] && + state->insts[i]->cond == + qpu_cond_complement(inst->cond)) + state->channels |= 1 << i; + else + */ + state->insts[i] = inst; + } + } + + if (state->channels == 0xf) + BITSET_SET(block->def, var); +} + +static void +sf_state_clear(struct hash_table *partial_update_ht) +{ + struct hash_entry *entry; + + hash_table_foreach(partial_update_ht, entry) { + struct partial_update_state *state = entry->data; + + for (int i = 0; i < 4; i++) { + if (state->insts[i] && + (state->insts[i]->qpu.flags.ac != V3D_QPU_COND_NONE || + state->insts[i]->qpu.flags.mc != V3D_QPU_COND_NONE)) + state->insts[i] = NULL; + } + } +} + +/* Sets up the def/use arrays for when variables are used-before-defined or + * defined-before-used in the block. + * + * Also initializes the temp_start/temp_end to cover just the instruction IPs + * where the variable is used, which will be extended later in + * vir_compute_start_end(). + */ +static void +vir_setup_def_use(struct v3d_compile *c) +{ + struct hash_table *partial_update_ht = + _mesa_hash_table_create(c, int_hash, int_compare); + int ip = 0; + + vir_for_each_block(block, c) { + block->start_ip = ip; + + _mesa_hash_table_clear(partial_update_ht, NULL); + + vir_for_each_inst(inst, block) { + for (int i = 0; i < vir_get_nsrc(inst); i++) + vir_setup_use(c, block, ip, inst->src[i]); + + vir_setup_def(c, block, ip, partial_update_ht, inst); + + if (false /* XXX inst->uf */) + sf_state_clear(partial_update_ht); + + /* Payload registers: r0/1/2 contain W, centroid W, + * and Z at program start. Register allocation will + * force their nodes to R0/1/2. + */ + if (inst->src[0].file == QFILE_REG) { + switch (inst->src[0].index) { + case 0: + case 1: + case 2: + c->temp_start[inst->dst.index] = 0; + break; + } + } + + ip++; + } + block->end_ip = ip; + } + + _mesa_hash_table_destroy(partial_update_ht, NULL); +} + +static bool +vir_live_variables_dataflow(struct v3d_compile *c, int bitset_words) +{ + bool cont = false; + + vir_for_each_block_rev(block, c) { + /* Update live_out: Any successor using the variable + * on entrance needs us to have the variable live on + * exit. + */ + vir_for_each_successor(succ, block) { + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_live_out = (succ->live_in[i] & + ~block->live_out[i]); + if (new_live_out) { + block->live_out[i] |= new_live_out; + cont = true; + } + } + } + + /* Update live_in */ + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_live_in = (block->use[i] | + (block->live_out[i] & + ~block->def[i])); + if (new_live_in & ~block->live_in[i]) { + block->live_in[i] |= new_live_in; + cont = true; + } + } + } + + return cont; +} + +/** + * Extend the start/end ranges for each variable to account for the + * new information calculated from control flow. 
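+ *
+ * For example, a temp that is live_out of the block at a loop's back
+ * edge gets its range stretched to that block's end IP even if its last
+ * explicit use comes earlier in program order, so the register stays
+ * allocated across the whole loop.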
+ */ +static void +vir_compute_start_end(struct v3d_compile *c, int num_vars) +{ + vir_for_each_block(block, c) { + for (int i = 0; i < num_vars; i++) { + if (BITSET_TEST(block->live_in, i)) { + c->temp_start[i] = MIN2(c->temp_start[i], + block->start_ip); + c->temp_end[i] = MAX2(c->temp_end[i], + block->start_ip); + } + + if (BITSET_TEST(block->live_out, i)) { + c->temp_start[i] = MIN2(c->temp_start[i], + block->end_ip); + c->temp_end[i] = MAX2(c->temp_end[i], + block->end_ip); + } + } + } +} + +void +vir_calculate_live_intervals(struct v3d_compile *c) +{ + int bitset_words = BITSET_WORDS(c->num_temps); + + /* If we called this function more than once, then we should be + * freeing the previous arrays. + */ + assert(!c->temp_start); + + c->temp_start = rzalloc_array(c, int, c->num_temps); + c->temp_end = rzalloc_array(c, int, c->num_temps); + + for (int i = 0; i < c->num_temps; i++) { + c->temp_start[i] = MAX_INSTRUCTION; + c->temp_end[i] = -1; + } + + vir_for_each_block(block, c) { + block->def = rzalloc_array(c, BITSET_WORD, bitset_words); + block->use = rzalloc_array(c, BITSET_WORD, bitset_words); + block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words); + block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words); + } + + vir_setup_def_use(c); + + while (vir_live_variables_dataflow(c, bitset_words)) + ; + + vir_compute_start_end(c, c->num_temps); +} diff --git a/src/broadcom/compiler/vir_lower_uniforms.c b/src/broadcom/compiler/vir_lower_uniforms.c new file mode 100644 index 00000000000..b2741994a2d --- /dev/null +++ b/src/broadcom/compiler/vir_lower_uniforms.c @@ -0,0 +1,209 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file v3d_vir_lower_uniforms.c + * + * This is the pre-code-generation pass for fixing up instructions that try to + * read from multiple uniform values. 
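+ *
+ * Only one uniform stream value can be read per instruction, so
+ * something like
+ *
+ *     add t2, u0, u1
+ *
+ * has to become
+ *
+ *     mov t1, u1
+ *     add t2, u0, t1
+ *
+ * The pass repeatedly picks the most-referenced uniform among the
+ * offending instructions and moves it into a temp, so that as few MOVs
+ * as possible are added.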
+ */ + +#include "v3d_compiler.h" +#include "util/hash_table.h" +#include "util/u_math.h" + +static inline uint32_t +index_hash(const void *key) +{ + return (uintptr_t)key; +} + +static inline bool +index_compare(const void *a, const void *b) +{ + return a == b; +} + +static void +add_uniform(struct hash_table *ht, struct qreg reg) +{ + struct hash_entry *entry; + void *key = (void *)(uintptr_t)(reg.index + 1); + + entry = _mesa_hash_table_search(ht, key); + if (entry) { + entry->data++; + } else { + _mesa_hash_table_insert(ht, key, (void *)(uintptr_t)1); + } +} + +static void +remove_uniform(struct hash_table *ht, struct qreg reg) +{ + struct hash_entry *entry; + void *key = (void *)(uintptr_t)(reg.index + 1); + + entry = _mesa_hash_table_search(ht, key); + assert(entry); + entry->data--; + if (entry->data == NULL) + _mesa_hash_table_remove(ht, entry); +} + +static bool +is_lowerable_uniform(struct qinst *inst, int i) +{ + if (inst->src[i].file != QFILE_UNIF) + return false; + if (vir_has_implicit_uniform(inst)) + return i != vir_get_implicit_uniform_src(inst); + return true; +} + +/* Returns the number of different uniform values referenced by the + * instruction. + */ +static uint32_t +vir_get_instruction_uniform_count(struct qinst *inst) +{ + uint32_t count = 0; + + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file != QFILE_UNIF) + continue; + + bool is_duplicate = false; + for (int j = 0; j < i; j++) { + if (inst->src[j].file == QFILE_UNIF && + inst->src[j].index == inst->src[i].index) { + is_duplicate = true; + break; + } + } + if (!is_duplicate) + count++; + } + + return count; +} + +void +vir_lower_uniforms(struct v3d_compile *c) +{ + struct hash_table *ht = + _mesa_hash_table_create(c, index_hash, index_compare); + + /* Walk the instruction list, finding which instructions have more + * than one uniform referenced, and add those uniform values to the + * ht. + */ + vir_for_each_inst_inorder(inst, c) { + uint32_t nsrc = vir_get_nsrc(inst); + + if (vir_get_instruction_uniform_count(inst) <= 1) + continue; + + for (int i = 0; i < nsrc; i++) { + if (is_lowerable_uniform(inst, i)) + add_uniform(ht, inst->src[i]); + } + } + + while (ht->entries) { + /* Find the most commonly used uniform in instructions that + * need a uniform lowered. + */ + uint32_t max_count = 0; + uint32_t max_index = 0; + struct hash_entry *entry; + hash_table_foreach(ht, entry) { + uint32_t count = (uintptr_t)entry->data; + uint32_t index = (uintptr_t)entry->key - 1; + if (count > max_count) { + max_count = count; + max_index = index; + } + } + + struct qreg unif = vir_reg(QFILE_UNIF, max_index); + + /* Now, find the instructions using this uniform and make them + * reference a temp instead. + */ + vir_for_each_block(block, c) { + struct qinst *mov = NULL; + + vir_for_each_inst(inst, block) { + uint32_t nsrc = vir_get_nsrc(inst); + + uint32_t count = vir_get_instruction_uniform_count(inst); + + if (count <= 1) + continue; + + /* If the block doesn't have a load of the + * uniform yet, add it. We could potentially + * do better and CSE MOVs from multiple blocks + * into dominating blocks, except that may + * cause troubles for register allocation. 
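+ *
+ * (Hoisting the MOV into a dominating block would stretch the temp's
+ * live range across more of the program and raise register pressure.)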
+ */ + if (!mov) { + mov = vir_mul_inst(V3D_QPU_M_MOV, + vir_get_temp(c), + unif, c->undef); + list_add(&mov->link, + &block->instructions); + c->defs[mov->dst.index] = mov; + } + + bool removed = false; + for (int i = 0; i < nsrc; i++) { + if (is_lowerable_uniform(inst, i) && + inst->src[i].index == max_index) { + inst->src[i].file = + mov->dst.file; + inst->src[i].index = + mov->dst.index; + remove_uniform(ht, unif); + removed = true; + } + } + if (removed) + count--; + + /* If the instruction doesn't need lowering any more, + * then drop it from the list. + */ + if (count <= 1) { + for (int i = 0; i < nsrc; i++) { + if (is_lowerable_uniform(inst, i)) + remove_uniform(ht, inst->src[i]); + } + } + } + } + } + + _mesa_hash_table_destroy(ht, NULL); +} diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c new file mode 100644 index 00000000000..2a22a1b5521 --- /dev/null +++ b/src/broadcom/compiler/vir_opt_copy_propagate.c @@ -0,0 +1,233 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file v3d_opt_copy_propagation.c + * + * This implements simple copy propagation for VIR without control flow. + * + * For each temp, it keeps a qreg of which source it was MOVed from, if it + * was. If we see that used later, we can just reuse the source value, since + * we know we don't have control flow, and we have SSA for our values so + * there's no killing to worry about. + */ + +#include "v3d_compiler.h" + +static bool +is_copy_mov(struct qinst *inst) +{ + if (!inst) + return false; + + if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || + (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV && + inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) { + return false; + } + + if (inst->dst.file != QFILE_TEMP) + return false; + + if (inst->src[0].file != QFILE_TEMP && + inst->src[0].file != QFILE_UNIF) { + return false; + } + + if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE || + inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) { + return false; + } + + if (inst->qpu.flags.ac != V3D_QPU_COND_NONE || + inst->qpu.flags.mc != V3D_QPU_COND_NONE) { + return false; + } + + switch (inst->src[0].file) { + case QFILE_MAGIC: + /* No copy propagating from R3/R4/R5 -- the MOVs from those + * are there to register allocate values produced into R3/4/5 + * to other regs (though hopefully r3/4/5). 
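+ *
+ * (Those accumulators get clobbered implicitly -- SFU and TMU results
+ * land in r4, for instance (see vir_writes_r4()) -- so their values are
+ * only safe to reach through the temporary that the MOV produces.)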
+ */ + switch (inst->src[0].index) { + case V3D_QPU_WADDR_R3: + case V3D_QPU_WADDR_R4: + case V3D_QPU_WADDR_R5: + return false; + default: + break; + } + break; + + case QFILE_REG: + switch (inst->src[0].index) { + case 0: + case 1: + case 2: + /* MOVs from rf0/1/2 are only to track the live + * intervals for W/centroid W/Z. + */ + return false; + } + break; + + default: + break; + } + + return true; +} + +static bool +vir_has_unpack(struct qinst *inst, int chan) +{ + assert(chan == 0 || chan == 1); + + if (vir_is_add(inst)) { + if (chan == 0) + return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE; + else + return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE; + } else { + if (chan == 0) + return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE; + else + return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE; + } +} + +static bool +try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) +{ + bool debug = false; + bool progress = false; + + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file != QFILE_TEMP) + continue; + + /* We have two ways of finding MOVs we can copy propagate + * from. One is if it's an SSA def: then we can reuse it from + * any block in the program, as long as its source is also an + * SSA def. Alternatively, if it's in the "movs" array + * tracked within the block, then we know the sources for it + * haven't been changed since we saw the instruction within + * our block. + */ + struct qinst *mov = movs[inst->src[i].index]; + if (!mov) { + if (!is_copy_mov(c->defs[inst->src[i].index])) + continue; + mov = c->defs[inst->src[i].index]; + + if (mov->src[0].file == QFILE_TEMP && + !c->defs[mov->src[0].index]) + continue; + } + + if (vir_has_unpack(mov, 0)) { + /* Make sure that the meaning of the unpack + * would be the same between the two + * instructions. + */ + if (vir_is_float_input(inst) != + vir_is_float_input(mov)) { + continue; + } + /* No composing the unpacks. */ + if (vir_has_unpack(inst, i)) + continue; + } + + if (debug) { + fprintf(stderr, "Copy propagate: "); + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + + inst->src[i] = mov->src[0]; + if (vir_has_unpack(mov, 0)) { + enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack; + + vir_set_unpack(inst, i, unpack); + } + + if (debug) { + fprintf(stderr, "to: "); + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + + progress = true; + } + + return progress; +} + +static void +apply_kills(struct v3d_compile *c, struct qinst **movs, struct qinst *inst) +{ + if (inst->dst.file != QFILE_TEMP) + return; + + for (int i = 0; i < c->num_temps; i++) { + if (movs[i] && + (movs[i]->dst.index == inst->dst.index || + (movs[i]->src[0].file == QFILE_TEMP && + movs[i]->src[0].index == inst->dst.index))) { + movs[i] = NULL; + } + } +} + +bool +vir_opt_copy_propagate(struct v3d_compile *c) +{ + bool progress = false; + struct qinst **movs; + + movs = ralloc_array(c, struct qinst *, c->num_temps); + if (!movs) + return false; + + vir_for_each_block(block, c) { + /* The MOVs array tracks only available movs within the + * block. 
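+ *
+ * So given
+ *
+ *     mov t1, t0
+ *     fadd t2, t1, t1
+ *
+ * the FADD can be rewritten as "fadd t2, t0, t0", unless apply_kills()
+ * saw t0 or t1 redefined in between and dropped the entry.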
+ */ + memset(movs, 0, sizeof(struct qinst *) * c->num_temps); + + vir_for_each_inst(inst, block) { + progress = try_copy_prop(c, inst, movs) || progress; + + apply_kills(c, movs, inst); + + if (is_copy_mov(inst)) + movs[inst->dst.index] = inst; + } + } + + ralloc_free(movs); + + return progress; +} diff --git a/src/broadcom/compiler/vir_opt_dead_code.c b/src/broadcom/compiler/vir_opt_dead_code.c new file mode 100644 index 00000000000..9e0ef20b6db --- /dev/null +++ b/src/broadcom/compiler/vir_opt_dead_code.c @@ -0,0 +1,162 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file v3d_opt_dead_code.c + * + * This is a simple dead code eliminator for SSA values in VIR. + * + * It walks all the instructions finding what temps are used, then walks again + * to remove instructions writing unused temps. + * + * This is an inefficient implementation if you have long chains of + * instructions where the entire chain is dead, but we expect those to have + * been eliminated at the NIR level, and here we're just cleaning up small + * problems produced by NIR->VIR. + */ + +#include "v3d_compiler.h" + +static bool debug; + +static void +dce(struct v3d_compile *c, struct qinst *inst) +{ + if (debug) { + fprintf(stderr, "Removing: "); + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + assert(inst->qpu.flags.apf == V3D_QPU_PF_NONE); + assert(inst->qpu.flags.mpf == V3D_QPU_PF_NONE); + vir_remove_instruction(c, inst); +} + +static bool +has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst) +{ + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file == QFILE_VPM) { + /* Instance ID, Vertex ID: Should have been removed at + * the NIR level + */ + if (inst->src[i].index == ~0) + return true; + + uint32_t attr = inst->src[i].index / 4; + uint32_t offset = inst->src[i].index % 4; + + if (c->vattr_sizes[attr] != offset) + return true; + + /* Can't get rid of the last VPM read, or the + * simulator (at least) throws an error. + */ + uint32_t total_size = 0; + for (uint32_t i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++) + total_size += c->vattr_sizes[i]; + if (total_size == 1) + return true; + } + + /* Dead code removal of varyings is tricky, so just assert + * that it all happened at the NIR level. 
+ */ + if (inst->src[i].file == QFILE_VARY) + return true; + } + + return false; +} + +bool +vir_opt_dead_code(struct v3d_compile *c) +{ + bool progress = false; + bool *used = calloc(c->num_temps, sizeof(bool)); + + vir_for_each_inst_inorder(inst, c) { + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file == QFILE_TEMP) + used[inst->src[i].index] = true; + } + } + + vir_for_each_block(block, c) { + vir_for_each_inst_safe(inst, block) { + if (inst->dst.file != QFILE_NULL && + !(inst->dst.file == QFILE_TEMP && + !used[inst->dst.index])) { + continue; + } + + if (vir_has_side_effects(c, inst)) + continue; + + if (inst->qpu.flags.apf != V3D_QPU_PF_NONE || + inst->qpu.flags.mpf != V3D_QPU_PF_NONE|| + has_nonremovable_reads(c, inst)) { + /* If we can't remove the instruction, but we + * don't need its destination value, just + * remove the destination. The register + * allocator would trivially color it and it + * wouldn't cause any register pressure, but + * it's nicer to read the VIR code without + * unused destination regs. + */ + if (inst->dst.file == QFILE_TEMP) { + if (debug) { + fprintf(stderr, + "Removing dst from: "); + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + c->defs[inst->dst.index] = NULL; + inst->dst.file = QFILE_NULL; + progress = true; + } + continue; + } + + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file != QFILE_VPM) + continue; + uint32_t attr = inst->src[i].index / 4; + uint32_t offset = (inst->src[i].index % 4); + + if (c->vattr_sizes[attr] == offset) { + c->num_inputs--; + c->vattr_sizes[attr]--; + } + } + + dce(c, inst); + progress = true; + continue; + } + } + + free(used); + + return progress; +} diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c new file mode 100644 index 00000000000..9ebf2cd69b4 --- /dev/null +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -0,0 +1,254 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "util/ralloc.h" +#include "util/register_allocate.h" +#include "v3d_compiler.h" + +#define QPU_R(i) { .magic = false, .index = i } + +#define ACC_INDEX 0 +#define ACC_COUNT 5 +#define PHYS_INDEX (ACC_INDEX + ACC_COUNT) +#define PHYS_COUNT 64 + +bool +vir_init_reg_sets(struct v3d_compiler *compiler) +{ + compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, + true); + if (!compiler->regs) + return false; + + /* Allocate 3 regfile classes, for the ways the physical register file + * can be divided up for fragment shader threading. + */ + for (int threads = 0; threads < 3; threads++) { + compiler->reg_class[threads] = + ra_alloc_reg_class(compiler->regs); + + for (int i = PHYS_INDEX; + i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { + ra_class_add_reg(compiler->regs, + compiler->reg_class[threads], i); + } + + for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) { + ra_class_add_reg(compiler->regs, + compiler->reg_class[threads], i); + } + } + + ra_set_finalize(compiler->regs, NULL); + + return true; +} + +struct node_to_temp_map { + uint32_t temp; + uint32_t priority; +}; + +static int +node_to_temp_priority(const void *in_a, const void *in_b) +{ + const struct node_to_temp_map *a = in_a; + const struct node_to_temp_map *b = in_b; + + return a->priority - b->priority; +} + +#define CLASS_BIT_PHYS (1 << 0) +#define CLASS_BIT_R0_R2 (1 << 1) +#define CLASS_BIT_R3 (1 << 2) +#define CLASS_BIT_R4 (1 << 3) + +/** + * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. + * + * The return value should be freed by the caller. + */ +struct qpu_reg * +v3d_register_allocate(struct v3d_compile *c) +{ + struct node_to_temp_map map[c->num_temps]; + uint32_t temp_to_node[c->num_temps]; + uint8_t class_bits[c->num_temps]; + struct qpu_reg *temp_registers = calloc(c->num_temps, + sizeof(*temp_registers)); + int acc_nodes[ACC_COUNT]; + + struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs, + c->num_temps + + ARRAY_SIZE(acc_nodes)); + + /* Make some fixed nodes for the accumulators, which we will need to + * interfere with when ops have implied r3/r4 writes or for the thread + * switches. We could represent these as classes for the nodes to + * live in, but the classes take up a lot of memory to set up, so we + * don't want to make too many. + */ + for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) { + acc_nodes[i] = c->num_temps + i; + ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i); + } + + /* Compute the live ranges so we can figure out interference. */ + vir_calculate_live_intervals(c); + + for (uint32_t i = 0; i < c->num_temps; i++) { + map[i].temp = i; + map[i].priority = c->temp_end[i] - c->temp_start[i]; + } + qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority); + for (uint32_t i = 0; i < c->num_temps; i++) { + temp_to_node[map[i].temp] = i; + } + + /* Figure out our register classes and preallocated registers. We + * start with any temp being able to be in any file, then instructions + * incrementally remove bits that the temp definitely can't be in. + */ + memset(class_bits, + CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4, + sizeof(class_bits)); + + int ip = 0; + vir_for_each_inst_inorder(inst, c) { + /* If the instruction writes r3/r4 (and optionally moves its + * result to a temp), nothing else can be stored in r3/r4 across + * it. 
+ */ + if (vir_writes_r3(inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && + c->temp_end[i] > ip) { + ra_add_node_interference(g, + temp_to_node[i], + acc_nodes[3]); + } + } + } + if (vir_writes_r4(inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && + c->temp_end[i] > ip) { + ra_add_node_interference(g, + temp_to_node[i], + acc_nodes[4]); + } + } + } + + if (inst->src[0].file == QFILE_REG) { + switch (inst->src[0].index) { + case 0: + case 1: + case 2: + /* Payload setup instructions: Force allocate + * the dst to the given register (so the MOV + * will disappear). + */ + assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); + assert(inst->dst.file == QFILE_TEMP); + ra_set_node_reg(g, + temp_to_node[inst->dst.index], + PHYS_INDEX + + inst->src[0].index); + break; + } + } + +#if 0 + switch (inst->op) { + case QOP_THRSW: + /* All accumulators are invalidated across a thread + * switch. + */ + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) + class_bits[i] &= ~(CLASS_BIT_R0_R3 | + CLASS_BIT_R4); + } + break; + + default: + break; + } +#endif + + ip++; + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + ra_set_node_class(g, temp_to_node[i], + c->compiler->reg_class[c->fs_threaded]); + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + for (uint32_t j = i + 1; j < c->num_temps; j++) { + if (!(c->temp_start[i] >= c->temp_end[j] || + c->temp_start[j] >= c->temp_end[i])) { + ra_add_node_interference(g, + temp_to_node[i], + temp_to_node[j]); + } + } + } + + bool ok = ra_allocate(g); + if (!ok) { + if (!c->fs_threaded) { + fprintf(stderr, "Failed to register allocate:\n"); + vir_dump(c); + } + + c->failed = true; + free(temp_registers); + return NULL; + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + int ra_reg = ra_get_node_reg(g, temp_to_node[i]); + if (ra_reg < PHYS_INDEX) { + temp_registers[i].magic = true; + temp_registers[i].index = (V3D_QPU_WADDR_R0 + + ra_reg - ACC_INDEX); + } else { + temp_registers[i].magic = false; + temp_registers[i].index = ra_reg - PHYS_INDEX; + } + + /* If the value's never used, just write to the NOP register + * for clarity in debug output. + */ + if (c->temp_start[i] == c->temp_end[i]) { + temp_registers[i].magic = true; + temp_registers[i].index = V3D_QPU_WADDR_NOP; + } + } + + ralloc_free(g); + + return temp_registers; +} diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c new file mode 100644 index 00000000000..78bcea1e302 --- /dev/null +++ b/src/broadcom/compiler/vir_to_qpu.c @@ -0,0 +1,359 @@ +/* + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "compiler/v3d_compiler.h" +#include "qpu/qpu_instr.h" +#include "qpu/qpu_disasm.h" + +static inline struct qpu_reg +qpu_reg(int index) +{ + struct qpu_reg reg = { + .magic = false, + .index = index, + }; + return reg; +} + +static inline struct qpu_reg +qpu_magic(enum v3d_qpu_waddr waddr) +{ + struct qpu_reg reg = { + .magic = true, + .index = waddr, + }; + return reg; +} + +static inline struct qpu_reg +qpu_acc(int acc) +{ + return qpu_magic(V3D_QPU_WADDR_R0 + acc); +} + +struct v3d_qpu_instr +v3d_qpu_nop(void) +{ + struct v3d_qpu_instr instr = { + .type = V3D_QPU_INSTR_TYPE_ALU, + .alu = { + .add = { + .op = V3D_QPU_A_NOP, + .waddr = V3D_QPU_WADDR_NOP, + .magic_write = true, + }, + .mul = { + .op = V3D_QPU_M_NOP, + .waddr = V3D_QPU_WADDR_NOP, + .magic_write = true, + }, + } + }; + + return instr; +} + +static struct qinst * +vir_nop(void) +{ + struct qreg undef = { QFILE_NULL, 0 }; + struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); + + return qinst; +} + +static struct qinst * +new_qpu_nop_before(struct qinst *inst) +{ + struct qinst *q = vir_nop(); + + list_addtail(&q->link, &inst->link); + + return q; +} + +static void +new_ldunif_instr(struct qinst *inst, int i) +{ + struct qinst *ldunif = new_qpu_nop_before(inst); + + ldunif->qpu.sig.ldunif = true; + assert(inst->src[i].file == QFILE_UNIF); + ldunif->uniform = inst->src[i].index; +} + +/** + * Allocates the src register (accumulator or register file) into the RADDR + * fields of the instruction. 
+ */ +static void +set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) +{ + if (src.magic) { + assert(src.index >= V3D_QPU_WADDR_R0 && + src.index <= V3D_QPU_WADDR_R5); + *mux = src.index - V3D_QPU_WADDR_R0 + V3D_QPU_MUX_R0; + return; + } + + if (instr->alu.add.a != V3D_QPU_MUX_A && + instr->alu.add.b != V3D_QPU_MUX_A && + instr->alu.mul.a != V3D_QPU_MUX_A && + instr->alu.mul.b != V3D_QPU_MUX_A) { + instr->raddr_a = src.index; + *mux = V3D_QPU_MUX_A; + } else { + if (instr->raddr_a == src.index) { + *mux = V3D_QPU_MUX_A; + } else { + assert(!(instr->alu.add.a == V3D_QPU_MUX_B && + instr->alu.add.b == V3D_QPU_MUX_B && + instr->alu.mul.a == V3D_QPU_MUX_B && + instr->alu.mul.b == V3D_QPU_MUX_B) || + src.index == instr->raddr_b); + + instr->raddr_b = src.index; + *mux = V3D_QPU_MUX_B; + } + } +} + +static void +v3d_generate_code_block(struct v3d_compile *c, + struct qblock *block, + struct qpu_reg *temp_registers) +{ + int last_vpm_read_index = -1; + + vir_for_each_inst(qinst, block) { +#if 0 + fprintf(stderr, "translating qinst to qpu: "); + vir_dump_inst(c, qinst); + fprintf(stderr, "\n"); +#endif + + struct qinst *temp; + + if (vir_has_implicit_uniform(qinst)) { + int src = vir_get_implicit_uniform_src(qinst); + assert(qinst->src[src].file == QFILE_UNIF); + qinst->uniform = qinst->src[src].index; + c->num_uniforms++; + } + + int nsrc = vir_get_non_sideband_nsrc(qinst); + struct qpu_reg src[ARRAY_SIZE(qinst->src)]; + bool emitted_ldunif = false; + for (int i = 0; i < nsrc; i++) { + int index = qinst->src[i].index; + switch (qinst->src[i].file) { + case QFILE_REG: + src[i] = qpu_reg(qinst->src[i].index); + break; + case QFILE_MAGIC: + src[i] = qpu_magic(qinst->src[i].index); + break; + case QFILE_NULL: + case QFILE_LOAD_IMM: + src[i] = qpu_acc(0); + break; + case QFILE_TEMP: + src[i] = temp_registers[index]; + break; + case QFILE_UNIF: + if (!emitted_ldunif) { + new_ldunif_instr(qinst, i); + c->num_uniforms++; + emitted_ldunif = true; + } + + src[i] = qpu_acc(5); + break; + case QFILE_VARY: + temp = new_qpu_nop_before(qinst); + temp->qpu.sig.ldvary = true; + + src[i] = qpu_acc(3); + break; + case QFILE_SMALL_IMM: + abort(); /* XXX */ +#if 0 + src[i].mux = QPU_MUX_SMALL_IMM; + src[i].addr = qpu_encode_small_immediate(qinst->src[i].index); + /* This should only have returned a valid + * small immediate field, not ~0 for failure. 
+ */ + assert(src[i].addr <= 47); +#endif + break; + + case QFILE_VPM: + assert((int)qinst->src[i].index >= + last_vpm_read_index); + (void)last_vpm_read_index; + last_vpm_read_index = qinst->src[i].index; + + temp = new_qpu_nop_before(qinst); + temp->qpu.sig.ldvpm = true; + + src[i] = qpu_acc(3); + break; + + case QFILE_TLB: + case QFILE_TLBU: + unreachable("bad vir src file"); + } + } + + struct qpu_reg dst; + switch (qinst->dst.file) { + case QFILE_NULL: + dst = qpu_magic(V3D_QPU_WADDR_NOP); + break; + + case QFILE_REG: + dst = qpu_reg(qinst->dst.index); + break; + + case QFILE_MAGIC: + dst = qpu_magic(qinst->dst.index); + break; + + case QFILE_TEMP: + dst = temp_registers[qinst->dst.index]; + break; + + case QFILE_VPM: + dst = qpu_magic(V3D_QPU_WADDR_VPM); + break; + + case QFILE_TLB: + dst = qpu_magic(V3D_QPU_WADDR_TLB); + break; + + case QFILE_TLBU: + dst = qpu_magic(V3D_QPU_WADDR_TLBU); + break; + + case QFILE_VARY: + case QFILE_UNIF: + case QFILE_SMALL_IMM: + case QFILE_LOAD_IMM: + assert(!"not reached"); + break; + } + + if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { + if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) { + assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + if (nsrc >= 1) { + set_src(&qinst->qpu, + &qinst->qpu.alu.add.a, src[0]); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, + &qinst->qpu.alu.add.b, src[1]); + } + + qinst->qpu.alu.add.waddr = dst.index; + qinst->qpu.alu.add.magic_write = dst.magic; + } else { + if (nsrc >= 1) { + set_src(&qinst->qpu, + &qinst->qpu.alu.mul.a, src[0]); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, + &qinst->qpu.alu.mul.b, src[1]); + } + + qinst->qpu.alu.mul.waddr = dst.index; + qinst->qpu.alu.mul.magic_write = dst.magic; + } + } else { + assert(qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); + } + } +} + + +static void +v3d_dump_qpu(struct v3d_compile *c) +{ + fprintf(stderr, "%s prog %d/%d QPU:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + + for (int i = 0; i < c->qpu_inst_count; i++) { + const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]); + fprintf(stderr, "0x%016"PRIx64" %s\n", c->qpu_insts[i], str); + } + fprintf(stderr, "\n"); +} + +void +v3d_vir_to_qpu(struct v3d_compile *c) +{ + struct qpu_reg *temp_registers = v3d_register_allocate(c); + struct qblock *end_block = list_last_entry(&c->blocks, + struct qblock, link); + + /* Reset the uniform count to how many will be actually loaded by the + * generated QPU code. + */ + c->num_uniforms = 0; + + vir_for_each_block(block, c) + v3d_generate_code_block(c, block, temp_registers); + + struct qinst *thrsw = vir_nop(); + list_addtail(&thrsw->link, &end_block->instructions); + thrsw->qpu.sig.thrsw = true; + + uint32_t cycles = v3d_qpu_schedule_instructions(c); + + c->qpu_insts = rzalloc_array(c, uint64_t, c->qpu_inst_count); + int i = 0; + vir_for_each_inst_inorder(inst, c) { + bool ok = v3d_qpu_instr_pack(c->devinfo, &inst->qpu, + &c->qpu_insts[i++]); + assert(ok); (void) ok; + } + assert(i == c->qpu_inst_count); + + if (V3D_DEBUG & V3D_DEBUG_SHADERDB) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, + cycles); + } + + if (V3D_DEBUG & (V3D_DEBUG_QPU | + v3d_debug_flag_for_shader_stage(c->s->stage))) { + v3d_dump_qpu(c); + } + + qpu_validate(c); + + free(temp_registers); +}
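
For readers following the register allocation code added earlier in this patch, the short standalone sketch below (not part of the patch itself) illustrates the live-range overlap test that v3d_register_allocate() uses when adding interference edges: two temporaries must land in different registers exactly when neither one's live range ends before the other's begins. The struct and function names (live_range, ranges_overlap) are invented for the example.

/* Standalone illustration of the overlap test behind
 * ra_add_node_interference() calls in v3d_register_allocate();
 * hypothetical names, not code from this patch.
 */
#include <stdbool.h>
#include <stdio.h>

struct live_range {
        int start;      /* ip of the first write of the temp */
        int end;        /* ip just past its last read */
};

/* Mirrors !(temp_start[i] >= temp_end[j] || temp_start[j] >= temp_end[i]):
 * the ranges overlap unless one ends at or before the other starts.
 */
static bool
ranges_overlap(struct live_range a, struct live_range b)
{
        return !(a.start >= b.end || b.start >= a.end);
}

int
main(void)
{
        struct live_range t0 = { 0, 4 };  /* written at ip 0, dead after ip 3 */
        struct live_range t1 = { 2, 6 };  /* overlaps t0, so they interfere */
        struct live_range t2 = { 4, 8 };  /* starts where t0 ends, so no interference */

        printf("t0-t1 interfere: %d\n", ranges_overlap(t0, t1)); /* prints 1 */
        printf("t0-t2 interfere: %d\n", ranges_overlap(t0, t2)); /* prints 0 */
        return 0;
}
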