-I$(top_srcdir)/src \
-I$(top_srcdir)/src/broadcom/ \
-I$(top_srcdir)/src/broadcom/include \
+ -I$(top_srcdir)/src/gallium/auxiliary \
+ -I$(top_srcdir)/src/gallium/include \
$(VALGRIND_CFLAGS) \
$(DEFINES)
clif/clif_dump.c \
clif/clif_dump.h \
common/v3d_device_info.h \
+ compiler/nir_to_vir.c \
+ compiler/vir.c \
+ compiler/vir_dump.c \
+ compiler/vir_live_variables.c \
+ compiler/vir_lower_uniforms.c \
+ compiler/vir_opt_copy_propagate.c \
+ compiler/vir_opt_dead_code.c \
+ compiler/vir_register_allocate.c \
+ compiler/vir_to_qpu.c \
+ compiler/qpu_schedule.c \
+ compiler/qpu_validate.c \
+ compiler/v3d_compiler.h \
+ compiler/v3d_nir_lower_io.c \
qpu/qpu_disasm.c \
qpu/qpu_disasm.h \
qpu/qpu_instr.c \
LDADD = \
libbroadcom.la \
+ $(top_builddir)/src/compiler/nir/libnir.la \
$(top_builddir)/src/util/libmesautil.la \
$(NULL)
--- /dev/null
+/*
+ * Copyright © 2016 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <inttypes.h>
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/ralloc.h"
+#include "util/hash_table.h"
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "v3d_compiler.h"
+
+/* We don't do any address packing. */
+#define __gen_user_data void
+#define __gen_address_type uint32_t
+#define __gen_address_offset(reloc) (*reloc)
+#define __gen_emit_reloc(cl, reloc)
+#include "cle/v3d_packet_v33_pack.h"
+
+static struct qreg
+ntq_get_src(struct v3d_compile *c, nir_src src, int i);
+static void
+ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
+
+/* Grows *regs (and its tracked *size) to hold at least decl_size
+ * entries.  The array doubles in size to amortize reallocations, and
+ * any newly exposed slots are initialized to c->undef.
+ */
+static void
+resize_qreg_array(struct v3d_compile *c,
+ struct qreg **regs,
+ uint32_t *size,
+ uint32_t decl_size)
+{
+ if (*size >= decl_size)
+ return;
+
+ uint32_t old_size = *size;
+ *size = MAX2(*size * 2, decl_size);
+ *regs = reralloc(c, *regs, struct qreg, *size);
+ if (!*regs) {
+ fprintf(stderr, "Malloc failure\n");
+ abort();
+ }
+
+ for (uint32_t i = old_size; i < *size; i++)
+ (*regs)[i] = c->undef;
+}
+
+/* Emits a special-function-unit operation: src is written to the given
+ * SFU magic write address, and the result is then moved out of r4,
+ * which is where the SFU delivers it.
+ */
+static struct qreg
+vir_SFU(struct v3d_compile *c, int waddr, struct qreg src)
+{
+ vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, waddr), src);
+ return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
+}
+
+/* Emits an ldtmu signal on a NOP and moves the returned TMU data out
+ * of r4, where the ldtmu result lands.
+ */
+static struct qreg
+vir_LDTMU(struct v3d_compile *c)
+{
+ vir_NOP(c)->qpu.sig.ldtmu = true;
+ return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
+}
+
+/* Loads an indirectly-addressed uniform through the TMU.
+ *
+ * The intrinsic's base offset selects which declared UBO range the
+ * access falls in; the range is lazily assigned a destination offset
+ * the first time it's used.  The final address is the UBO base uniform
+ * plus the (rebased) indirect offset, written to TMUA, with the result
+ * read back via ldtmu.
+ */
+static struct qreg
+indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
+{
+ struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
+ uint32_t offset = nir_intrinsic_base(intr);
+ struct v3d_ubo_range *range = NULL;
+ unsigned i;
+
+ for (i = 0; i < c->num_ubo_ranges; i++) {
+ range = &c->ubo_ranges[i];
+ if (offset >= range->src_offset &&
+ offset < range->src_offset + range->size) {
+ break;
+ }
+ }
+ /* The driver-location-based offset always has to be within a declared
+ * uniform range.
+ */
+ assert(i != c->num_ubo_ranges);
+ if (!c->ubo_range_used[i]) {
+ c->ubo_range_used[i] = true;
+ range->dst_offset = c->next_ubo_dst_offset;
+ c->next_ubo_dst_offset += range->size;
+ }
+
+ offset -= range->src_offset;
+
+ /* Skip the ADD when the total static offset is zero. */
+ if (range->dst_offset + offset != 0) {
+ indirect_offset = vir_ADD(c, indirect_offset,
+ vir_uniform_ui(c, range->dst_offset +
+ offset));
+ }
+
+ /* Adjust for where we stored the TGSI register base. */
+ vir_ADD_dest(c,
+ vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
+ vir_uniform(c, QUNIFORM_UBO_ADDR, 0),
+ indirect_offset);
+
+ return vir_LDTMU(c);
+}
+
+/* Allocates and registers the qreg array backing a NIR SSA def.  The
+ * array is ralloc'd off the def hash table so it shares that lifetime.
+ */
+static struct qreg *
+ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
+{
+ struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
+ def->num_components);
+ _mesa_hash_table_insert(c->def_ht, def, qregs);
+ return qregs;
+}
+
+/**
+ * This function is responsible for getting VIR results into the associated
+ * storage for a NIR instruction.
+ *
+ * If it's a NIR SSA def, then we just set the associated hash table entry to
+ * the new result.
+ *
+ * If it's a NIR reg, then we need to update the existing qreg assigned to the
+ * NIR destination with the incoming value. To do that without introducing
+ * new MOVs, we require that the incoming qreg either be a uniform, or be
+ * SSA-defined by the previous VIR instruction in the block and rewritable by
+ * this function. That lets us sneak ahead and insert the SF flag beforehand
+ * (knowing that the previous instruction doesn't depend on flags) and rewrite
+ * its destination to be the NIR reg's destination
+ */
+static void
+ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
+ struct qreg result)
+{
+ struct qinst *last_inst = NULL;
+ if (!list_empty(&c->cur_block->instructions))
+ last_inst = (struct qinst *)c->cur_block->instructions.prev;
+
+ /* The incoming result must be rewritable: a uniform read (which we
+ * can MOV into place below) or the SSA temp defined by the
+ * immediately preceding instruction.
+ */
+ assert(result.file == QFILE_UNIF ||
+ (result.file == QFILE_TEMP &&
+ last_inst && last_inst == c->defs[result.index]));
+
+ if (dest->is_ssa) {
+ assert(chan < dest->ssa.num_components);
+
+ struct qreg *qregs;
+ struct hash_entry *entry =
+ _mesa_hash_table_search(c->def_ht, &dest->ssa);
+
+ if (entry)
+ qregs = entry->data;
+ else
+ qregs = ntq_init_ssa_def(c, &dest->ssa);
+
+ qregs[chan] = result;
+ } else {
+ nir_register *reg = dest->reg.reg;
+ assert(dest->reg.base_offset == 0);
+ assert(reg->num_array_elems == 0);
+ struct hash_entry *entry =
+ _mesa_hash_table_search(c->def_ht, reg);
+ struct qreg *qregs = entry->data;
+
+ /* Insert a MOV if the source wasn't an SSA def in the
+ * previous instruction.
+ */
+ if (result.file == QFILE_UNIF) {
+ result = vir_MOV(c, result);
+ last_inst = c->defs[result.index];
+ }
+
+ /* We know they're both temps, so just rewrite index. */
+ c->defs[last_inst->dst.index] = NULL;
+ last_inst->dst.index = qregs[chan].index;
+
+ /* If we're in control flow, then make this update of the reg
+ * conditional on the execution mask.
+ */
+ if (c->execute.file != QFILE_NULL) {
+ /* Set the flags to the current exec mask. To insert
+ * the flags push, we temporarily remove our SSA
+ * instruction.
+ */
+ list_del(&last_inst->link);
+ vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ list_addtail(&last_inst->link,
+ &c->cur_block->instructions);
+
+ vir_set_cond(last_inst, V3D_QPU_COND_IFA);
+ last_inst->cond_is_exec_mask = true;
+ }
+ }
+}
+
+/* Returns the qreg holding channel i of a NIR source, looked up in the
+ * def hash table keyed by either the SSA def or the nir_register.
+ */
+static struct qreg
+ntq_get_src(struct v3d_compile *c, nir_src src, int i)
+{
+ struct hash_entry *entry;
+ if (src.is_ssa) {
+ entry = _mesa_hash_table_search(c->def_ht, src.ssa);
+ assert(i < src.ssa->num_components);
+ } else {
+ nir_register *reg = src.reg.reg;
+ entry = _mesa_hash_table_search(c->def_ht, reg);
+ assert(reg->num_array_elems == 0);
+ assert(src.reg.base_offset == 0);
+ assert(i < reg->num_components);
+ }
+
+ struct qreg *qregs = entry->data;
+ return qregs[i];
+}
+
+/* Fetches the VIR register for ALU operand "src", applying the swizzle
+ * selected for the (single) written channel.  abs/negate source
+ * modifiers must already have been lowered away.
+ */
+static struct qreg
+ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
+ unsigned src)
+{
+ assert(util_is_power_of_two(instr->dest.write_mask));
+ unsigned chan = ffs(instr->dest.write_mask) - 1;
+ struct qreg r = ntq_get_src(c, instr->src[src].src,
+ instr->src[src].swizzle[chan]);
+
+ assert(!instr->src[src].abs);
+ assert(!instr->src[src].negate);
+
+ return r;
+}
+
+/* Saturates "val" to the [0.0, 1.0] range (FMIN against 1.0, then
+ * FMAX against 0.0, in that order).
+ */
+static inline struct qreg
+vir_SAT(struct v3d_compile *c, struct qreg val)
+{
+ struct qreg below_one = vir_FMIN(c, val, vir_uniform_f(c, 1.0));
+
+ return vir_FMAX(c, below_one, vir_uniform_f(c, 0.0));
+}
+
+/* Emits an unsigned multiply as a MULTOP/UMUL24 pair.  NOTE(review):
+ * presumably MULTOP primes internal state that UMUL24 combines with to
+ * produce the full 32-bit product — confirm against the QPU docs.
+ */
+static struct qreg
+ntq_umul(struct v3d_compile *c, struct qreg src0, struct qreg src1)
+{
+ vir_MULTOP(c, src0, src1);
+ return vir_UMUL24(c, src0, src1);
+}
+
+/* Returns max(size >> level, 1): the dimension of the given mip level,
+ * never minified below one texel.
+ */
+static struct qreg
+ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
+{
+ struct qreg shifted = vir_SHR(c, size, level);
+
+ return vir_MAX(c, shifted, vir_uniform_ui(c, 1));
+}
+
+/* Implements nir_texop_txs (texture size query).  The sizes come in as
+ * uniforms per component, minified by the LOD operand where the
+ * sampler dimensionality calls for it; array size is never minified.
+ */
+static void
+ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
+{
+ unsigned unit = instr->texture_index;
+ int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
+ int dest_size = nir_tex_instr_dest_size(instr);
+
+ struct qreg lod = c->undef;
+ if (lod_index != -1)
+ lod = ntq_get_src(c, instr->src[lod_index].src, 0);
+
+ for (int i = 0; i < dest_size; i++) {
+ assert(i < 3);
+ enum quniform_contents contents;
+
+ /* The last component of an array query is the layer count. */
+ if (instr->is_array && i == dest_size - 1)
+ contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
+ else
+ contents = QUNIFORM_TEXTURE_WIDTH + i;
+
+ struct qreg size = vir_uniform(c, contents, unit);
+
+ switch (instr->sampler_dim) {
+ case GLSL_SAMPLER_DIM_1D:
+ case GLSL_SAMPLER_DIM_2D:
+ case GLSL_SAMPLER_DIM_3D:
+ case GLSL_SAMPLER_DIM_CUBE:
+ /* Don't minify the array size. */
+ if (!(instr->is_array && i == dest_size - 1)) {
+ size = ntq_minify(c, size, lod);
+ }
+ break;
+
+ case GLSL_SAMPLER_DIM_RECT:
+ /* There's no LOD field for rects */
+ break;
+
+ default:
+ unreachable("Bad sampler type");
+ }
+
+ ntq_store_dest(c, &instr->dest, i, size);
+ }
+}
+
+/* Emits a NIR texture instruction as TMU writes plus ldtmu reads.
+ *
+ * Size-style queries (query_levels, txs) are answered purely from
+ * uniforms.  For real sampling ops, the P0 configuration word is built
+ * from the instruction, the coordinates (plus bias/lod/comparator) are
+ * written to the TMU registers, and the return channels are read back,
+ * unpacking half-float pairs when the return size is 16 bits.
+ */
+static void
+ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
+{
+ unsigned unit = instr->texture_index;
+
+ /* Since each texture sampling op requires uploading uniforms to
+ * reference the texture, there's no HW support for texture size and
+ * you just upload uniforms containing the size.
+ */
+ switch (instr->op) {
+ case nir_texop_query_levels:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
+ return;
+ case nir_texop_txs:
+ ntq_emit_txs(c, instr);
+ return;
+ default:
+ break;
+ }
+
+ struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = {
+ V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header,
+
+ .fetch_sample_mode = instr->op == nir_texop_txf,
+ };
+
+ switch (instr->sampler_dim) {
+ case GLSL_SAMPLER_DIM_1D:
+ if (instr->is_array)
+ p0_unpacked.lookup_type = TEXTURE_1D_ARRAY;
+ else
+ p0_unpacked.lookup_type = TEXTURE_1D;
+ break;
+ case GLSL_SAMPLER_DIM_2D:
+ case GLSL_SAMPLER_DIM_RECT:
+ if (instr->is_array)
+ p0_unpacked.lookup_type = TEXTURE_2D_ARRAY;
+ else
+ p0_unpacked.lookup_type = TEXTURE_2D;
+ break;
+ case GLSL_SAMPLER_DIM_3D:
+ p0_unpacked.lookup_type = TEXTURE_3D;
+ break;
+ case GLSL_SAMPLER_DIM_CUBE:
+ p0_unpacked.lookup_type = TEXTURE_CUBE_MAP;
+ break;
+ default:
+ unreachable("Bad sampler type");
+ }
+
+ /* Gather the values to feed the TMU, in TMU register order. */
+ struct qreg coords[5];
+ int next_coord = 0;
+ for (unsigned i = 0; i < instr->num_srcs; i++) {
+ switch (instr->src[i].src_type) {
+ case nir_tex_src_coord:
+ for (int j = 0; j < instr->coord_components; j++) {
+ coords[next_coord++] =
+ ntq_get_src(c, instr->src[i].src, j);
+ }
+ /* 1D lookups still need a t coordinate written. */
+ if (instr->coord_components < 2)
+ coords[next_coord++] = vir_uniform_f(c, 0.5);
+ break;
+ case nir_tex_src_bias:
+ coords[next_coord++] =
+ ntq_get_src(c, instr->src[i].src, 0);
+
+ p0_unpacked.bias_supplied = true;
+ break;
+ case nir_tex_src_lod:
+ /* XXX: Needs base level addition */
+ coords[next_coord++] =
+ ntq_get_src(c, instr->src[i].src, 0);
+
+ if (instr->op != nir_texop_txf &&
+ instr->op != nir_texop_tg4) {
+ p0_unpacked.disable_autolod_use_bias_only = true;
+ }
+ break;
+ case nir_tex_src_comparator:
+ coords[next_coord++] =
+ ntq_get_src(c, instr->src[i].src, 0);
+
+ p0_unpacked.shadow = true;
+ break;
+
+ case nir_tex_src_offset: {
+ /* Texel offsets must be constants; they're baked into
+ * the P0 word rather than passed as coordinates.
+ */
+ nir_const_value *offset =
+ nir_src_as_const_value(instr->src[i].src);
+ p0_unpacked.texel_offset_for_s_coordinate =
+ offset->i32[0];
+
+ if (instr->coord_components >= 2)
+ p0_unpacked.texel_offset_for_t_coordinate =
+ offset->i32[1];
+
+ if (instr->coord_components >= 3)
+ p0_unpacked.texel_offset_for_r_coordinate =
+ offset->i32[2];
+ break;
+ }
+
+ default:
+ unreachable("unknown texture source");
+ }
+ }
+
+ uint32_t p0_packed;
+ V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL,
+ (uint8_t *)&p0_packed,
+ &p0_unpacked);
+
+ /* There is no native support for GL texture rectangle coordinates, so
+ * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0,
+ * 1]).
+ */
+ if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
+ coords[0] = vir_FMUL(c, coords[0],
+ vir_uniform(c, QUNIFORM_TEXRECT_SCALE_X,
+ unit));
+ coords[1] = vir_FMUL(c, coords[1],
+ vir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y,
+ unit));
+ }
+
+ struct qreg texture_u[] = {
+ vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed),
+ vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit),
+ };
+ uint32_t next_texture_u = 0;
+
+ /* Write the coordinates; the last write goes to TMUL to trigger the
+ * lookup.  The first two writes carry the P0/P1 config words as
+ * implicit uniforms.
+ */
+ for (int i = 0; i < next_coord; i++) {
+ struct qreg dst;
+
+ if (i == next_coord - 1)
+ dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUL);
+ else
+ dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMU);
+
+ struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]);
+
+ if (i < 2) {
+ tmu->has_implicit_uniform = true;
+ tmu->src[vir_get_implicit_uniform_src(tmu)] =
+ texture_u[next_texture_u++];
+ }
+ }
+
+ /* Shadow comparisons always return 16-bit results. */
+ bool return_16 = (c->key->tex[unit].return_size == 16 ||
+ p0_unpacked.shadow);
+
+ struct qreg return_values[4];
+ for (int i = 0; i < c->key->tex[unit].return_channels; i++)
+ return_values[i] = vir_LDTMU(c);
+ /* Swizzling .zw of an RG texture should give undefined results, not
+ * crash the compiler.
+ */
+ for (int i = c->key->tex[unit].return_channels; i < 4; i++)
+ return_values[i] = c->undef;
+
+ for (int i = 0; i < nir_tex_instr_dest_size(instr); i++) {
+ struct qreg chan;
+
+ if (return_16) {
+ STATIC_ASSERT(PIPE_SWIZZLE_X == 0);
+ /* Two half-float channels per returned 32-bit word. */
+ chan = return_values[i / 2];
+
+ enum v3d_qpu_input_unpack unpack;
+ if (i & 1)
+ unpack = V3D_QPU_UNPACK_H;
+ else
+ unpack = V3D_QPU_UNPACK_L;
+
+ chan = vir_FMOV(c, chan);
+ vir_set_unpack(c->defs[chan.index], 0, unpack);
+ } else {
+ chan = vir_MOV(c, return_values[i]);
+ }
+ ntq_store_dest(c, &instr->dest, i, chan);
+ }
+}
+
+/* Computes sin(x) (or cos(x) via the phase-shift identity
+ * cos(x) = sin(x + pi/2)) using the SFU SIN unit, which only handles a
+ * limited range: the input is scaled by 1/pi, folded into [-0.5, 0.5]
+ * periods, and the result's sign is corrected for odd periods.
+ * NOTE(review): the XOR with (periods << -1) presumably flips the sign
+ * bit for odd periods, assuming the QPU masks shift counts so -1 acts
+ * as a shift by 31 — confirm against the QPU shift semantics.
+ */
+static struct qreg
+ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
+{
+ struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
+ if (is_cos)
+ input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
+
+ struct qreg periods = vir_FROUND(c, input);
+ struct qreg sin_output = vir_SFU(c, V3D_QPU_WADDR_SIN,
+ vir_FSUB(c, input, periods));
+ return vir_XOR(c, sin_output, vir_SHL(c,
+ vir_FTOIN(c, periods),
+ vir_uniform_ui(c, -1)));
+}
+
+/* Computes float sign(src): starts at 0.0, conditionally overwrites
+ * with 1.0 when src is nonzero and then -1.0 when src is negative,
+ * using flag pushes instead of branches.
+ */
+static struct qreg
+ntq_fsign(struct v3d_compile *c, struct qreg src)
+{
+ struct qreg t = vir_get_temp(c);
+
+ vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
+ vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
+ vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
+ return vir_MOV(c, t);
+}
+
+/* Integer counterpart of ntq_fsign(): produces 0, 1, or -1 via the
+ * same flag-push-and-conditional-MOV pattern.
+ */
+static struct qreg
+ntq_isign(struct v3d_compile *c, struct qreg src)
+{
+ struct qreg t = vir_get_temp(c);
+
+ vir_MOV_dest(c, t, vir_uniform_ui(c, 0));
+ vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1));
+ vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1));
+ return vir_MOV(c, t);
+}
+
+/* Fills in the four components of gl_FragCoord at input slot "attr":
+ * pixel X/Y from the XCD/YCD instructions, Z from the payload, and
+ * W as the reciprocal of the payload W.
+ */
+static void
+emit_fragcoord_input(struct v3d_compile *c, int attr)
+{
+ c->inputs[attr * 4 + 0] = vir_FXCD(c);
+ c->inputs[attr * 4 + 1] = vir_FYCD(c);
+ c->inputs[attr * 4 + 2] = c->payload_z;
+ c->inputs[attr * 4 + 3] = vir_SFU(c, V3D_QPU_WADDR_RECIP,
+ c->payload_w);
+}
+
+/* Emits the interpolation code for one component of a fragment shader
+ * input varying, honoring the variable's interpolation qualifier.
+ * NOTE(review): the varying read pairs with r5, which appears to hold
+ * the ldvary C coefficient — confirm against the QPU varying docs.
+ */
+static struct qreg
+emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
+ uint8_t swizzle)
+{
+ struct qreg vary = vir_reg(QFILE_VARY, ~0);
+ struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+
+ /* For gl_PointCoord input or distance along a line, we'll be called
+ * with no nir_variable, and we don't count toward VPM size so we
+ * don't track an input slot.
+ */
+ if (!var) {
+ return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
+ }
+
+ int i = c->num_inputs++;
+ c->input_slots[i] = v3d_slot_from_slot_and_component(var->data.location,
+ swizzle);
+
+ switch (var->data.interpolation) {
+ case INTERP_MODE_NONE:
+ case INTERP_MODE_SMOOTH:
+ if (var->data.centroid) {
+ return vir_FADD(c, vir_FMUL(c, vary,
+ c->payload_w_centroid), r5);
+ } else {
+ return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
+ }
+ case INTERP_MODE_NOPERSPECTIVE:
+ /* C appears after the mov from the varying.
+ XXX: improve ldvary setup.
+ */
+ return vir_FADD(c, vir_MOV(c, vary), r5);
+ case INTERP_MODE_FLAT:
+ BITSET_SET(c->flat_shade_flags, i);
+ /* Consume the varying read, then take the flat value from r5. */
+ vir_MOV_dest(c, c->undef, vary);
+ return vir_MOV(c, r5);
+ default:
+ unreachable("Bad interp mode");
+ }
+}
+
+/* Emits one interpolated varying per vector element of the fragment
+ * input variable, storing them at input slot "attr".
+ */
+static void
+emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var)
+{
+ int elems = glsl_get_vector_elements(var->type);
+
+ for (int i = 0; i < elems; i++)
+ c->inputs[attr * 4 + i] = emit_fragment_varying(c, var, i);
+}
+
+/* Records a shader output: grows the outputs qreg array (and its
+ * parallel slot array) to cover decl_offset, and stores the packed
+ * slot/component for that output.
+ */
+static void
+add_output(struct v3d_compile *c,
+ uint32_t decl_offset,
+ uint8_t slot,
+ uint8_t swizzle)
+{
+ uint32_t old_array_size = c->outputs_array_size;
+ resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
+ decl_offset + 1);
+
+ /* Keep output_slots sized in lockstep with the outputs array. */
+ if (old_array_size != c->outputs_array_size) {
+ c->output_slots = reralloc(c,
+ c->output_slots,
+ struct v3d_varying_slot,
+ c->outputs_array_size);
+ }
+
+ c->output_slots[decl_offset] =
+ v3d_slot_from_slot_and_component(slot, swizzle);
+}
+
+/* Registers a [start, start + size) range of the uniform storage as a
+ * candidate for indirect UBO access, growing the range arrays as
+ * needed.  dst_offset is assigned lazily on first use (see
+ * indirect_uniform_load()).
+ */
+static void
+declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size)
+{
+ unsigned array_id = c->num_ubo_ranges++;
+ if (array_id >= c->ubo_ranges_array_size) {
+ c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
+ array_id + 1);
+ c->ubo_ranges = reralloc(c, c->ubo_ranges,
+ struct v3d_ubo_range,
+ c->ubo_ranges_array_size);
+ c->ubo_range_used = reralloc(c, c->ubo_range_used,
+ bool,
+ c->ubo_ranges_array_size);
+ }
+
+ c->ubo_ranges[array_id].dst_offset = 0;
+ c->ubo_ranges[array_id].src_offset = start;
+ c->ubo_ranges[array_id].size = size;
+ c->ubo_range_used[array_id] = false;
+}
+
+/**
+ * If compare_instr is a valid comparison instruction, emits the
+ * compare_instr's comparison and returns the sel_instr's return value based
+ * on the compare_instr's result.
+ *
+ * Returns false (emitting nothing) if compare_instr is not a
+ * comparison op this function knows how to lower.
+ */
+static bool
+ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest,
+ nir_alu_instr *compare_instr,
+ nir_alu_instr *sel_instr)
+{
+ struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
+ struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
+ bool cond_invert = false;
+
+ /* Push condition flags according to the comparison.  Note that the
+ * ge/lt cases swap or invert operands so that a single pushed flag
+ * (optionally inverted below) expresses the comparison.
+ */
+ switch (compare_instr->op) {
+ case nir_op_feq:
+ case nir_op_seq:
+ vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
+ break;
+ case nir_op_ieq:
+ vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
+ break;
+
+ case nir_op_fne:
+ case nir_op_sne:
+ vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
+ cond_invert = true;
+ break;
+ case nir_op_ine:
+ vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
+ cond_invert = true;
+ break;
+
+ case nir_op_fge:
+ case nir_op_sge:
+ vir_PF(c, vir_FCMP(c, src1, src0), V3D_QPU_PF_PUSHC);
+ break;
+ case nir_op_ige:
+ vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
+ cond_invert = true;
+ break;
+ case nir_op_uge:
+ vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
+ cond_invert = true;
+ break;
+
+ case nir_op_slt:
+ case nir_op_flt:
+ vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHN);
+ break;
+ case nir_op_ilt:
+ vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
+ break;
+ case nir_op_ult:
+ vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
+ break;
+
+ default:
+ return false;
+ }
+
+ enum v3d_qpu_cond cond = (cond_invert ?
+ V3D_QPU_COND_IFNA :
+ V3D_QPU_COND_IFA);
+
+ /* Select the result values the consuming instruction wants: 1.0/0.0
+ * for the SET ops, the bcsel operands, or ~0/0 booleans otherwise.
+ */
+ switch (sel_instr->op) {
+ case nir_op_seq:
+ case nir_op_sne:
+ case nir_op_sge:
+ case nir_op_slt:
+ *dest = vir_SEL(c, cond,
+ vir_uniform_f(c, 1.0), vir_uniform_f(c, 0.0));
+ break;
+
+ case nir_op_bcsel:
+ *dest = vir_SEL(c, cond,
+ ntq_get_alu_src(c, sel_instr, 1),
+ ntq_get_alu_src(c, sel_instr, 2));
+ break;
+
+ default:
+ *dest = vir_SEL(c, cond,
+ vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0));
+ break;
+ }
+
+ /* Make the temporary for nir_store_dest(). */
+ *dest = vir_MOV(c, *dest);
+
+ return true;
+}
+
+/**
+ * Attempts to fold a comparison generating a boolean result into the
+ * condition code for selecting between two values, instead of comparing the
+ * boolean result against 0 to generate the condition code.
+ */
+static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr,
+ struct qreg *src)
+{
+ if (!instr->src[0].src.is_ssa)
+ goto out;
+ if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
+ goto out;
+ nir_alu_instr *compare =
+ nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+ /* NOTE(review): nir_instr_as_alu is a cast of an already
+ * type-checked instruction, so this NULL check looks dead.
+ */
+ if (!compare)
+ goto out;
+
+ struct qreg dest;
+ if (ntq_emit_comparison(c, &dest, compare, instr))
+ return dest;
+
+out:
+ /* Fallback: test the boolean condition against zero directly. */
+ vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
+ return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2]));
+}
+
+
+/* Translates one NIR ALU instruction into VIR.  Vector constructors
+ * are handled specially; everything else is scalarized, dispatched
+ * through the big switch, and stored with ntq_store_dest().
+ */
+static void
+ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
+{
+ /* This should always be lowered to ALU operations for V3D. */
+ assert(!instr->dest.saturate);
+
+ /* Vectors are special in that they have non-scalarized writemasks,
+ * and just take the first swizzle channel for each argument in order
+ * into each writemask channel.
+ */
+ if (instr->op == nir_op_vec2 ||
+ instr->op == nir_op_vec3 ||
+ instr->op == nir_op_vec4) {
+ struct qreg srcs[4];
+ for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
+ srcs[i] = ntq_get_src(c, instr->src[i].src,
+ instr->src[i].swizzle[0]);
+ for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
+ ntq_store_dest(c, &instr->dest.dest, i,
+ vir_MOV(c, srcs[i]));
+ return;
+ }
+
+ /* General case: We can just grab the one used channel per src. */
+ struct qreg src[nir_op_infos[instr->op].num_inputs];
+ for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+ src[i] = ntq_get_alu_src(c, instr, i);
+ }
+
+ struct qreg result;
+
+ switch (instr->op) {
+ case nir_op_fmov:
+ case nir_op_imov:
+ result = vir_MOV(c, src[0]);
+ break;
+ case nir_op_fmul:
+ result = vir_FMUL(c, src[0], src[1]);
+ break;
+ case nir_op_fadd:
+ result = vir_FADD(c, src[0], src[1]);
+ break;
+ case nir_op_fsub:
+ result = vir_FSUB(c, src[0], src[1]);
+ break;
+ case nir_op_fmin:
+ result = vir_FMIN(c, src[0], src[1]);
+ break;
+ case nir_op_fmax:
+ result = vir_FMAX(c, src[0], src[1]);
+ break;
+
+ case nir_op_f2i32:
+ result = vir_FTOIZ(c, src[0]);
+ break;
+ case nir_op_f2u32:
+ result = vir_FTOUZ(c, src[0]);
+ break;
+ case nir_op_i2f32:
+ result = vir_ITOF(c, src[0]);
+ break;
+ case nir_op_u2f32:
+ result = vir_UTOF(c, src[0]);
+ break;
+ case nir_op_b2f:
+ /* Booleans are ~0/0, so masking with 1.0 yields 1.0/0.0. */
+ result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
+ break;
+ case nir_op_b2i:
+ result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
+ break;
+ case nir_op_i2b:
+ case nir_op_f2b:
+ vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
+ result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
+ vir_uniform_ui(c, ~0),
+ vir_uniform_ui(c, 0)));
+ break;
+
+ case nir_op_iadd:
+ result = vir_ADD(c, src[0], src[1]);
+ break;
+ case nir_op_ushr:
+ result = vir_SHR(c, src[0], src[1]);
+ break;
+ case nir_op_isub:
+ result = vir_SUB(c, src[0], src[1]);
+ break;
+ case nir_op_ishr:
+ result = vir_ASR(c, src[0], src[1]);
+ break;
+ case nir_op_ishl:
+ result = vir_SHL(c, src[0], src[1]);
+ break;
+ case nir_op_imin:
+ result = vir_MIN(c, src[0], src[1]);
+ break;
+ case nir_op_umin:
+ result = vir_UMIN(c, src[0], src[1]);
+ break;
+ case nir_op_imax:
+ result = vir_MAX(c, src[0], src[1]);
+ break;
+ case nir_op_umax:
+ result = vir_UMAX(c, src[0], src[1]);
+ break;
+ case nir_op_iand:
+ result = vir_AND(c, src[0], src[1]);
+ break;
+ case nir_op_ior:
+ result = vir_OR(c, src[0], src[1]);
+ break;
+ case nir_op_ixor:
+ result = vir_XOR(c, src[0], src[1]);
+ break;
+ case nir_op_inot:
+ result = vir_NOT(c, src[0]);
+ break;
+
+ case nir_op_imul:
+ result = ntq_umul(c, src[0], src[1]);
+ break;
+
+ case nir_op_seq:
+ case nir_op_sne:
+ case nir_op_sge:
+ case nir_op_slt:
+ case nir_op_feq:
+ case nir_op_fne:
+ case nir_op_fge:
+ case nir_op_flt:
+ case nir_op_ieq:
+ case nir_op_ine:
+ case nir_op_ige:
+ case nir_op_uge:
+ case nir_op_ilt:
+ case nir_op_ult:
+ if (!ntq_emit_comparison(c, &result, instr, instr)) {
+ fprintf(stderr, "Bad comparison instruction\n");
+ }
+ break;
+
+ case nir_op_bcsel:
+ result = ntq_emit_bcsel(c, instr, src);
+ break;
+ case nir_op_fcsel:
+ vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
+ result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
+ src[1], src[2]));
+ break;
+
+ case nir_op_frcp:
+ result = vir_SFU(c, V3D_QPU_WADDR_RECIP, src[0]);
+ break;
+ case nir_op_frsq:
+ result = vir_SFU(c, V3D_QPU_WADDR_RSQRT, src[0]);
+ break;
+ case nir_op_fexp2:
+ result = vir_SFU(c, V3D_QPU_WADDR_EXP, src[0]);
+ break;
+ case nir_op_flog2:
+ result = vir_SFU(c, V3D_QPU_WADDR_LOG, src[0]);
+ break;
+
+ case nir_op_fceil:
+ result = vir_FCEIL(c, src[0]);
+ break;
+ case nir_op_ffloor:
+ result = vir_FFLOOR(c, src[0]);
+ break;
+ case nir_op_fround_even:
+ result = vir_FROUND(c, src[0]);
+ break;
+ case nir_op_ftrunc:
+ result = vir_FTRUNC(c, src[0]);
+ break;
+ case nir_op_ffract:
+ result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0]));
+ break;
+
+ case nir_op_fsin:
+ result = ntq_fsincos(c, src[0], false);
+ break;
+ case nir_op_fcos:
+ result = ntq_fsincos(c, src[0], true);
+ break;
+
+ case nir_op_fsign:
+ result = ntq_fsign(c, src[0]);
+ break;
+ case nir_op_isign:
+ result = ntq_isign(c, src[0]);
+ break;
+
+ case nir_op_fabs: {
+ /* abs is a source unpack modifier on the MOV. */
+ result = vir_FMOV(c, src[0]);
+ vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
+ break;
+ }
+
+ case nir_op_iabs:
+ result = vir_MAX(c, src[0],
+ vir_SUB(c, vir_uniform_ui(c, 0), src[0]));
+ break;
+
+ case nir_op_fddx:
+ case nir_op_fddx_coarse:
+ case nir_op_fddx_fine:
+ result = vir_FDX(c, src[0]);
+ break;
+
+ case nir_op_fddy:
+ case nir_op_fddy_coarse:
+ case nir_op_fddy_fine:
+ result = vir_FDY(c, src[0]);
+ break;
+
+ default:
+ fprintf(stderr, "unknown NIR ALU inst: ");
+ nir_print_instr(&instr->instr, stderr);
+ fprintf(stderr, "\n");
+ abort();
+ }
+
+ /* We have a scalar result, so the instruction should only have a
+ * single channel written to.
+ */
+ assert(util_is_power_of_two(instr->dest.write_mask));
+ ntq_store_dest(c, &instr->dest.dest,
+ ffs(instr->dest.write_mask) - 1, result);
+}
+
+/* Emits the end-of-fragment-shader TLB writes: optional Z output,
+ * then the color outputs, all predicated on the discard condition when
+ * the shader uses discard.
+ */
+static void
+emit_frag_end(struct v3d_compile *c)
+{
+ uint32_t discard_cond = V3D_QPU_COND_NONE;
+ if (c->s->info.fs.uses_discard) {
+ vir_PF(c, vir_MOV(c, c->discard), V3D_QPU_PF_PUSHZ);
+ discard_cond = V3D_QPU_COND_IFA;
+ }
+
+ /* XXX
+ if (c->output_sample_mask_index != -1) {
+ vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
+ }
+ */
+
+ if (c->output_position_index != -1) {
+ struct qinst *inst = vir_MOV_dest(c,
+ vir_reg(QFILE_TLBU, 0),
+ c->outputs[c->output_position_index]);
+
+ /* TLB config word for the Z write, passed as the implicit
+ * uniform.  NOTE(review): bit meanings taken from the inline
+ * comments; confirm against the TLB spec.
+ */
+ inst->src[vir_get_implicit_uniform_src(inst)] =
+ vir_uniform_ui(c,
+ (1 << 2) | /* per pixel */
+ (2 << 6) /* type */ |
+ 0xffffff00);
+ }
+
+ /* XXX: Performance improvement: Merge Z write and color writes TLB
+ * uniform setup
+ */
+
+ if (c->output_color_var) {
+ nir_variable *var = c->output_color_var;
+ struct qreg *color = &c->outputs[var->data.driver_location * 4];
+ int num_components = glsl_get_vector_elements(var->type);
+ uint32_t conf = ~0;
+ struct qinst *inst;
+
+ assert(num_components != 0);
+ switch (glsl_get_base_type(var->type)) {
+ case GLSL_TYPE_UINT:
+ case GLSL_TYPE_INT:
+ /* Integer colors: one 32-bit TLB write per component,
+ * with the config on the first (TLBU) write.
+ */
+ conf = ((1 << 2) | /* per pixel */
+ ((7 - 0) << 3) | /* rt */
+ (1 << 6) /* type */ |
+ (num_components - 1) |
+ 0xffffff00);
+
+
+ inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
+ vir_set_cond(inst, discard_cond);
+ inst->src[vir_get_implicit_uniform_src(inst)] =
+ vir_uniform_ui(c, conf);
+
+ for (int i = 1; i < num_components; i++) {
+ inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
+ color[i]);
+ vir_set_cond(inst, discard_cond);
+ }
+ break;
+
+ default: {
+ /* Float colors: pack rg and ba as half-float pairs. */
+ struct qreg r = color[0];
+ struct qreg g = color[1];
+ struct qreg b = color[2];
+ struct qreg a = color[3];
+
+ if (c->fs_key->swap_color_rb) {
+ r = color[2];
+ b = color[0];
+ }
+
+ inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g);
+ vir_set_cond(inst, discard_cond);
+ inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a);
+ vir_set_cond(inst, discard_cond);
+ break;
+ }
+ }
+ }
+}
+
+/* Writes clip-space X/Y scaled by the viewport transform and the
+ * reciprocal of W to the VPM, converted to integer coordinates.
+ */
+static void
+emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w)
+{
+ for (int i = 0; i < 2; i++) {
+ struct qreg coord = c->outputs[c->output_position_index + i];
+ coord = vir_FMUL(c, coord,
+ vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i,
+ 0));
+ coord = vir_FMUL(c, coord, rcp_w);
+ vir_FTOIN_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM),
+ coord);
+ }
+
+}
+
+/* Writes the Z coordinate to the VPM: position Z times the viewport Z
+ * scale, perspective-divided by W, plus the viewport Z offset.
+ */
+static void
+emit_zs_write(struct v3d_compile *c, struct qreg rcp_w)
+{
+ struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
+ struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
+
+ vir_FADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM),
+ vir_FMUL(c, vir_FMUL(c,
+ c->outputs[c->output_position_index + 2],
+ zscale),
+ rcp_w),
+ zoffset);
+}
+
+/* Writes 1/Wc to the VPM (thin wrapper kept for symmetry with the
+ * other per-vertex VPM write helpers).
+ */
+static void
+emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w)
+{
+ vir_VPM_WRITE(c, rcp_w);
+}
+
+/* Writes the point size to the VPM, defaulting to 1.0 when the shader
+ * doesn't write gl_PointSize, and clamping to a hardware-safe minimum.
+ */
+static void
+emit_point_size_write(struct v3d_compile *c)
+{
+ struct qreg point_size;
+
+ if (c->output_point_size_index != -1)
+ point_size = c->outputs[c->output_point_size_index];
+ else
+ point_size = vir_uniform_f(c, 1.0);
+
+ /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
+ * BCM21553).
+ */
+ point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125));
+
+ vir_VPM_WRITE(c, point_size);
+}
+
+/* Emits the VPMSETUP configuring sequential 32-bit horizontal writes
+ * starting at VPM address 0, ahead of the vertex output writes.
+ */
+static void
+emit_vpm_write_setup(struct v3d_compile *c)
+{
+ uint32_t packed;
+ struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked = {
+ V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header,
+
+ .horiz = true,
+ .laned = false,
+ .segs = true,
+ .stride = 1,
+ .size = VPM_SETUP_SIZE_32_BIT,
+ .addr = 0,
+ };
+
+ V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL,
+ (uint8_t *)&packed,
+ &unpacked);
+ vir_VPMSETUP(c, vir_uniform_ui(c, packed));
+}
+
+/* Emits the end-of-vertex-shader VPM stores: the fixed-function
+ * position/Z/W (layout differing between coordinate and full VS
+ * variants), followed by one slot per FS input, padded with 0.0 for FS
+ * inputs the VS doesn't write.
+ */
+static void
+emit_vert_end(struct v3d_compile *c)
+{
+ struct qreg rcp_w = vir_SFU(c, V3D_QPU_WADDR_RECIP,
+ c->outputs[c->output_position_index + 3]);
+
+ emit_vpm_write_setup(c);
+
+ if (c->vs_key->is_coord) {
+ for (int i = 0; i < 4; i++)
+ vir_VPM_WRITE(c, c->outputs[c->output_position_index + i]);
+ emit_scaled_viewport_write(c, rcp_w);
+ if (c->vs_key->per_vertex_point_size) {
+ emit_point_size_write(c);
+ /* emit_rcp_wc_write(c, rcp_w); */
+ }
+ /* XXX: Z-only rendering */
+ if (0)
+ emit_zs_write(c, rcp_w);
+ } else {
+ emit_scaled_viewport_write(c, rcp_w);
+ emit_zs_write(c, rcp_w);
+ emit_rcp_wc_write(c, rcp_w);
+ if (c->vs_key->per_vertex_point_size)
+ emit_point_size_write(c);
+ }
+
+ /* Emit the varyings in the order the FS consumes them, matching by
+ * packed slot/component.
+ */
+ for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
+ struct v3d_varying_slot input = c->vs_key->fs_inputs[i];
+ int j;
+
+ for (j = 0; j < c->num_outputs; j++) {
+ struct v3d_varying_slot output = c->output_slots[j];
+
+ if (!memcmp(&input, &output, sizeof(input))) {
+ vir_VPM_WRITE(c, c->outputs[j]);
+ break;
+ }
+ }
+ /* Emit padding if we didn't find a declared VS output for
+ * this FS input.
+ */
+ if (j == c->num_outputs)
+ vir_VPM_WRITE(c, vir_uniform_f(c, 0.0));
+ }
+}
+
+/* Runs the standard NIR optimization loop for V3D until no pass makes
+ * further progress.
+ */
+void
+v3d_optimize_nir(struct nir_shader *s)
+{
+ bool progress;
+
+ do {
+ progress = false;
+
+ NIR_PASS_V(s, nir_lower_vars_to_ssa);
+ NIR_PASS(progress, s, nir_lower_alu_to_scalar);
+ NIR_PASS(progress, s, nir_lower_phis_to_scalar);
+ NIR_PASS(progress, s, nir_copy_prop);
+ NIR_PASS(progress, s, nir_opt_remove_phis);
+ NIR_PASS(progress, s, nir_opt_dce);
+ NIR_PASS(progress, s, nir_opt_dead_cf);
+ NIR_PASS(progress, s, nir_opt_cse);
+ NIR_PASS(progress, s, nir_opt_peephole_select, 8);
+ NIR_PASS(progress, s, nir_opt_algebraic);
+ NIR_PASS(progress, s, nir_opt_constant_folding);
+ NIR_PASS(progress, s, nir_opt_undef);
+ } while (progress);
+}
+
+/* qsort() comparator ordering nir_variable pointers by ascending
+ * data.driver_location.
+ */
+static int
+driver_location_compare(const void *in_a, const void *in_b)
+{
+ const nir_variable *const *var_a = in_a;
+ const nir_variable *const *var_b = in_b;
+ int loc_a = (*var_a)->data.driver_location;
+ int loc_b = (*var_b)->data.driver_location;
+
+ return loc_a - loc_b;
+}
+
+/* Reads one 32-bit component from the VPM, emitting a new block-read setup
+ * whenever the previously-queued batch is exhausted.
+ *
+ * num_components_queued tracks how many components the last setup still has
+ * pending; remaining is the total components left to read for this shader.
+ */
+static struct qreg
+ntq_emit_vpm_read(struct v3d_compile *c,
+ uint32_t *num_components_queued,
+ uint32_t *remaining,
+ uint32_t vpm_index)
+{
+ struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
+
+ /* Still inside the current batch: just consume one component. */
+ if (*num_components_queued != 0) {
+ (*num_components_queued)--;
+ c->num_inputs++;
+ return vir_MOV(c, vpm);
+ }
+
+ /* Queue up to 32 components per setup (hardware maximum). */
+ uint32_t num_components = MIN2(*remaining, 32);
+
+ struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked = {
+ V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header,
+
+ .horiz = true,
+ .laned = false,
+ /* If the field is 0, that means a read count of 32. */
+ .num = num_components & 31,
+ .segs = true,
+ .stride = 1,
+ .size = VPM_SETUP_SIZE_32_BIT,
+ .addr = c->num_inputs,
+ };
+
+ uint32_t packed;
+ V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL,
+ (uint8_t *)&packed,
+ &unpacked);
+ vir_VPMSETUP(c, vir_uniform_ui(c, packed));
+
+ /* This call consumes the first of the freshly-queued components. */
+ *num_components_queued = num_components - 1;
+ *remaining -= num_components;
+ c->num_inputs++;
+
+ return vir_MOV(c, vpm);
+}
+
+/* Sets up c->inputs for all shader input variables.
+ *
+ * For vertex shaders this emits the VPM reads (instance/vertex ID first,
+ * when used, then attributes in driver_location order); for fragment
+ * shaders it emits varying/fragcoord/point-coord input setup.
+ */
+static void
+ntq_setup_inputs(struct v3d_compile *c)
+{
+ unsigned num_entries = 0;
+ unsigned num_components = 0;
+ nir_foreach_variable(var, &c->s->inputs) {
+ num_entries++;
+ num_components += glsl_get_components(var->type);
+ }
+
+ /* NOTE: VLA sized by the input count; assumed small. */
+ nir_variable *vars[num_entries];
+
+ unsigned i = 0;
+ nir_foreach_variable(var, &c->s->inputs)
+ vars[i++] = var;
+
+ /* Sort the variables so that we emit the input setup in
+ * driver_location order. This is required for VPM reads, whose data
+ * is fetched into the VPM in driver_location (TGSI register index)
+ * order.
+ */
+ qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
+
+ uint32_t vpm_components_queued = 0;
+ if (c->s->stage == MESA_SHADER_VERTEX) {
+ bool uses_iid = c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_INSTANCE_ID);
+ bool uses_vid = c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_VERTEX_ID);
+
+ num_components += uses_iid;
+ num_components += uses_vid;
+
+ /* IID/VID are delivered ahead of the attributes in the VPM. */
+ if (uses_iid) {
+ c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
+ &num_components, ~0);
+ }
+
+ if (uses_vid) {
+ c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
+ &num_components, ~0);
+ }
+ }
+
+ for (unsigned i = 0; i < num_entries; i++) {
+ nir_variable *var = vars[i];
+ unsigned array_len = MAX2(glsl_get_length(var->type), 1);
+ unsigned loc = var->data.driver_location;
+
+ /* Array inputs are not supported yet. */
+ assert(array_len == 1);
+ (void)array_len;
+ resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
+ (loc + 1) * 4);
+
+ if (c->s->stage == MESA_SHADER_FRAGMENT) {
+ if (var->data.location == VARYING_SLOT_POS) {
+ emit_fragcoord_input(c, loc);
+ } else if (var->data.location == VARYING_SLOT_PNTC ||
+ (var->data.location >= VARYING_SLOT_VAR0 &&
+ (c->fs_key->point_sprite_mask &
+ (1 << (var->data.location -
+ VARYING_SLOT_VAR0))))) {
+ /* Point sprite: replace the varying with the
+ * point coordinate payload.
+ */
+ c->inputs[loc * 4 + 0] = c->point_x;
+ c->inputs[loc * 4 + 1] = c->point_y;
+ } else {
+ emit_fragment_input(c, loc, var);
+ }
+ } else {
+ int var_components = glsl_get_components(var->type);
+
+ for (int i = 0; i < var_components; i++) {
+ c->inputs[loc * 4 + i] =
+ ntq_emit_vpm_read(c,
+ &vpm_components_queued,
+ &num_components,
+ loc * 4 + i);
+
+ }
+ c->vattr_sizes[loc] = var_components;
+ }
+ }
+
+ /* All queued VPM components must have been consumed. */
+ if (c->s->stage == MESA_SHADER_VERTEX) {
+ assert(vpm_components_queued == 0);
+ assert(num_components == 0);
+ }
+}
+
+/* Declares c->outputs slots for each output variable (4 components per
+ * location) and records the indices of the special outputs (position/depth,
+ * point size, color, sample mask) for later stages to find.
+ */
+static void
+ntq_setup_outputs(struct v3d_compile *c)
+{
+ nir_foreach_variable(var, &c->s->outputs) {
+ unsigned array_len = MAX2(glsl_get_length(var->type), 1);
+ unsigned loc = var->data.driver_location * 4;
+
+ /* Array outputs are not supported yet. */
+ assert(array_len == 1);
+ (void)array_len;
+
+ for (int i = 0; i < 4; i++)
+ add_output(c, loc + i, var->data.location, i);
+
+ if (c->s->stage == MESA_SHADER_FRAGMENT) {
+ switch (var->data.location) {
+ case FRAG_RESULT_COLOR:
+ case FRAG_RESULT_DATA0:
+ c->output_color_var = var;
+ break;
+ case FRAG_RESULT_DEPTH:
+ /* Depth reuses the position-index slot in FS. */
+ c->output_position_index = loc;
+ break;
+ case FRAG_RESULT_SAMPLE_MASK:
+ c->output_sample_mask_index = loc;
+ break;
+ }
+ } else {
+ switch (var->data.location) {
+ case VARYING_SLOT_POS:
+ c->output_position_index = loc;
+ break;
+ case VARYING_SLOT_PSIZ:
+ c->output_point_size_index = loc;
+ break;
+ }
+ }
+ }
+}
+
+/* Declares the byte ranges of the default uniform buffer covered by each
+ * uniform variable, in units of vec4 slots.
+ */
+static void
+ntq_setup_uniforms(struct v3d_compile *c)
+{
+ nir_foreach_variable(var, &c->s->uniforms) {
+ uint32_t vec4_count = glsl_count_attribute_slots(var->type,
+ false);
+ unsigned vec4_size = 4 * sizeof(float);
+
+ declare_uniform_range(c, var->data.driver_location * vec4_size,
+ vec4_count * vec4_size);
+
+ }
+}
+
+/**
+ * Sets up the mapping from nir_register to struct qreg *.
+ *
+ * Each nir_register gets a struct qreg per 32-bit component being stored.
+ * The qreg array is keyed off the nir_register pointer in c->def_ht and is
+ * ralloc'd off that hash table so it is freed along with it.
+ */
+static void
+ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
+{
+ foreach_list_typed(nir_register, nir_reg, node, list) {
+ unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
+ struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
+ array_len *
+ nir_reg->num_components);
+
+ _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
+
+ /* Give every component a fresh temporary up front. */
+ for (int i = 0; i < array_len * nir_reg->num_components; i++)
+ qregs[i] = vir_get_temp(c);
+ }
+}
+
+/* Turns a NIR constant into per-component uniform loads. */
+static void
+ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
+{
+ struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
+ for (int i = 0; i < instr->def.num_components; i++)
+ qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
+
+ /* NOTE(review): this insert looks redundant if ntq_init_ssa_def()
+ * already registers the def in c->def_ht — confirm against that
+ * helper before removing.
+ */
+ _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
+}
+
+/* Gives an undefined SSA value a defined (zero) VIR value. */
+static void
+ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr)
+{
+ struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
+
+ /* VIR needs there to be *some* value, so pick 0 (same as for
+ * ntq_setup_registers()).
+ */
+ for (int i = 0; i < instr->def.num_components; i++)
+ qregs[i] = vir_uniform_ui(c, 0);
+}
+
+/**
+ * Emits VIR for one NIR intrinsic instruction.
+ *
+ * Unknown intrinsics are reported to stderr but do not abort, so new
+ * intrinsics show up as debug output rather than compiler crashes.
+ */
+static void
+ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ nir_const_value *const_offset;
+ unsigned offset;
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_uniform:
+ assert(instr->num_components == 1);
+ const_offset = nir_src_as_const_value(instr->src[0]);
+ if (const_offset) {
+ offset = nir_intrinsic_base(instr) + const_offset->u32[0];
+ assert(offset % 4 == 0);
+ /* We need dwords */
+ offset = offset / 4;
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_UNIFORM,
+ offset));
+ } else {
+ ntq_store_dest(c, &instr->dest, 0,
+ indirect_uniform_load(c, instr));
+ }
+ break;
+
+ case nir_intrinsic_load_ubo:
+ for (int i = 0; i < instr->num_components; i++) {
+ int ubo = nir_src_as_const_value(instr->src[0])->u32[0];
+
+ /* Adjust for where we stored the TGSI register base. */
+ vir_ADD_dest(c,
+ vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
+ vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo),
+ vir_ADD(c,
+ ntq_get_src(c, instr->src[1], 0),
+ vir_uniform_ui(c, i * 4)));
+
+ ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
+ }
+ break;
+
+ /* (Removed an unreachable duplicate of the load_uniform body that
+ * followed the break above without any case label.)
+ */
+
+ case nir_intrinsic_load_user_clip_plane:
+ for (int i = 0; i < instr->num_components; i++) {
+ ntq_store_dest(c, &instr->dest, i,
+ vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
+ nir_intrinsic_ucp_id(instr) *
+ 4 + i));
+ }
+ break;
+
+ case nir_intrinsic_load_alpha_ref_float:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_ALPHA_REF, 0));
+ break;
+
+ case nir_intrinsic_load_sample_mask_in:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
+ break;
+
+ case nir_intrinsic_load_front_face:
+ /* The register contains 0 (front) or 1 (back), and we need to
+ * turn it into a NIR bool where true means front.
+ */
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_ADD(c,
+ vir_uniform_ui(c, -1),
+ vir_REVF(c)));
+ break;
+
+ case nir_intrinsic_load_instance_id:
+ ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
+ break;
+
+ case nir_intrinsic_load_vertex_id:
+ ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
+ break;
+
+ case nir_intrinsic_load_input:
+ const_offset = nir_src_as_const_value(instr->src[0]);
+ assert(const_offset && "v3d doesn't support indirect inputs");
+ for (int i = 0; i < instr->num_components; i++) {
+ offset = nir_intrinsic_base(instr) + const_offset->u32[0];
+ int comp = nir_intrinsic_component(instr) + i;
+ ntq_store_dest(c, &instr->dest, i,
+ vir_MOV(c, c->inputs[offset * 4 + comp]));
+ }
+ break;
+
+ case nir_intrinsic_store_output:
+ const_offset = nir_src_as_const_value(instr->src[1]);
+ assert(const_offset && "v3d doesn't support indirect outputs");
+ offset = ((nir_intrinsic_base(instr) +
+ const_offset->u32[0]) * 4 +
+ nir_intrinsic_component(instr));
+
+ for (int i = 0; i < instr->num_components; i++) {
+ c->outputs[offset + i] =
+ vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+ }
+ c->num_outputs = MAX2(c->num_outputs,
+ offset + instr->num_components);
+ break;
+
+ case nir_intrinsic_discard:
+ if (c->execute.file != QFILE_NULL) {
+ /* Only discard channels that are currently active. */
+ vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, c->discard,
+ vir_uniform_ui(c, ~0));
+ } else {
+ vir_MOV_dest(c, c->discard, vir_uniform_ui(c, ~0));
+ }
+ break;
+
+ case nir_intrinsic_discard_if: {
+ /* true (~0) if we're discarding */
+ struct qreg cond = ntq_get_src(c, instr->src[0], 0);
+
+ if (c->execute.file != QFILE_NULL) {
+ /* execute == 0 means the channel is active. Invert
+ * the condition so that we can use zero as "executing
+ * and discarding."
+ */
+ vir_PF(c, vir_AND(c, c->execute, vir_NOT(c, cond)),
+ V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, c->discard, cond);
+ } else {
+ vir_OR_dest(c, c->discard, c->discard, cond);
+ }
+
+ break;
+ }
+
+ default:
+ fprintf(stderr, "Unknown intrinsic: ");
+ nir_print_instr(&instr->instr, stderr);
+ fprintf(stderr, "\n");
+ break;
+ }
+}
+
+/* Clears (activates) the execute flags for any channels whose jump target
+ * matches this block.
+ *
+ * c->execute holds, per channel, the index of the block at which it should
+ * resume; zero means "currently executing."
+ */
+static void
+ntq_activate_execute_for_block(struct v3d_compile *c)
+{
+ vir_PF(c, vir_SUB(c, c->execute, vir_uniform_ui(c, c->cur_block->index)),
+ V3D_QPU_PF_PUSHZ);
+
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
+}
+
+/* Emits a NIR if statement using the execute-flag divergence scheme:
+ * inactive channels carry the index of the block where they resume, and
+ * uniform branches skip entire sides when no channel needs them.
+ */
+static void
+ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt)
+{
+ nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
+ bool empty_else_block =
+ (nir_else_block == nir_if_last_else_block(if_stmt) &&
+ exec_list_is_empty(&nir_else_block->instr_list));
+
+ struct qblock *then_block = vir_new_block(c);
+ struct qblock *after_block = vir_new_block(c);
+ struct qblock *else_block;
+ if (empty_else_block)
+ else_block = after_block;
+ else
+ else_block = vir_new_block(c);
+
+ /* At top level all channels are active, so start tracking. */
+ bool was_top_level = false;
+ if (c->execute.file == QFILE_NULL) {
+ c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
+ was_top_level = true;
+ }
+
+ /* Set A for executing (execute == 0) and jumping (if->condition ==
+ * 0) channels, and then update execute flags for those to point to
+ * the ELSE block.
+ */
+ vir_PF(c, vir_OR(c,
+ c->execute,
+ ntq_get_src(c, if_stmt->condition, 0)),
+ V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA,
+ c->execute,
+ vir_uniform_ui(c, else_block->index));
+
+ /* Jump to ELSE if nothing is active for THEN, otherwise fall
+ * through.
+ */
+ vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
+ vir_link_blocks(c->cur_block, else_block);
+ vir_link_blocks(c->cur_block, then_block);
+
+ /* Process the THEN block. */
+ vir_set_emit_block(c, then_block);
+ ntq_emit_cf_list(c, &if_stmt->then_list);
+
+ if (!empty_else_block) {
+ /* Handle the end of the THEN block. First, all currently
+ * active channels update their execute flags to point to
+ * ENDIF
+ */
+ vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
+ vir_uniform_ui(c, after_block->index));
+
+ /* If everything points at ENDIF, then jump there immediately. */
+ vir_PF(c, vir_SUB(c, c->execute,
+ vir_uniform_ui(c, after_block->index)),
+ V3D_QPU_PF_PUSHZ);
+ vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
+ vir_link_blocks(c->cur_block, after_block);
+ vir_link_blocks(c->cur_block, else_block);
+
+ vir_set_emit_block(c, else_block);
+ ntq_activate_execute_for_block(c);
+ ntq_emit_cf_list(c, &if_stmt->else_list);
+ }
+
+ vir_link_blocks(c->cur_block, after_block);
+
+ vir_set_emit_block(c, after_block);
+ if (was_top_level)
+ c->execute = c->undef;
+ else
+ ntq_activate_execute_for_block(c);
+}
+
+/* Emits a NIR jump by redirecting the active channels' execute flags to the
+ * loop's break or continue block; the actual branch happens at the loop
+ * bottom. Returns must already have been lowered away.
+ */
+static void
+ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
+{
+ switch (jump->type) {
+ case nir_jump_break:
+ vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
+ vir_uniform_ui(c, c->loop_break_block->index));
+ break;
+
+ case nir_jump_continue:
+ vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
+ vir_uniform_ui(c, c->loop_cont_block->index));
+ break;
+
+ case nir_jump_return:
+ /* Fixed typo in the message ("shouold"). */
+ unreachable("All returns should be lowered\n");
+ }
+}
+
+/* Dispatches one NIR instruction to the matching emit helper. Unknown
+ * instruction types are fatal (abort), unlike unknown intrinsics/ALU ops.
+ */
+static void
+ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
+{
+ switch (instr->type) {
+ case nir_instr_type_alu:
+ ntq_emit_alu(c, nir_instr_as_alu(instr));
+ break;
+
+ case nir_instr_type_intrinsic:
+ ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
+ break;
+
+ case nir_instr_type_load_const:
+ ntq_emit_load_const(c, nir_instr_as_load_const(instr));
+ break;
+
+ case nir_instr_type_ssa_undef:
+ ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
+ break;
+
+ case nir_instr_type_tex:
+ ntq_emit_tex(c, nir_instr_as_tex(instr));
+ break;
+
+ case nir_instr_type_jump:
+ ntq_emit_jump(c, nir_instr_as_jump(instr));
+ break;
+
+ default:
+ fprintf(stderr, "Unknown NIR instr type: ");
+ nir_print_instr(instr, stderr);
+ fprintf(stderr, "\n");
+ abort();
+ }
+}
+
+/* Emits every instruction of a NIR basic block, in order. */
+static void
+ntq_emit_block(struct v3d_compile *c, nir_block *block)
+{
+ nir_foreach_instr(instr, block) {
+ ntq_emit_instr(c, instr);
+ }
+}
+
+static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
+
+/* Emits a NIR loop using the execute-flag scheme: the loop body is a single
+ * VIR block re-entered while any channel still targets the continue block.
+ * Saves/restores the enclosing loop's break/continue blocks for nesting.
+ */
+static void
+ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
+{
+ bool was_top_level = false;
+ if (c->execute.file == QFILE_NULL) {
+ c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
+ was_top_level = true;
+ }
+
+ struct qblock *save_loop_cont_block = c->loop_cont_block;
+ struct qblock *save_loop_break_block = c->loop_break_block;
+
+ c->loop_cont_block = vir_new_block(c);
+ c->loop_break_block = vir_new_block(c);
+
+ vir_link_blocks(c->cur_block, c->loop_cont_block);
+ vir_set_emit_block(c, c->loop_cont_block);
+ ntq_activate_execute_for_block(c);
+
+ ntq_emit_cf_list(c, &loop->body);
+
+ /* Re-enable any previous continues now, so our ANYA check below
+ * works.
+ *
+ * XXX: Use the .ORZ flags update, instead.
+ */
+ vir_PF(c, vir_SUB(c,
+ c->execute,
+ vir_uniform_ui(c, c->loop_cont_block->index)),
+ V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
+
+ vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+
+ /* Loop back while any channel is still active (execute == 0). */
+ vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
+ vir_link_blocks(c->cur_block, c->loop_cont_block);
+ vir_link_blocks(c->cur_block, c->loop_break_block);
+
+ vir_set_emit_block(c, c->loop_break_block);
+ if (was_top_level)
+ c->execute = c->undef;
+ else
+ ntq_activate_execute_for_block(c);
+
+ c->loop_break_block = save_loop_break_block;
+ c->loop_cont_block = save_loop_cont_block;
+}
+
+/* Non-main functions should have been inlined away before reaching here. */
+static void
+ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
+{
+ fprintf(stderr, "FUNCTIONS not handled.\n");
+ abort();
+}
+
+/* Walks a NIR control-flow list, emitting each node in order. */
+static void
+ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
+{
+ foreach_list_typed(nir_cf_node, node, node, list) {
+ switch (node->type) {
+ case nir_cf_node_block:
+ ntq_emit_block(c, nir_cf_node_as_block(node));
+ break;
+
+ case nir_cf_node_if:
+ ntq_emit_if(c, nir_cf_node_as_if(node));
+ break;
+
+ case nir_cf_node_loop:
+ ntq_emit_loop(c, nir_cf_node_as_loop(node));
+ break;
+
+ case nir_cf_node_function:
+ ntq_emit_function(c, nir_cf_node_as_function(node));
+ break;
+
+ default:
+ fprintf(stderr, "Unknown NIR node type\n");
+ abort();
+ }
+ }
+}
+
+/* Emits a function implementation: its local registers, then its body. */
+static void
+ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
+{
+ ntq_setup_registers(c, &impl->registers);
+ ntq_emit_cf_list(c, &impl->body);
+}
+
+/* Translates the NIR shader into VIR: stage payload registers first, then
+ * inputs/outputs/uniforms/registers, then the body of main.
+ */
+static void
+nir_to_vir(struct v3d_compile *c)
+{
+ if (c->s->stage == MESA_SHADER_FRAGMENT) {
+ /* Fragment payload arrives in fixed physical registers 0-2. */
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
+ c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
+
+ if (c->s->info.fs.uses_discard)
+ c->discard = vir_MOV(c, vir_uniform_ui(c, 0));
+
+ /* Point/line coordinates come in as extra varyings. */
+ if (c->fs_key->is_points) {
+ c->point_x = emit_fragment_varying(c, NULL, 0);
+ c->point_y = emit_fragment_varying(c, NULL, 0);
+ } else if (c->fs_key->is_lines) {
+ c->line_x = emit_fragment_varying(c, NULL, 0);
+ }
+ }
+
+ ntq_setup_inputs(c);
+ ntq_setup_outputs(c);
+ ntq_setup_uniforms(c);
+ ntq_setup_registers(c, &c->s->registers);
+
+ /* Find the main function and emit the body. */
+ nir_foreach_function(function, c->s) {
+ assert(strcmp(function->name, "main") == 0);
+ assert(function->impl);
+ ntq_emit_impl(c, function->impl);
+ }
+}
+
+/* NIR compiler options for v3d: request lowering of the operations the VIR
+ * backend has no native instruction for.
+ */
+const nir_shader_compiler_options v3d_nir_options = {
+ .lower_extract_byte = true,
+ .lower_extract_word = true,
+ .lower_bitfield_insert = true,
+ .lower_bitfield_extract = true,
+ .lower_ffma = true,
+ .lower_flrp32 = true,
+ .lower_fpow = true,
+ .lower_fsat = true,
+ .lower_fsqrt = true,
+ .lower_negate = true,
+ .native_integers = true,
+};
+
+
+#if 0
+/* Debug helper (currently compiled out): counts all NIR instructions in the
+ * shader, e.g. for shader-db style statistics.
+ */
+static int
+count_nir_instrs(nir_shader *nir)
+{
+ int count = 0;
+ nir_foreach_function(function, nir) {
+ if (!function->impl)
+ continue;
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr(instr, block)
+ count++;
+ }
+ }
+ return count;
+}
+#endif
+
+/* Top-level entry: NIR -> VIR -> optimized VIR -> QPU code, with optional
+ * debug dumps at each stage when the matching V3D_DEBUG flags are set.
+ */
+void
+v3d_nir_to_vir(struct v3d_compile *c)
+{
+ if (V3D_DEBUG & (V3D_DEBUG_NIR |
+ v3d_debug_flag_for_shader_stage(c->s->stage))) {
+ fprintf(stderr, "%s prog %d/%d NIR:\n",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id);
+ nir_print_shader(c->s, stderr);
+ }
+
+ nir_to_vir(c);
+
+ /* Emit the stage's fixed-function epilogue. */
+ switch (c->s->stage) {
+ case MESA_SHADER_FRAGMENT:
+ emit_frag_end(c);
+ break;
+ case MESA_SHADER_VERTEX:
+ emit_vert_end(c);
+ break;
+ default:
+ unreachable("bad stage");
+ }
+
+ if (V3D_DEBUG & (V3D_DEBUG_VIR |
+ v3d_debug_flag_for_shader_stage(c->s->stage))) {
+ fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id);
+ vir_dump(c);
+ fprintf(stderr, "\n");
+ }
+
+ vir_optimize(c);
+ vir_lower_uniforms(c);
+
+ /* XXX: vir_schedule_instructions(c); */
+
+ if (V3D_DEBUG & (V3D_DEBUG_VIR |
+ v3d_debug_flag_for_shader_stage(c->s->stage))) {
+ fprintf(stderr, "%s prog %d/%d VIR:\n",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id);
+ vir_dump(c);
+ fprintf(stderr, "\n");
+ }
+
+ v3d_vir_to_qpu(c);
+}
--- /dev/null
+/*
+ * Copyright © 2010 Intel Corporation
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file
+ *
+ * The basic model of the list scheduler is to take a basic block, compute a
+ * DAG of the dependencies, and make a list of the DAG heads. Heuristically
+ * pick a DAG head, then put all the children that are now DAG heads into the
+ * list of things to schedule.
+ *
+ * The goal of scheduling here is to pack pairs of operations together in a
+ * single QPU instruction.
+ */
+
+#include "qpu/qpu_disasm.h"
+#include "v3d_compiler.h"
+#include "util/ralloc.h"
+
+static bool debug;
+
+struct schedule_node_child;
+
+/* One instruction in the scheduling DAG. */
+struct schedule_node {
+ /* Must stay the first member: the scheduler casts list_head
+ * pointers back to schedule_node (see calculate_reverse_deps()).
+ */
+ struct list_head link;
+ struct qinst *inst;
+ /* Dependent instructions (edges out of this node). */
+ struct schedule_node_child *children;
+ uint32_t child_count;
+ uint32_t child_array_size;
+ uint32_t parent_count;
+
+ /* Longest cycles + instruction_latency() of any parent of this node. */
+ uint32_t unblocked_time;
+
+ /**
+ * Minimum number of cycles from scheduling this instruction until the
+ * end of the program, based on the slowest dependency chain through
+ * the children.
+ */
+ uint32_t delay;
+
+ /**
+ * cycles between this instruction being scheduled and when its result
+ * can be consumed.
+ */
+ uint32_t latency;
+};
+
+/* One DAG edge; write_after_read edges can be scheduled more loosely. */
+struct schedule_node_child {
+ struct schedule_node *node;
+ bool write_after_read;
+};
+
+/* When walking the instructions in reverse, we need to swap before/after in
+ * add_dep().
+ */
+enum direction { F, R };
+
+/* Tracks, per resource, the last node that touched it while building the
+ * dependency DAG; one pass is run forward and one in reverse.
+ */
+struct schedule_state {
+ struct schedule_node *last_r[6];
+ struct schedule_node *last_rf[64];
+ struct schedule_node *last_sf;
+ struct schedule_node *last_vpm_read;
+ struct schedule_node *last_tmu_write;
+ struct schedule_node *last_tlb;
+ struct schedule_node *last_vpm;
+ struct schedule_node *last_unif;
+ struct schedule_node *last_rtop;
+ enum direction dir;
+ /* Estimated cycle when the current instruction would start. */
+ uint32_t time;
+};
+
+/* Adds a DAG edge before -> after, deduplicating identical edges. When
+ * walking in reverse (state->dir == R) the roles are swapped so edges always
+ * point in program order.
+ */
+static void
+add_dep(struct schedule_state *state,
+ struct schedule_node *before,
+ struct schedule_node *after,
+ bool write)
+{
+ /* A read dep found on the reverse walk is a write-after-read. */
+ bool write_after_read = !write && state->dir == R;
+
+ if (!before || !after)
+ return;
+
+ assert(before != after);
+
+ if (state->dir == R) {
+ struct schedule_node *t = before;
+ before = after;
+ after = t;
+ }
+
+ /* Skip edges we already have. */
+ for (int i = 0; i < before->child_count; i++) {
+ if (before->children[i].node == after &&
+ (before->children[i].write_after_read == write_after_read)) {
+ return;
+ }
+ }
+
+ /* Grow the child array geometrically as needed. */
+ if (before->child_array_size <= before->child_count) {
+ before->child_array_size = MAX2(before->child_array_size * 2, 16);
+ before->children = reralloc(before, before->children,
+ struct schedule_node_child,
+ before->child_array_size);
+ }
+
+ before->children[before->child_count].node = after;
+ before->children[before->child_count].write_after_read =
+ write_after_read;
+ before->child_count++;
+ after->parent_count++;
+}
+
+/* Records that n reads the resource last written by *before. */
+static void
+add_read_dep(struct schedule_state *state,
+ struct schedule_node *before,
+ struct schedule_node *after)
+{
+ add_dep(state, before, after, false);
+}
+
+/* Records that n writes the resource, and makes n its new last writer. */
+static void
+add_write_dep(struct schedule_state *state,
+ struct schedule_node **before,
+ struct schedule_node *after)
+{
+ add_dep(state, *before, after, true);
+ *before = after;
+}
+
+/* Returns whether the instruction writes the TLB (color/Z writes through
+ * either the add or mul ALU half).
+ */
+static bool
+qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
+{
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ bool add_writes_tlb = (inst->alu.add.magic_write &&
+ (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
+ inst->alu.add.waddr == V3D_QPU_WADDR_TLBU));
+ bool mul_writes_tlb = (inst->alu.mul.magic_write &&
+ (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
+ inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU));
+
+ return add_writes_tlb || mul_writes_tlb;
+}
+
+/* Adds read dependencies for one ALU input mux: the physical register file
+ * slot for mux A/B, or the accumulator (r0-r5) otherwise.
+ */
+static void
+process_mux_deps(struct schedule_state *state, struct schedule_node *n,
+ enum v3d_qpu_mux mux)
+{
+ switch (mux) {
+ case V3D_QPU_MUX_A:
+ add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
+ break;
+ case V3D_QPU_MUX_B:
+ add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n);
+ break;
+ default:
+ add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
+ break;
+ }
+}
+
+
+/* Adds write dependencies for an instruction's write address: either a
+ * register-file slot (non-magic) or the appropriate magic-register resource
+ * (TMU, accumulators, VPM, TLB).
+ */
+static void
+process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
+ uint32_t waddr, bool magic)
+{
+ if (!magic) {
+ add_write_dep(state, &state->last_rf[waddr], n);
+ } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
+ add_write_dep(state, &state->last_tmu_write, n);
+ } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
+ /* Handled by v3d_qpu_writes_r4() check. */
+ } else {
+ switch (waddr) {
+ case V3D_QPU_WADDR_R0:
+ case V3D_QPU_WADDR_R1:
+ case V3D_QPU_WADDR_R2:
+ case V3D_QPU_WADDR_R3:
+ case V3D_QPU_WADDR_R4:
+ case V3D_QPU_WADDR_R5:
+ add_write_dep(state,
+ &state->last_r[waddr - V3D_QPU_WADDR_R0],
+ n);
+ break;
+
+ case V3D_QPU_WADDR_VPM:
+ case V3D_QPU_WADDR_VPMU:
+ add_write_dep(state, &state->last_vpm, n);
+ break;
+
+ case V3D_QPU_WADDR_TLB:
+ case V3D_QPU_WADDR_TLBU:
+ add_write_dep(state, &state->last_tlb, n);
+ break;
+
+ case V3D_QPU_WADDR_NOP:
+ break;
+
+ default:
+ fprintf(stderr, "Unknown waddr %d\n", waddr);
+ abort();
+ }
+ }
+}
+
+/* Conditional execution reads the flags set by an earlier instruction. */
+static void
+process_cond_deps(struct schedule_state *state, struct schedule_node *n,
+ enum v3d_qpu_cond cond)
+{
+ if (cond != V3D_QPU_COND_NONE)
+ add_read_dep(state, state->last_sf, n);
+}
+
+/* A pack-flags update writes the flags. */
+static void
+process_pf_deps(struct schedule_state *state, struct schedule_node *n,
+ enum v3d_qpu_pf pf)
+{
+ if (pf != V3D_QPU_PF_NONE)
+ add_write_dep(state, &state->last_sf, n);
+}
+
+/* An update-flags operation writes the flags. */
+static void
+process_uf_deps(struct schedule_state *state, struct schedule_node *n,
+ enum v3d_qpu_uf uf)
+{
+ if (uf != V3D_QPU_UF_NONE)
+ add_write_dep(state, &state->last_sf, n);
+}
+
+/**
+ * Common code for dependencies that need to be tracked both forward and
+ * backward.
+ *
+ * This is for things like "all reads of r4 have to happen between the r4
+ * writes that surround them".
+ */
+static void
+calculate_deps(struct schedule_state *state, struct schedule_node *n)
+{
+ struct qinst *qinst = n->inst;
+ struct v3d_qpu_instr *inst = &qinst->qpu;
+
+ /* Branches: depend on flags for the condition and serialize against
+ * uniform reads (branches consume a uniform for the target).
+ */
+ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
+ if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
+ add_read_dep(state, state->last_sf, n);
+
+ /* XXX: BDI */
+ /* XXX: BDU */
+ /* XXX: ub */
+ /* XXX: raddr_a */
+
+ add_write_dep(state, &state->last_unif, n);
+ return;
+ }
+
+ assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
+
+ /* XXX: LOAD_IMM */
+
+ /* Source dependencies for both ALU halves. */
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
+ process_mux_deps(state, n, inst->alu.add.a);
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
+ process_mux_deps(state, n, inst->alu.add.b);
+
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
+ process_mux_deps(state, n, inst->alu.mul.a);
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
+ process_mux_deps(state, n, inst->alu.mul.b);
+
+ /* Opcode-specific resource dependencies. */
+ switch (inst->alu.add.op) {
+ case V3D_QPU_A_VPMSETUP:
+ /* Could distinguish read/write by unpacking the uniform. */
+ add_write_dep(state, &state->last_vpm, n);
+ add_write_dep(state, &state->last_vpm_read, n);
+ break;
+
+ case V3D_QPU_A_STVPMV:
+ case V3D_QPU_A_STVPMD:
+ case V3D_QPU_A_STVPMP:
+ add_write_dep(state, &state->last_vpm, n);
+ break;
+
+ case V3D_QPU_A_MSF:
+ add_read_dep(state, state->last_tlb, n);
+ break;
+
+ case V3D_QPU_A_SETMSF:
+ case V3D_QPU_A_SETREVF:
+ add_write_dep(state, &state->last_tlb, n);
+ break;
+
+ case V3D_QPU_A_FLAPUSH:
+ case V3D_QPU_A_FLBPUSH:
+ case V3D_QPU_A_VFLA:
+ case V3D_QPU_A_VFLNA:
+ case V3D_QPU_A_VFLB:
+ case V3D_QPU_A_VFLNB:
+ add_read_dep(state, state->last_sf, n);
+ break;
+
+ case V3D_QPU_A_FLBPOP:
+ add_write_dep(state, &state->last_sf, n);
+ break;
+
+ default:
+ break;
+ }
+
+ switch (inst->alu.mul.op) {
+ case V3D_QPU_M_MULTOP:
+ case V3D_QPU_M_UMUL24:
+ /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
+ * resets it to 0. We could possibly reorder umul24s relative
+ * to each other, but for now just keep all the MUL parts in
+ * order.
+ */
+ add_write_dep(state, &state->last_rtop, n);
+ break;
+ default:
+ break;
+ }
+
+ /* Destination dependencies. */
+ if (inst->alu.add.op != V3D_QPU_A_NOP) {
+ process_waddr_deps(state, n, inst->alu.add.waddr,
+ inst->alu.add.magic_write);
+ }
+ if (inst->alu.mul.op != V3D_QPU_M_NOP) {
+ process_waddr_deps(state, n, inst->alu.mul.waddr,
+ inst->alu.mul.magic_write);
+ }
+
+ /* Implicit accumulator writes (signals, SFU results, etc.). */
+ if (v3d_qpu_writes_r3(inst))
+ add_write_dep(state, &state->last_r[3], n);
+ if (v3d_qpu_writes_r4(inst))
+ add_write_dep(state, &state->last_r[4], n);
+ if (v3d_qpu_writes_r5(inst))
+ add_write_dep(state, &state->last_r[5], n);
+
+ if (inst->sig.thrsw) {
+ /* All accumulator contents and flags are undefined after the
+ * switch.
+ */
+ for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
+ add_write_dep(state, &state->last_r[i], n);
+ add_write_dep(state, &state->last_sf, n);
+
+ /* Scoreboard-locking operations have to stay after the last
+ * thread switch.
+ */
+ add_write_dep(state, &state->last_tlb, n);
+
+ add_write_dep(state, &state->last_tmu_write, n);
+ }
+
+ if (inst->sig.ldtmu) {
+ /* TMU loads are coming from a FIFO, so ordering is important.
+ */
+ add_write_dep(state, &state->last_tmu_write, n);
+ }
+
+ if (inst->sig.ldtlb | inst->sig.ldtlbu)
+ add_read_dep(state, state->last_tlb, n);
+
+ if (inst->sig.ldvpm)
+ add_write_dep(state, &state->last_vpm_read, n);
+
+ /* inst->sig.ldunif or sideband uniform read */
+ if (qinst->uniform != ~0)
+ add_write_dep(state, &state->last_unif, n);
+
+ /* Flag reads (conditions) and flag writes (pf/uf updates). */
+ process_cond_deps(state, n, inst->flags.ac);
+ process_cond_deps(state, n, inst->flags.mc);
+ process_pf_deps(state, n, inst->flags.apf);
+ process_pf_deps(state, n, inst->flags.mpf);
+ process_uf_deps(state, n, inst->flags.auf);
+ process_uf_deps(state, n, inst->flags.muf);
+}
+
+/* Builds read-after-write and write-after-write edges by walking the
+ * instruction list in program order.
+ */
+static void
+calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
+{
+ struct schedule_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.dir = F;
+
+ list_for_each_entry(struct schedule_node, node, schedule_list, link)
+ calculate_deps(&state, node);
+}
+
+/* Builds write-after-read edges by walking the instruction list backwards.
+ *
+ * The list_head -> schedule_node cast relies on "link" being the first
+ * member of struct schedule_node.
+ */
+static void
+calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list)
+{
+ struct list_head *node;
+ struct schedule_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.dir = R;
+
+ for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
+ calculate_deps(&state, (struct schedule_node *)node);
+ }
+}
+
+/* Per-block state used while choosing instructions, tracking recent writes
+ * whose results are not yet readable and whether the TLB scoreboard lock
+ * has been taken.
+ */
+struct choose_scoreboard {
+ int tick;
+ int last_sfu_write_tick;
+ int last_ldvary_tick;
+ int last_uniforms_reset_tick;
+ uint32_t last_waddr_add, last_waddr_mul;
+ bool tlb_locked;
+};
+
+/* Returns whether reading this input mux now would violate a result-latency
+ * rule: a register-file read in the instruction right after its write, an
+ * r4 read within 2 ticks of an SFU write, or an r5 read right after ldvary.
+ */
+static bool
+mux_reads_too_soon(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
+{
+ switch (mux) {
+ case V3D_QPU_MUX_A:
+ if (scoreboard->last_waddr_add == inst->raddr_a ||
+ scoreboard->last_waddr_mul == inst->raddr_a) {
+ return true;
+ }
+ break;
+
+ case V3D_QPU_MUX_B:
+ if (scoreboard->last_waddr_add == inst->raddr_b ||
+ scoreboard->last_waddr_mul == inst->raddr_b) {
+ return true;
+ }
+ break;
+
+ case V3D_QPU_MUX_R4:
+ if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2)
+ return true;
+ break;
+
+ case V3D_QPU_MUX_R5:
+ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+/* Returns whether any source mux of the instruction reads a result that is
+ * not yet available on the current tick.
+ */
+static bool
+reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
+                           struct qinst *qinst)
+{
+        const struct v3d_qpu_instr *inst = &qinst->qpu;
+
+        /* XXX: Branching off of raddr. */
+        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
+                return false;
+
+        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
+
+        /* Only check muxes the op actually consumes. */
+        if (inst->alu.add.op != V3D_QPU_A_NOP) {
+                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
+                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
+                        return true;
+                }
+                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
+                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
+                        return true;
+                }
+        }
+
+        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
+                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
+                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
+                        return true;
+                }
+                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
+                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
+                        return true;
+                }
+        }
+
+        /* XXX: imm */
+
+        return false;
+}
+
+/* Rejects instructions that write r4 while a prior SFU write's result is
+ * still in flight.
+ */
+static bool
+writes_too_soon_after_write(struct choose_scoreboard *scoreboard,
+                            struct qinst *qinst)
+{
+        const struct v3d_qpu_instr *inst = &qinst->qpu;
+
+        /* Don't schedule any other r4 write too soon after an SFU write.
+         * This would normally be prevented by dependency tracking, but might
+         * occur if a dead SFU computation makes it to scheduling.
+         */
+        return (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 &&
+                v3d_qpu_writes_r4(inst));
+}
+
+/* A TLB access implies a scoreboard wait, which is illegal in the very
+ * first instruction of a fragment shader.
+ */
+static bool
+pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
+                          const struct v3d_qpu_instr *inst)
+{
+        if (scoreboard->tick != 0)
+                return false;
+
+        return qpu_inst_is_tlb(inst);
+}
+
+/* Assigns a scheduling priority to the instruction; higher values are
+ * scheduled earlier.  Categories are ranked relative to each other by the
+ * running next_score counter.
+ */
+static int
+get_instruction_priority(const struct v3d_qpu_instr *inst)
+{
+        uint32_t baseline_score;
+        uint32_t next_score = 0;
+
+        /* Schedule TLB operations as late as possible, to get more
+         * parallelism between shaders.
+         */
+        if (qpu_inst_is_tlb(inst))
+                return next_score;
+        next_score++;
+
+        /* Schedule texture read results collection late to hide latency. */
+        if (inst->sig.ldtmu)
+                return next_score;
+        next_score++;
+
+        /* Default score for things that aren't otherwise special. */
+        baseline_score = next_score;
+        next_score++;
+
+        /* Schedule texture read setup early to hide their latency better. */
+        if (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
+            ((inst->alu.add.magic_write &&
+              v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) ||
+             (inst->alu.mul.magic_write &&
+              v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)))) {
+                return next_score;
+        }
+        next_score++;
+
+        return baseline_score;
+}
+
+/* Returns whether the magic waddr targets a peripheral unit (TMU, SFU,
+ * TLB, VPM, or TSY).
+ */
+static bool
+qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
+{
+        if (v3d_qpu_magic_waddr_is_tmu(waddr))
+                return true;
+        if (v3d_qpu_magic_waddr_is_sfu(waddr))
+                return true;
+        if (v3d_qpu_magic_waddr_is_tlb(waddr))
+                return true;
+        if (v3d_qpu_magic_waddr_is_vpm(waddr))
+                return true;
+
+        return v3d_qpu_magic_waddr_is_tsy(waddr);
+}
+
+/* Returns whether the instruction touches a peripheral, either through a
+ * magic-waddr write or through one of the load signals.
+ */
+static bool
+qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
+{
+        if (inst->sig.ldvpm ||
+            inst->sig.ldtmu ||
+            inst->sig.ldtlb ||
+            inst->sig.ldtlbu)
+                return true;
+
+        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+                return false;
+
+        if (inst->alu.add.op != V3D_QPU_A_NOP &&
+            inst->alu.add.magic_write &&
+            qpu_magic_waddr_is_periph(inst->alu.add.waddr))
+                return true;
+
+        return (inst->alu.mul.op != V3D_QPU_M_NOP &&
+                inst->alu.mul.magic_write &&
+                qpu_magic_waddr_is_periph(inst->alu.mul.waddr));
+}
+
+/* Tries to build one instruction performing both a's and b's operations.
+ * On success, *result holds the merged instruction; the inputs are never
+ * modified on failure.
+ */
+static bool
+qpu_merge_inst(const struct v3d_device_info *devinfo,
+               struct v3d_qpu_instr *result,
+               const struct v3d_qpu_instr *a,
+               const struct v3d_qpu_instr *b)
+{
+        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
+            b->type != V3D_QPU_INSTR_TYPE_ALU) {
+                return false;
+        }
+
+        /* Can't do more than one peripheral access in an instruction. */
+        if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b))
+                return false;
+
+        struct v3d_qpu_instr merge = *a;
+
+        /* Each ALU slot (add/mul) can be filled by at most one of the two
+         * instructions; its condition/flag state comes along with the op.
+         */
+        if (b->alu.add.op != V3D_QPU_A_NOP) {
+                if (a->alu.add.op != V3D_QPU_A_NOP)
+                        return false;
+                merge.alu.add = b->alu.add;
+
+                merge.flags.ac = b->flags.ac;
+                merge.flags.apf = b->flags.apf;
+                merge.flags.auf = b->flags.auf;
+        }
+
+        if (b->alu.mul.op != V3D_QPU_M_NOP) {
+                if (a->alu.mul.op != V3D_QPU_M_NOP)
+                        return false;
+                merge.alu.mul = b->alu.mul;
+
+                merge.flags.mc = b->flags.mc;
+                merge.flags.mpf = b->flags.mpf;
+                merge.flags.muf = b->flags.muf;
+        }
+
+        /* The raddr_a/raddr_b fields are shared, so both instructions must
+         * agree on any regfile address they both read.
+         */
+        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) {
+                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) &&
+                    a->raddr_a != b->raddr_a) {
+                        return false;
+                }
+                merge.raddr_a = b->raddr_a;
+        }
+
+        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
+                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
+                    a->raddr_b != b->raddr_b) {
+                        return false;
+                }
+                merge.raddr_b = b->raddr_b;
+        }
+
+        merge.sig.thrsw |= b->sig.thrsw;
+        merge.sig.ldunif |= b->sig.ldunif;
+        merge.sig.ldtmu |= b->sig.ldtmu;
+        merge.sig.ldvary |= b->sig.ldvary;
+        merge.sig.ldvpm |= b->sig.ldvpm;
+        merge.sig.small_imm |= b->sig.small_imm;
+        merge.sig.ldtlb |= b->sig.ldtlb;
+        merge.sig.ldtlbu |= b->sig.ldtlbu;
+        merge.sig.ucb |= b->sig.ucb;
+        merge.sig.rotate |= b->sig.rotate;
+        merge.sig.wrtmuc |= b->sig.wrtmuc;
+
+        /* Packing doubles as validation that the combined signals and ops
+         * are actually encodable.
+         */
+        uint64_t packed;
+        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
+
+        *result = merge;
+        /* No modifying the real instructions on failure. */
+        assert(ok || (a != result && b != result));
+
+        return ok;
+}
+
+/* Picks the best DAG head to issue next (or to pair with prev_inst, when
+ * set).  Candidates violating a hardware hazard are skipped; ties on
+ * priority are broken by critical-path delay.  Returns NULL if nothing can
+ * issue this tick.
+ */
+static struct schedule_node *
+choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
+                               struct choose_scoreboard *scoreboard,
+                               struct list_head *schedule_list,
+                               struct schedule_node *prev_inst)
+{
+        struct schedule_node *chosen = NULL;
+        int chosen_prio = 0;
+
+        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
+         * will handle pairing it along with filling the delay slots.
+         */
+        if (prev_inst) {
+                if (prev_inst->inst->qpu.sig.thrsw)
+                        return NULL;
+        }
+
+        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+                const struct v3d_qpu_instr *inst = &n->inst->qpu;
+
+                /* Don't choose the branch instruction until it's the last one
+                 * left.  We'll move it up to fit its delay slots after we
+                 * choose it.
+                 */
+                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
+                    !list_is_singular(schedule_list)) {
+                        continue;
+                }
+
+                /* "An instruction must not read from a location in physical
+                 *  regfile A or B that was written to by the previous
+                 *  instruction."
+                 */
+                if (reads_too_soon_after_write(scoreboard, n->inst))
+                        continue;
+
+                if (writes_too_soon_after_write(scoreboard, n->inst))
+                        continue;
+
+                /* "A scoreboard wait must not occur in the first two
+                 *  instructions of a fragment shader.  This is either the
+                 *  explicit Wait for Scoreboard signal or an implicit wait
+                 *  with the first tile-buffer read or write instruction."
+                 */
+                if (pixel_scoreboard_too_soon(scoreboard, inst))
+                        continue;
+
+                /* ldunif and ldvary both write r5, but ldunif does so a tick
+                 * sooner.  If the ldvary's r5 wasn't used, then ldunif might
+                 * otherwise get scheduled so ldunif and ldvary try to update
+                 * r5 in the same tick.
+                 */
+                if (inst->sig.ldunif &&
+                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
+                        continue;
+                }
+
+                /* If we're trying to pair with another instruction, check
+                 * that they're compatible.
+                 */
+                if (prev_inst) {
+                        /* Don't pair up a thread switch signal -- we'll
+                         * handle pairing it when we pick it on its own.
+                         */
+                        if (inst->sig.thrsw)
+                                continue;
+
+                        /* Only one sideband uniform read fits per pair. */
+                        if (prev_inst->inst->uniform != -1 &&
+                            n->inst->uniform != -1)
+                                continue;
+
+                        /* Don't merge in something that will lock the TLB.
+                         * Hopefully what we have in inst will release some
+                         * other instructions, allowing us to delay the
+                         * TLB-locking instruction until later.
+                         */
+                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
+                                continue;
+
+                        struct v3d_qpu_instr merged_inst;
+                        if (!qpu_merge_inst(devinfo, &merged_inst,
+                                            &prev_inst->inst->qpu, inst)) {
+                                continue;
+                        }
+                }
+
+                int prio = get_instruction_priority(inst);
+
+                /* Found a valid instruction.  If nothing better comes along,
+                 * this one works.
+                 */
+                if (!chosen) {
+                        chosen = n;
+                        chosen_prio = prio;
+                        continue;
+                }
+
+                if (prio > chosen_prio) {
+                        chosen = n;
+                        chosen_prio = prio;
+                } else if (prio < chosen_prio) {
+                        continue;
+                }
+
+                /* Equal priority: prefer the longer critical path. */
+                if (n->delay > chosen->delay) {
+                        chosen = n;
+                        chosen_prio = prio;
+                } else if (n->delay < chosen->delay) {
+                        continue;
+                }
+        }
+
+        return chosen;
+}
+
+/* Records the tick of an SFU write so later r4 accesses can be held back. */
+static void
+update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
+                                  enum v3d_qpu_waddr waddr)
+{
+        if (!v3d_qpu_magic_waddr_is_sfu(waddr))
+                return;
+
+        scoreboard->last_sfu_write_tick = scoreboard->tick;
+}
+
+/* Updates the hazard state after emitting the given instruction on the
+ * current tick.
+ */
+static void
+update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
+                             const struct v3d_qpu_instr *inst)
+{
+        /* Reset to "no physical write" before recording this instruction's
+         * writes.
+         */
+        scoreboard->last_waddr_add = ~0;
+        scoreboard->last_waddr_mul = ~0;
+
+        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
+                return;
+
+        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
+
+        if (inst->alu.add.op != V3D_QPU_A_NOP) {
+                if (inst->alu.add.magic_write) {
+                        update_scoreboard_for_magic_waddr(scoreboard,
+                                                          inst->alu.add.waddr);
+                } else {
+                        scoreboard->last_waddr_add = inst->alu.add.waddr;
+                }
+        }
+
+        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
+                if (inst->alu.mul.magic_write) {
+                        update_scoreboard_for_magic_waddr(scoreboard,
+                                                          inst->alu.mul.waddr);
+                } else {
+                        scoreboard->last_waddr_mul = inst->alu.mul.waddr;
+                }
+        }
+
+        if (inst->sig.ldvary)
+                scoreboard->last_ldvary_tick = scoreboard->tick;
+
+        if (qpu_inst_is_tlb(inst))
+                scoreboard->tlb_locked = true;
+}
+
+/* Debug dump of the scheduling DAG: each node with its unblocked time,
+ * followed by its outstanding children and edge kinds.
+ */
+static void
+dump_state(const struct v3d_device_info *devinfo,
+           struct list_head *schedule_list)
+{
+        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
+                v3d_qpu_dump(devinfo, &n->inst->qpu);
+                fprintf(stderr, "\n");
+
+                for (int i = 0; i < n->child_count; i++) {
+                        struct schedule_node *child = n->children[i].node;
+                        if (!child)
+                                continue;
+
+                        fprintf(stderr, "   - ");
+                        v3d_qpu_dump(devinfo, &child->inst->qpu);
+                        /* 'w' marks a write-after-read edge. */
+                        fprintf(stderr, " (%d parents, %c)\n",
+                                child->parent_count,
+                                n->children[i].write_after_read ? 'w' : 'r');
+                }
+        }
+}
+
+/* Estimated ticks between a magic-waddr write and a dependent instruction
+ * being able to consume its result.
+ */
+static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
+                                    const struct v3d_qpu_instr *after)
+{
+        /* Apply some huge latency between texture fetch requests and getting
+         * their results back.
+         *
+         * FIXME: This is actually pretty bogus.  If we do:
+         *
+         * mov tmu0_s, a
+         * <a bit of math>
+         * mov tmu0_s, b
+         * load_tmu0
+         * <more math>
+         * load_tmu0
+         *
+         * we count that as worse than
+         *
+         * mov tmu0_s, a
+         * mov tmu0_s, b
+         * <lots of math>
+         * load_tmu0
+         * <more math>
+         * load_tmu0
+         *
+         * because we associate the first load_tmu0 with the *second* tmu0_s.
+         */
+        if (v3d_qpu_magic_waddr_is_tmu(waddr) && after->sig.ldtmu)
+                return 100;
+
+        /* Assume that anything depending on us is consuming the SFU result. */
+        if (v3d_qpu_magic_waddr_is_sfu(waddr))
+                return 3;
+
+        return 1;
+}
+
+/* Estimated ticks after "before" issues until "after" may issue, based on
+ * the slowest magic-waddr write in "before".  Minimum is 1 tick.
+ */
+static uint32_t
+instruction_latency(struct schedule_node *before, struct schedule_node *after)
+{
+        const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
+        const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
+        uint32_t latency = 1;
+
+        if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
+            after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
+                return latency;
+
+        if (before_inst->alu.add.magic_write) {
+                latency = MAX2(latency,
+                               magic_waddr_latency(before_inst->alu.add.waddr,
+                                                   after_inst));
+        }
+
+        if (before_inst->alu.mul.magic_write) {
+                latency = MAX2(latency,
+                               magic_waddr_latency(before_inst->alu.mul.waddr,
+                                                   after_inst));
+        }
+
+        return latency;
+}
+
+/** Recursive computation of the delay member of a node.
+ *
+ * delay is the node's critical-path length to the end of the block; a zero
+ * delay marks a node as not yet visited.
+ */
+static void
+compute_delay(struct schedule_node *n)
+{
+        if (!n->child_count) {
+                n->delay = 1;
+        } else {
+                for (int i = 0; i < n->child_count; i++) {
+                        if (!n->children[i].node->delay)
+                                compute_delay(n->children[i].node);
+                        n->delay = MAX2(n->delay,
+                                        n->children[i].node->delay +
+                                        instruction_latency(n, n->children[i].node));
+                }
+        }
+}
+
+/* Removes the scheduled node's outgoing DAG edges, pushing its children's
+ * unblocked time forward and moving newly parent-free children onto the
+ * ready list.  With war_only set, only write-after-read edges are released
+ * (used when the node may still be paired this tick).
+ */
+static void
+mark_instruction_scheduled(struct list_head *schedule_list,
+                           uint32_t time,
+                           struct schedule_node *node,
+                           bool war_only)
+{
+        if (!node)
+                return;
+
+        for (int i = node->child_count - 1; i >= 0; i--) {
+                struct schedule_node *child =
+                        node->children[i].node;
+
+                if (!child)
+                        continue;
+
+                if (war_only && !node->children[i].write_after_read)
+                        continue;
+
+                /* If the requirement is only that the node not appear before
+                 * the last read of its destination, then it can be scheduled
+                 * immediately after (or paired with!) the thing reading the
+                 * destination.
+                 */
+                uint32_t latency = 0;
+                if (!war_only) {
+                        latency = instruction_latency(node,
+                                                      node->children[i].node);
+                }
+
+                child->unblocked_time = MAX2(child->unblocked_time,
+                                             time + latency);
+                child->parent_count--;
+                if (child->parent_count == 0)
+                        list_add(&child->link, schedule_list);
+
+                node->children[i].node = NULL;
+        }
+}
+
+/* Allocates a fresh NOP instruction, used for delay slots and for ticks
+ * where nothing can be scheduled.
+ *
+ * Declared with (void): empty parens in C declare an unprototyped
+ * function, disabling argument checking.
+ */
+static struct qinst *
+vir_nop(void)
+{
+        struct qreg undef = { QFILE_NULL, 0 };
+        struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
+
+        return qinst;
+}
+
+/* NOTE(review): dead code kept under #if 0 -- it references QPU_GET_FIELD /
+ * QPU_SIG / qpu_serialize_one_inst, which are not defined in this file;
+ * presumably carried over from the VC4 scheduler as a template.  Confirm
+ * before re-enabling.
+ */
+#if 0
+static struct qinst *
+nop_after(struct qinst *inst)
+{
+        struct qinst *q = vir_nop();
+
+        list_add(&q->link, &inst->link);
+
+        return q;
+}
+
+/**
+ * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
+ * with another instruction.
+ */
+static void
+emit_thrsw(struct v3d_compile *c,
+           struct choose_scoreboard *scoreboard,
+           const struct v3d_qpu_instr *inst)
+{
+        /* There should be nothing in a thrsw inst being scheduled other than
+         * the signal bits.
+         */
+        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
+        assert(inst->alu.add.op == V3D_QPU_A_NOP);
+        assert(inst->alu.mul.op == V3D_QPU_M_NOP);
+
+        /* Try to find an earlier scheduled instruction that we can merge the
+         * thrsw into.
+         */
+        int thrsw_ip = c->qpu_inst_count;
+        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
+                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
+                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);
+
+                if (prev_sig == QPU_SIG_NONE)
+                        thrsw_ip = c->qpu_inst_count - i;
+        }
+
+        if (thrsw_ip != c->qpu_inst_count) {
+                /* Merge the thrsw into the existing instruction. */
+                c->qpu_insts[thrsw_ip] =
+                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
+        } else {
+                qpu_serialize_one_inst(c, inst);
+                update_scoreboard_for_chosen(scoreboard, inst);
+        }
+
+        /* Fill the delay slots. */
+        while (c->qpu_inst_count < thrsw_ip + 3) {
+                update_scoreboard_for_chosen(scoreboard, v3d_qpu_nop());
+                qpu_serialize_one_inst(c, v3d_qpu_nop());
+        }
+}
+#endif
+
+/* Core list-scheduling loop for one block: repeatedly picks a ready DAG
+ * head (plus an optional pairing partner), appends it to the block's
+ * instruction list, renumbers its uniform reference, and releases its
+ * children.  Returns the estimated cycle count.
+ */
+static uint32_t
+schedule_instructions(struct v3d_compile *c,
+                      struct choose_scoreboard *scoreboard,
+                      struct qblock *block,
+                      struct list_head *schedule_list,
+                      enum quniform_contents *orig_uniform_contents,
+                      uint32_t *orig_uniform_data,
+                      uint32_t *next_uniform)
+{
+        const struct v3d_device_info *devinfo = c->devinfo;
+        uint32_t time = 0;
+
+        if (debug) {
+                fprintf(stderr, "initial deps:\n");
+                dump_state(devinfo, schedule_list);
+                fprintf(stderr, "\n");
+        }
+
+        /* Remove non-DAG heads from the list. */
+        list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
+                if (n->parent_count != 0)
+                        list_del(&n->link);
+        }
+
+        while (!list_empty(schedule_list)) {
+                struct schedule_node *chosen =
+                        choose_instruction_to_schedule(devinfo,
+                                                       scoreboard,
+                                                       schedule_list,
+                                                       NULL);
+                struct schedule_node *merge = NULL;
+
+                /* If there are no valid instructions to schedule, drop a NOP
+                 * in.
+                 */
+                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
+                struct v3d_qpu_instr *inst = &qinst->qpu;
+
+                if (debug) {
+                        fprintf(stderr, "t=%4d: current list:\n",
+                                time);
+                        dump_state(devinfo, schedule_list);
+                        fprintf(stderr, "t=%4d: chose:   ", time);
+                        v3d_qpu_dump(devinfo, inst);
+                        fprintf(stderr, "\n");
+                }
+
+                /* Schedule this instruction onto the QPU list. Also try to
+                 * find an instruction to pair with it.
+                 */
+                if (chosen) {
+                        time = MAX2(chosen->unblocked_time, time);
+                        list_del(&chosen->link);
+                        /* Release only WAR edges first, so the partner pick
+                         * below can consider the newly unblocked nodes.
+                         */
+                        mark_instruction_scheduled(schedule_list, time,
+                                                   chosen, true);
+
+                        merge = choose_instruction_to_schedule(devinfo,
+                                                               scoreboard,
+                                                               schedule_list,
+                                                               chosen);
+                        if (merge) {
+                                time = MAX2(merge->unblocked_time, time);
+                                list_del(&merge->link);
+                                (void)qpu_merge_inst(devinfo, inst,
+                                                     inst, &merge->inst->qpu);
+                                if (merge->inst->uniform != -1) {
+                                        chosen->inst->uniform =
+                                                merge->inst->uniform;
+                                }
+
+                                if (debug) {
+                                        fprintf(stderr, "t=%4d: merging: ",
+                                                time);
+                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
+                                        fprintf(stderr, "\n");
+                                        fprintf(stderr, "         result: ");
+                                        v3d_qpu_dump(devinfo, inst);
+                                        fprintf(stderr, "\n");
+                                }
+                        }
+                }
+
+                /* Update the uniform index for the rewritten location --
+                 * branch target updating will still need to change
+                 * c->uniform_data[] using this index.
+                 */
+                if (qinst->uniform != -1) {
+                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
+                                block->branch_uniform = *next_uniform;
+
+                        c->uniform_data[*next_uniform] =
+                                orig_uniform_data[qinst->uniform];
+                        c->uniform_contents[*next_uniform] =
+                                orig_uniform_contents[qinst->uniform];
+                        qinst->uniform = *next_uniform;
+                        (*next_uniform)++;
+                }
+
+                if (debug) {
+                        fprintf(stderr, "\n");
+                }
+
+                /* Now that we've scheduled a new instruction, some of its
+                 * children can be promoted to the list of instructions ready to
+                 * be scheduled.  Update the children's unblocked time for this
+                 * DAG edge as we do so.
+                 */
+                mark_instruction_scheduled(schedule_list, time, chosen, false);
+
+                if (merge) {
+                        mark_instruction_scheduled(schedule_list, time, merge,
+                                                   false);
+
+                        /* The merged VIR instruction doesn't get re-added to the
+                         * block, so free it now.
+                         */
+                        free(merge->inst);
+                }
+
+                if (0 && inst->sig.thrsw) {
+                        /* XXX emit_thrsw(c, scoreboard, qinst); */
+                } else {
+                        c->qpu_inst_count++;
+                        list_addtail(&qinst->link, &block->instructions);
+                        update_scoreboard_for_chosen(scoreboard, inst);
+                }
+
+                scoreboard->tick++;
+                time++;
+
+                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ||
+                    inst->sig.thrsw /* XXX */) {
+                        block->branch_qpu_ip = c->qpu_inst_count - 1;
+                        /* Fill the delay slots.
+                         *
+                         * We should fill these with actual instructions,
+                         * instead, but that will probably need to be done
+                         * after this, once we know what the leading
+                         * instructions of the successors are (so we can
+                         * handle A/B register file write latency)
+                         */
+                        /* XXX: scoreboard */
+                        int slots = (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ?
+                                     3 : 2);
+                        for (int i = 0; i < slots; i++) {
+                                struct qinst *nop = vir_nop();
+                                list_addtail(&nop->link, &block->instructions);
+
+                                update_scoreboard_for_chosen(scoreboard,
+                                                             &nop->qpu);
+                                c->qpu_inst_count++;
+                                scoreboard->tick++;
+                                time++;
+                        }
+                }
+        }
+
+        return time;
+}
+
+/* Schedules one block: wraps its instructions in schedule nodes, builds the
+ * dependency DAG, computes critical-path delays, and runs the main loop.
+ */
+static uint32_t
+qpu_schedule_instructions_block(struct v3d_compile *c,
+                                struct choose_scoreboard *scoreboard,
+                                struct qblock *block,
+                                enum quniform_contents *orig_uniform_contents,
+                                uint32_t *orig_uniform_data,
+                                uint32_t *next_uniform)
+{
+        void *mem_ctx = ralloc_context(NULL);
+        struct list_head schedule_list;
+
+        list_inithead(&schedule_list);
+
+        /* Wrap each instruction in a scheduler structure. */
+        while (!list_empty(&block->instructions)) {
+                struct qinst *qinst = (struct qinst *)block->instructions.next;
+                struct schedule_node *n =
+                        rzalloc(mem_ctx, struct schedule_node);
+
+                n->inst = qinst;
+
+                list_del(&qinst->link);
+                list_addtail(&n->link, &schedule_list);
+        }
+
+        calculate_forward_deps(c, &schedule_list);
+        calculate_reverse_deps(c, &schedule_list);
+
+        list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
+                compute_delay(n);
+        }
+
+        uint32_t cycles = schedule_instructions(c, scoreboard, block,
+                                                &schedule_list,
+                                                orig_uniform_contents,
+                                                orig_uniform_data,
+                                                next_uniform);
+
+        /* Frees all the schedule nodes at once. */
+        ralloc_free(mem_ctx);
+
+        return cycles;
+}
+
+/* After all blocks have final QPU IPs, patches each branch's instruction
+ * offset and its uniform-stream offset to point at the taken successor.
+ */
+static void
+qpu_set_branch_targets(struct v3d_compile *c)
+{
+        vir_for_each_block(block, c) {
+                /* The end block of the program has no branch. */
+                if (!block->successors[0])
+                        continue;
+
+                /* If there was no branch instruction, then the successor
+                 * block must follow immediately after this one.
+                 */
+                if (block->branch_qpu_ip == ~0) {
+                        assert(block->end_qpu_ip + 1 ==
+                               block->successors[0]->start_qpu_ip);
+                        continue;
+                }
+
+                /* Walk back through the delay slots to find the branch
+                 * instr.
+                 */
+                struct list_head *entry = block->instructions.prev;
+                for (int i = 0; i < 3; i++)
+                        entry = entry->prev;
+                struct qinst *branch = container_of(entry, branch, link);
+                assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
+
+                /* Make sure that the if-we-don't-jump
+                 * successor was scheduled just after the
+                 * delay slots.
+                 */
+                assert(!block->successors[1] ||
+                       block->successors[1]->start_qpu_ip ==
+                       block->branch_qpu_ip + 4);
+
+                /* Branches are relative to the instruction after the 3 delay
+                 * slots (branch_qpu_ip + 4), in bytes.
+                 */
+                branch->qpu.branch.offset =
+                        ((block->successors[0]->start_qpu_ip -
+                          (block->branch_qpu_ip + 4)) *
+                         sizeof(uint64_t));
+
+                /* Set up the relative offset to jump in the
+                 * uniform stream.
+                 *
+                 * Use a temporary here, because
+                 * uniform_data[inst->uniform] may be shared
+                 * between multiple instructions.
+                 */
+                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
+                c->uniform_data[branch->uniform] =
+                        (block->successors[0]->start_uniform -
+                         (block->branch_uniform + 1)) * 4;
+        }
+}
+
+/* Entry point: schedules every block of the program, renumbering uniforms
+ * in emission order and fixing up branch targets.  Returns the estimated
+ * cycle count.
+ */
+uint32_t
+v3d_qpu_schedule_instructions(struct v3d_compile *c)
+{
+        const struct v3d_device_info *devinfo = c->devinfo;
+
+        /* We reorder the uniforms as we schedule instructions, so save the
+         * old data off and replace it.
+         */
+        uint32_t *uniform_data = c->uniform_data;
+        enum quniform_contents *uniform_contents = c->uniform_contents;
+        c->uniform_contents = ralloc_array(c, enum quniform_contents,
+                                           c->num_uniforms);
+        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
+        c->uniform_array_size = c->num_uniforms;
+        uint32_t next_uniform = 0;
+
+        struct choose_scoreboard scoreboard;
+        memset(&scoreboard, 0, sizeof(scoreboard));
+        scoreboard.last_waddr_add = ~0;
+        scoreboard.last_waddr_mul = ~0;
+        /* Negative "long ago" ticks so the too-soon checks pass at start. */
+        scoreboard.last_ldvary_tick = -10;
+        scoreboard.last_sfu_write_tick = -10;
+        scoreboard.last_uniforms_reset_tick = -10;
+
+        if (debug) {
+                fprintf(stderr, "Pre-schedule instructions\n");
+                vir_for_each_block(block, c) {
+                        fprintf(stderr, "BLOCK %d\n", block->index);
+                        list_for_each_entry(struct qinst, qinst,
+                                            &block->instructions, link) {
+                                v3d_qpu_dump(devinfo, &qinst->qpu);
+                                fprintf(stderr, "\n");
+                        }
+                }
+                fprintf(stderr, "\n");
+        }
+
+        uint32_t cycles = 0;
+        vir_for_each_block(block, c) {
+                block->start_qpu_ip = c->qpu_inst_count;
+                block->branch_qpu_ip = ~0;
+                block->start_uniform = next_uniform;
+
+                cycles += qpu_schedule_instructions_block(c,
+                                                          &scoreboard,
+                                                          block,
+                                                          uniform_contents,
+                                                          uniform_data,
+                                                          &next_uniform);
+
+                block->end_qpu_ip = c->qpu_inst_count - 1;
+        }
+
+        qpu_set_branch_targets(c);
+
+        assert(next_uniform == c->num_uniforms);
+
+        return cycles;
+}
--- /dev/null
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file
+ *
+ * Validates the QPU instruction sequence after register allocation and
+ * scheduling.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "v3d_compiler.h"
+#include "qpu/qpu_disasm.h"
+
+/* Walk state for the validator: current instruction pointer and the
+ * history needed by the latency checks.
+ */
+struct v3d_qpu_validate_state {
+        struct v3d_compile *c;
+        /* Previous instruction in program order (NULL at start). */
+        const struct v3d_qpu_instr *last;
+        int ip;
+        /* IP of the last SFU write (-10 initially, i.e. "long ago"). */
+        int last_sfu_write;
+};
+
+/* Reports a validation failure: prints the message, dumps the whole
+ * program with the offending instruction marked, then aborts.
+ */
+static void
+fail_instr(struct v3d_qpu_validate_state *state, const char *msg)
+{
+        struct v3d_compile *c = state->c;
+
+        fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg);
+
+        int cur_ip = 0;
+        vir_for_each_inst_inorder(inst, c) {
+                v3d_qpu_dump(c->devinfo, &inst->qpu);
+
+                if (cur_ip == state->ip)
+                        fprintf(stderr, " *** ERROR ***");
+                cur_ip++;
+
+                fprintf(stderr, "\n");
+        }
+
+        fprintf(stderr, "\n");
+        abort();
+}
+
+/* Returns whether either ALU op performs a magic write whose waddr
+ * satisfies the predicate.
+ */
+static bool
+qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
+                        bool (*predicate)(enum v3d_qpu_waddr waddr))
+{
+        /* Only ALU instructions have add/mul ops to inspect.  The original
+         * check was inverted (== instead of !=), which rejected every ALU
+         * instruction and read the wrong union member for branches.
+         */
+        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+                return false;
+
+        if (inst->alu.add.op != V3D_QPU_A_NOP &&
+            inst->alu.add.magic_write &&
+            predicate(inst->alu.add.waddr))
+                return true;
+
+        if (inst->alu.mul.op != V3D_QPU_M_NOP &&
+            inst->alu.mul.magic_write &&
+            predicate(inst->alu.mul.waddr))
+                return true;
+
+        return false;
+}
+
+/* Validates a single instruction against the per-instruction hardware
+ * restrictions: r5 writers, SFU/r4 latency, and the one-peripheral rule.
+ */
+static void
+qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
+{
+        const struct v3d_qpu_instr *inst = &qinst->qpu;
+
+        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+                return;
+
+        /* LDVARY writes r5 two instructions later and LDUNIF writes
+         * r5 one instruction later, which is illegal to have
+         * together.
+         */
+        if (state->last && state->last->sig.ldvary && inst->sig.ldunif) {
+                fail_instr(state, "LDUNIF after a LDVARY");
+        }
+
+        /* Count this instruction's accesses per peripheral unit. */
+        int tmu_writes = 0;
+        int sfu_writes = 0;
+        int vpm_writes = 0;
+        int tlb_writes = 0;
+        int tsy_writes = 0;
+
+        if (inst->alu.add.op != V3D_QPU_A_NOP) {
+                if (inst->alu.add.magic_write) {
+                        if (v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr))
+                                tmu_writes++;
+                        if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr))
+                                sfu_writes++;
+                        if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr))
+                                vpm_writes++;
+                        if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr))
+                                tlb_writes++;
+                        if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr))
+                                tsy_writes++;
+                }
+        }
+
+        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
+                if (inst->alu.mul.magic_write) {
+                        if (v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr))
+                                tmu_writes++;
+                        if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr))
+                                sfu_writes++;
+                        if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr))
+                                vpm_writes++;
+                        if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr))
+                                tlb_writes++;
+                        if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr))
+                                tsy_writes++;
+                }
+        }
+
+        /* Silences the unused-function warning for the helper above. */
+        (void)qpu_magic_waddr_matches; /* XXX */
+
+        /* SFU r4 results come back two instructions later.  No doing
+         * r4 read/writes or other SFU lookups until it's done.
+         */
+        if (state->ip - state->last_sfu_write < 2) {
+                if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4))
+                        fail_instr(state, "R4 read too soon after SFU");
+
+                if (v3d_qpu_writes_r4(inst))
+                        fail_instr(state, "R4 write too soon after SFU");
+
+                if (sfu_writes)
+                        fail_instr(state, "SFU write too soon after SFU");
+        }
+
+        /* XXX: The docs say VPM can happen with the others, but the simulator
+         * disagrees.
+         */
+        if (tmu_writes +
+            sfu_writes +
+            vpm_writes +
+            tlb_writes +
+            tsy_writes +
+            inst->sig.ldtmu +
+            inst->sig.ldtlb +
+            inst->sig.ldvpm +
+            inst->sig.ldtlbu > 1) {
+                fail_instr(state,
+                           "Only one of [TMU, SFU, TSY, TLB read, VPM] allowed");
+        }
+
+        if (sfu_writes)
+                state->last_sfu_write = state->ip;
+}
+
+/* Validates each instruction of a block in order, tracking the previous
+ * instruction and the running IP.
+ */
+static void
+qpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block)
+{
+        vir_for_each_inst(qinst, block) {
+                qpu_validate_inst(state, qinst);
+
+                state->last = &qinst->qpu;
+                state->ip++;
+        }
+}
+
+/**
+ * Checks for the instruction restrictions from page 37 ("Summary of
+ * Instruction Restrictions").
+ */
+void
+qpu_validate(struct v3d_compile *c)
+{
+        /* We don't want to do validation in release builds, but we want to
+         * keep compiling the validation code to make sure it doesn't get
+         * broken.
+         */
+#ifndef DEBUG
+        /* The early return (rather than #ifdef-ing the body out) is
+         * deliberate, so the code below still gets compiled.
+         */
+        return;
+#endif
+
+        struct v3d_qpu_validate_state state = {
+                .c = c,
+                .last_sfu_write = -10,
+                .ip = 0,
+        };
+
+        vir_for_each_block(block, c) {
+                qpu_validate_block(&state, block);
+        }
+}
--- /dev/null
+/*
+ * Copyright © 2016 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* Allocates a zero-initialized compiler context owned by the caller. */
+struct v3d_compiler *
+v3d_compiler_init(void)
+{
+        /* rzalloc() takes a (ctx, type) pair; the original passed only the
+         * type, and allocated a struct v3d_compile while the function
+         * returns a struct v3d_compiler.  Allocate the type we return.
+         */
+        struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
+
+        return compiler;
+}
+
+/* Appends a packed QPU instruction to the compiled stream, doubling the
+ * backing array (minimum 16 entries) when it is full.
+ */
+void
+v3d_add_qpu_inst(struct v3d_compiler *c, uint64_t inst)
+{
+        if (c->qpu_inst_count >= c->qpu_inst_size) {
+                c->qpu_inst_size = MAX2(c->qpu_inst_size * 2, 16);
+                /* Grow to the new qpu_inst_size; the original passed an
+                 * unrelated field as the reralloc element count.
+                 */
+                c->qpu_insts = reralloc(c, c->qpu_insts, uint64_t,
+                                        c->qpu_inst_size);
+        }
+
+        c->qpu_insts[c->qpu_inst_count++] = inst;
+}
--- /dev/null
+/*
+ * Copyright © 2016 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef V3D_COMPILER_H
+#define V3D_COMPILER_H
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "util/macros.h"
+#include "common/v3d_debug.h"
+#include "compiler/nir/nir.h"
+#include "util/list.h"
+#include "util/u_math.h"
+
+#include "qpu/qpu_instr.h"
+#include "pipe/p_state.h"
+
+#define V3D_MAX_TEXTURE_SAMPLERS 32
+#define V3D_MAX_SAMPLES 4
+#define V3D_MAX_FS_INPUTS 64
+#define V3D_MAX_VS_INPUTS 64
+
+struct nir_builder;
+
+struct v3d_fs_inputs {
+        /**
+         * Array of the meanings of the VPM inputs this shader needs.
+         *
+         * It doesn't include those that aren't part of the VPM, like
+         * point/line coordinates.
+         */
+        struct v3d_varying_slot *input_slots;
+        /* Number of valid entries in input_slots. */
+        uint32_t num_inputs;
+};
+
+/* Register files a qreg may refer to, at various stages of compilation. */
+enum qfile {
+        /** An unused source or destination register. */
+        QFILE_NULL,
+
+        /** A physical register, such as the W coordinate payload. */
+        QFILE_REG,
+        /** One of the registers for fixed function interactions. */
+        QFILE_MAGIC,
+
+        /**
+         * A virtual register, that will be allocated to actual accumulator
+         * or physical registers later.
+         */
+        QFILE_TEMP,
+        QFILE_VARY,
+        QFILE_UNIF,
+        QFILE_TLB,
+        QFILE_TLBU,
+
+        /**
+         * VPM reads use this with an index value to say what part of the VPM
+         * is being read.
+         */
+        QFILE_VPM,
+
+        /**
+         * Stores an immediate value in the index field that will be used
+         * directly by qpu_load_imm().
+         */
+        QFILE_LOAD_IMM,
+
+        /**
+         * Stores an immediate value in the index field that can be turned
+         * into a small immediate field by qpu_encode_small_immediate().
+         */
+        QFILE_SMALL_IMM,
+};
+
+/**
+ * A reference to a QPU register or a virtual temp register.
+ *
+ * The interpretation of index depends on file: temp number, magic
+ * register, VPM offset, immediate bits, etc. (see enum qfile).
+ */
+struct qreg {
+ enum qfile file;
+ uint32_t index;
+};
+
+/** Convenience constructor for a qreg referencing @file at @index. */
+static inline struct qreg vir_reg(enum qfile file, uint32_t index)
+{
+        struct qreg reg = {
+                .file = file,
+                .index = index,
+        };
+        return reg;
+}
+
+/**
+ * A reference to an actual register at the QPU level, for register
+ * allocation.
+ */
+struct qpu_reg {
+ /* NOTE(review): when magic is set, index presumably names a
+ * fixed-function (magic waddr) register rather than a physical
+ * register number -- confirm in vir_to_qpu.c.
+ */
+ bool magic;
+ int index;
+};
+
+/** One VIR instruction: a wrapped QPU instruction plus virtual register
+ * references that register allocation will later substitute.
+ */
+struct qinst {
+ /** Entry in qblock->instructions */
+ struct list_head link;
+
+ /**
+ * The instruction being wrapped. Its condition codes, pack flags,
+ * signals, etc. will all be used, with just the register references
+ * being replaced by the contents of qinst->dst and qinst->src[].
+ */
+ struct v3d_qpu_instr qpu;
+
+ /* Pre-register-allocation references to src/dst registers */
+ struct qreg dst;
+ struct qreg src[3];
+ bool cond_is_exec_mask;
+ bool has_implicit_uniform;
+
+ /* After vir_to_qpu.c: If instr reads a uniform, which uniform from
+ * the uncompiled stream it is.
+ */
+ int uniform;
+};
+
+/** What each entry in the shader's uniform stream means. */
+enum quniform_contents {
+ /**
+ * Indicates that a constant 32-bit value is copied from the program's
+ * uniform contents.
+ */
+ QUNIFORM_CONSTANT,
+ /**
+ * Indicates that the program's uniform contents are used as an index
+ * into the GL uniform storage.
+ */
+ QUNIFORM_UNIFORM,
+
+ /** @{
+ * Scaling factors from clip coordinates to relative to the viewport
+ * center.
+ *
+ * This is used by the coordinate and vertex shaders to produce the
+ * 32-bit entry consisting of 2 16-bit fields with 12.4 signed fixed
+ * point offsets from the viewport center.
+ */
+ QUNIFORM_VIEWPORT_X_SCALE,
+ QUNIFORM_VIEWPORT_Y_SCALE,
+ /** @} */
+
+ QUNIFORM_VIEWPORT_Z_OFFSET,
+ QUNIFORM_VIEWPORT_Z_SCALE,
+
+ QUNIFORM_USER_CLIP_PLANE,
+
+ /**
+ * A reference to a texture config parameter 0 uniform.
+ *
+ * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
+ * defines texture type, miplevels, and such. It will be found as a
+ * parameter to the first QOP_TEX_[STRB] instruction in a sequence.
+ */
+ QUNIFORM_TEXTURE_CONFIG_P0_0,
+ QUNIFORM_TEXTURE_CONFIG_P0_1,
+ QUNIFORM_TEXTURE_CONFIG_P0_2,
+ QUNIFORM_TEXTURE_CONFIG_P0_3,
+ QUNIFORM_TEXTURE_CONFIG_P0_4,
+ QUNIFORM_TEXTURE_CONFIG_P0_5,
+ QUNIFORM_TEXTURE_CONFIG_P0_6,
+ QUNIFORM_TEXTURE_CONFIG_P0_7,
+ QUNIFORM_TEXTURE_CONFIG_P0_8,
+ QUNIFORM_TEXTURE_CONFIG_P0_9,
+ QUNIFORM_TEXTURE_CONFIG_P0_10,
+ QUNIFORM_TEXTURE_CONFIG_P0_11,
+ QUNIFORM_TEXTURE_CONFIG_P0_12,
+ QUNIFORM_TEXTURE_CONFIG_P0_13,
+ QUNIFORM_TEXTURE_CONFIG_P0_14,
+ QUNIFORM_TEXTURE_CONFIG_P0_15,
+ QUNIFORM_TEXTURE_CONFIG_P0_16,
+ QUNIFORM_TEXTURE_CONFIG_P0_17,
+ QUNIFORM_TEXTURE_CONFIG_P0_18,
+ QUNIFORM_TEXTURE_CONFIG_P0_19,
+ QUNIFORM_TEXTURE_CONFIG_P0_20,
+ QUNIFORM_TEXTURE_CONFIG_P0_21,
+ QUNIFORM_TEXTURE_CONFIG_P0_22,
+ QUNIFORM_TEXTURE_CONFIG_P0_23,
+ QUNIFORM_TEXTURE_CONFIG_P0_24,
+ QUNIFORM_TEXTURE_CONFIG_P0_25,
+ QUNIFORM_TEXTURE_CONFIG_P0_26,
+ QUNIFORM_TEXTURE_CONFIG_P0_27,
+ QUNIFORM_TEXTURE_CONFIG_P0_28,
+ QUNIFORM_TEXTURE_CONFIG_P0_29,
+ QUNIFORM_TEXTURE_CONFIG_P0_30,
+ QUNIFORM_TEXTURE_CONFIG_P0_31,
+ QUNIFORM_TEXTURE_CONFIG_P0_32,
+
+ /**
+ * A reference to a texture config parameter 1 uniform.
+ *
+ * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
+ * defines texture width, height, filters, and wrap modes. It will be
+ * found as a parameter to the second QOP_TEX_[STRB] instruction in a
+ * sequence.
+ */
+ QUNIFORM_TEXTURE_CONFIG_P1,
+
+ QUNIFORM_TEXTURE_FIRST_LEVEL,
+
+ QUNIFORM_TEXTURE_WIDTH,
+ QUNIFORM_TEXTURE_HEIGHT,
+ QUNIFORM_TEXTURE_DEPTH,
+ QUNIFORM_TEXTURE_ARRAY_SIZE,
+ QUNIFORM_TEXTURE_LEVELS,
+
+ QUNIFORM_TEXTURE_MSAA_ADDR,
+
+ QUNIFORM_UBO_ADDR,
+
+ QUNIFORM_TEXRECT_SCALE_X,
+ QUNIFORM_TEXRECT_SCALE_Y,
+
+ QUNIFORM_TEXTURE_BORDER_COLOR,
+
+ QUNIFORM_STENCIL,
+
+ QUNIFORM_ALPHA_REF,
+ QUNIFORM_SAMPLE_MASK,
+};
+
+/** A varying slot and component packed into one byte ((slot << 2) +
+ * component) -- use the v3d_slot_* helpers below to pack/unpack.
+ */
+struct v3d_varying_slot {
+ uint8_t slot_and_component;
+};
+
+/**
+ * Packs a varying (slot, component) pair into a v3d_varying_slot.
+ *
+ * The component occupies the low two bits, so the slot must fit in the
+ * remaining six bits of the uint8_t.
+ */
+static inline struct v3d_varying_slot
+v3d_slot_from_slot_and_component(uint8_t slot, uint8_t component)
+{
+        assert(slot < 255 / 4);
+
+        struct v3d_varying_slot packed = {
+                .slot_and_component = (slot << 2) + component,
+        };
+        return packed;
+}
+
+/** Extracts the slot number from a packed varying slot. */
+static inline uint8_t v3d_slot_get_slot(struct v3d_varying_slot slot)
+{
+        return slot.slot_and_component / 4;
+}
+
+/** Extracts the component (0..3) from a packed varying slot. */
+static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
+{
+        return slot.slot_and_component % 4;
+}
+
+/** Maps a range of the gallium uniforms (src_offset) to its location in
+ * the uploaded UBO (dst_offset).
+ */
+struct v3d_ubo_range {
+ /**
+ * offset in bytes from the start of the ubo where this range is
+ * uploaded.
+ *
+ * Only set once used is set.
+ */
+ uint32_t dst_offset;
+
+ /**
+ * offset in bytes from the start of the gallium uniforms where the
+ * data comes from.
+ */
+ uint32_t src_offset;
+
+ /** size in bytes of this ubo range */
+ uint32_t size;
+};
+
+/** Shader-variant key state common to the VS and FS keys. */
+struct v3d_key {
+ void *shader_state;
+ /* Per-sampler state baked into the compiled shader. */
+ struct {
+ uint8_t swizzle[4];
+ uint8_t return_size;
+ uint8_t return_channels;
+ union {
+ struct {
+ unsigned compare_mode:1;
+ unsigned compare_func:3;
+ unsigned wrap_s:3;
+ unsigned wrap_t:3;
+ };
+ struct {
+ uint16_t msaa_width, msaa_height;
+ };
+ };
+ } tex[V3D_MAX_TEXTURE_SAMPLERS];
+ /* Bitmask of enabled user clip planes. */
+ uint8_t ucp_enables;
+};
+
+/** Fragment shader variant key: rasterizer/blend state baked into the FS. */
+struct v3d_fs_key {
+ struct v3d_key base;
+ bool depth_enabled;
+ bool is_points;
+ bool is_lines;
+ bool alpha_test;
+ bool point_coord_upper_left;
+ bool light_twoside;
+ bool msaa;
+ bool sample_coverage;
+ bool sample_alpha_to_coverage;
+ bool sample_alpha_to_one;
+ bool clamp_color;
+ bool swap_color_rb;
+ uint8_t alpha_test_func;
+ uint8_t logicop_func;
+ uint32_t point_sprite_mask;
+
+ struct pipe_rt_blend_state blend;
+};
+
+/** Vertex shader variant key, including which varyings the paired FS reads. */
+struct v3d_vs_key {
+ struct v3d_key base;
+
+ /* Varyings consumed by the bound fragment shader, in FS input order. */
+ struct v3d_varying_slot fs_inputs[V3D_MAX_FS_INPUTS];
+ uint8_t num_fs_inputs;
+
+ bool is_coord;
+ bool per_vertex_point_size;
+ bool clamp_color;
+};
+
+/** A basic block of VIR instructions. */
+struct qblock {
+ struct list_head link;
+
+ struct list_head instructions;
+
+ struct set *predecessors;
+ struct qblock *successors[2];
+
+ int index;
+
+ /* Instruction IPs for the first and last instruction of the block.
+ * Set by qpu_schedule.c.
+ */
+ uint32_t start_qpu_ip;
+ uint32_t end_qpu_ip;
+
+ /* Instruction IP for the branch instruction of the block. Set by
+ * qpu_schedule.c.
+ */
+ uint32_t branch_qpu_ip;
+
+ /** Offset within the uniform stream at the start of the block. */
+ uint32_t start_uniform;
+ /** Offset within the uniform stream of the branch instruction */
+ uint32_t branch_uniform;
+
+ /** @{ used by v3d_vir_live_variables.c */
+ BITSET_WORD *def;
+ BITSET_WORD *use;
+ BITSET_WORD *live_in;
+ BITSET_WORD *live_out;
+ int start_ip, end_ip;
+ /** @} */
+};
+
+/**
+ * Compiler state saved across compiler invocations, for any expensive global
+ * setup.
+ */
+struct v3d_compiler {
+ const struct v3d_device_info *devinfo;
+ /* Register sets built once by vir_init_reg_sets(). */
+ struct ra_regs *regs;
+ /* NOTE(review): three RA classes -- presumably different
+ * accumulator/physical-register subsets; confirm against
+ * vir_register_allocate.c.
+ */
+ unsigned int reg_class[3];
+};
+
+/**
+ * Per-compile state for a single shader, owned for the duration of one
+ * v3d_compile_vs()/v3d_compile_fs() invocation.
+ */
+struct v3d_compile {
+ const struct v3d_device_info *devinfo;
+ nir_shader *s;
+ nir_function_impl *impl;
+ struct exec_list *cf_node_list;
+ const struct v3d_compiler *compiler;
+
+ /**
+ * Mapping from nir_register * or nir_ssa_def * to array of struct
+ * qreg for the values.
+ */
+ struct hash_table *def_ht;
+
+ /* For each temp, the instruction generating its value. */
+ struct qinst **defs;
+ uint32_t defs_array_size;
+
+ /**
+ * Inputs to the shader, arranged by TGSI declaration order.
+ *
+ * Not all fragment shader QFILE_VARY reads are present in this array.
+ */
+ struct qreg *inputs;
+ struct qreg *outputs;
+ bool msaa_per_sample_output;
+ struct qreg color_reads[V3D_MAX_SAMPLES];
+ struct qreg sample_colors[V3D_MAX_SAMPLES];
+ uint32_t inputs_array_size;
+ uint32_t outputs_array_size;
+ uint32_t uniforms_array_size;
+
+ /* Booleans for whether the corresponding QFILE_VARY[i] is
+ * flat-shaded. This doesn't count gl_FragColor flat-shading, which is
+ * controlled by shader->color_inputs and rasterizer->flatshade in the
+ * gallium driver.
+ */
+ BITSET_WORD flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
+
+ struct v3d_ubo_range *ubo_ranges;
+ bool *ubo_range_used;
+ uint32_t ubo_ranges_array_size;
+ /** Number of uniform areas tracked in ubo_ranges. */
+ uint32_t num_ubo_ranges;
+ uint32_t next_ubo_dst_offset;
+
+ /* State for whether we're executing on each channel currently. 0 if
+ * yes, otherwise a block number + 1 that the channel jumped to.
+ */
+ struct qreg execute;
+
+ struct qreg line_x, point_x, point_y;
+
+ /**
+ * Instance ID, which comes in before the vertex attribute payload if
+ * the shader record requests it.
+ */
+ struct qreg iid;
+
+ /**
+ * Vertex ID, which comes in before the vertex attribute payload
+ * (after Instance ID) if the shader record requests it.
+ */
+ struct qreg vid;
+
+ /* Fragment shader payload regs. */
+ struct qreg payload_w, payload_w_centroid, payload_z;
+
+ /** boolean (~0 -> true) if the fragment has been discarded. */
+ struct qreg discard;
+
+ uint8_t vattr_sizes[V3D_MAX_VS_INPUTS];
+ uint32_t num_vpm_writes;
+
+ /**
+ * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
+ *
+ * This includes those that aren't part of the VPM varyings, like
+ * point/line coordinates.
+ */
+ struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];
+
+ /**
+ * An entry per outputs[] in the VS indicating what the VARYING_SLOT_*
+ * of the output is. Used to emit from the VS in the order that the
+ * FS needs.
+ */
+ struct v3d_varying_slot *output_slots;
+
+ struct pipe_shader_state *shader_state;
+ struct v3d_key *key;
+ struct v3d_fs_key *fs_key;
+ struct v3d_vs_key *vs_key;
+
+ /* Live ranges of temps. */
+ int *temp_start, *temp_end;
+
+ uint32_t *uniform_data;
+ enum quniform_contents *uniform_contents;
+ uint32_t uniform_array_size;
+ uint32_t num_uniforms;
+ uint32_t num_outputs;
+ uint32_t output_position_index;
+ nir_variable *output_color_var;
+ uint32_t output_point_size_index;
+ uint32_t output_sample_mask_index;
+
+ struct qreg undef;
+ uint32_t num_temps;
+
+ struct list_head blocks;
+ int next_block_index;
+ struct qblock *cur_block;
+ struct qblock *loop_cont_block;
+ struct qblock *loop_break_block;
+
+ uint64_t *qpu_insts;
+ uint32_t qpu_inst_count;
+ uint32_t qpu_inst_size;
+
+ /* For the FS, the number of varying inputs not counting the
+ * point/line varyings payload
+ */
+ uint32_t num_inputs;
+
+ /**
+ * Number of inputs from num_inputs remaining to be queued to the read
+ * FIFO in the VS/CS.
+ */
+ uint32_t num_inputs_remaining;
+
+ /* Number of inputs currently in the read FIFO for the VS/CS */
+ uint32_t num_inputs_in_fifo;
+
+ /** Next offset in the VPM to read from in the VS/CS */
+ uint32_t vpm_read_offset;
+
+ uint32_t program_id;
+ uint32_t variant_id;
+
+ /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH
+ * is used to hide texturing latency at the cost of limiting ourselves
+ * to the bottom half of physical reg space.
+ */
+ bool fs_threaded;
+
+ bool last_thrsw_at_top_level;
+
+ bool failed;
+};
+
+/** Flattened uniform stream: parallel contents/data arrays of length count. */
+struct v3d_uniform_list {
+ enum quniform_contents *contents;
+ uint32_t *data;
+ uint32_t count;
+};
+
+/** Metadata common to compiled VS and FS programs, returned to the driver. */
+struct v3d_prog_data {
+ struct v3d_uniform_list uniforms;
+
+ struct v3d_ubo_range *ubo_ranges;
+ uint32_t num_ubo_ranges;
+ uint32_t ubo_size;
+
+ uint8_t num_inputs;
+
+};
+
+/** Compiled-VS metadata handed back to the driver alongside the code. */
+struct v3d_vs_prog_data {
+        struct v3d_prog_data base;
+
+        /* Whether the shader record needs Instance ID / Vertex ID in the
+         * payload.
+         */
+        bool uses_iid, uses_vid;
+
+        /* Number of components read from each vertex attribute.
+         *
+         * Sized to match v3d_compile::vattr_sizes (V3D_MAX_VS_INPUTS == 64
+         * entries); the previous hard-coded [32] would overflow when the
+         * compile-time array is copied out in full.
+         */
+        uint8_t vattr_sizes[V3D_MAX_VS_INPUTS];
+
+        /* Total number of components read, for the shader state record. */
+        uint32_t vpm_input_size;
+
+        /* Total number of components written, for the shader state record. */
+        uint32_t vpm_output_size;
+};
+
+/** Compiled-FS metadata handed back to the driver alongside the code. */
+struct v3d_fs_prog_data {
+ struct v3d_prog_data base;
+
+ struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];
+
+ /** bitmask of which inputs are color inputs, for flat shade handling. */
+ uint32_t color_inputs[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
+
+ /* Bitmask for whether the corresponding input is flat-shaded,
+ * independent of rasterizer (gl_FragColor) flat-shading.
+ */
+ BITSET_WORD flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
+
+ bool writes_z;
+};
+
+/* Special nir_load_input intrinsic index for loading the current TLB
+ * destination color.
+ */
+#define V3D_NIR_TLB_COLOR_READ_INPUT 2000000000
+
+/* Shares the 2000000000 value with the TLB color read above, but
+ * presumably lives in the output-index namespace (store_output vs
+ * load_input), so the two shouldn't collide -- NOTE(review): confirm.
+ */
+#define V3D_NIR_MS_MASK_OUTPUT 2000000000
+
+extern const nir_shader_compiler_options v3d_nir_options;
+
+const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo);
+void v3d_compiler_free(const struct v3d_compiler *compiler);
+void v3d_optimize_nir(struct nir_shader *s);
+
+uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler,
+ struct v3d_vs_key *key,
+ struct v3d_vs_prog_data *prog_data,
+ nir_shader *s,
+ int program_id, int variant_id,
+ uint32_t *final_assembly_size);
+
+uint64_t *v3d_compile_fs(const struct v3d_compiler *compiler,
+ struct v3d_fs_key *key,
+ struct v3d_fs_prog_data *prog_data,
+ nir_shader *s,
+ int program_id, int variant_id,
+ uint32_t *final_assembly_size);
+
+void v3d_nir_to_vir(struct v3d_compile *c);
+
+void vir_compile_destroy(struct v3d_compile *c);
+const char *vir_get_stage_name(struct v3d_compile *c);
+struct qblock *vir_new_block(struct v3d_compile *c);
+void vir_set_emit_block(struct v3d_compile *c, struct qblock *block);
+void vir_link_blocks(struct qblock *predecessor, struct qblock *successor);
+struct qblock *vir_entry_block(struct v3d_compile *c);
+struct qblock *vir_exit_block(struct v3d_compile *c);
+struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst,
+ struct qreg src0, struct qreg src1);
+struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst,
+ struct qreg src0, struct qreg src1);
+struct qinst *vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src0);
+void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst);
+struct qreg vir_uniform(struct v3d_compile *c,
+ enum quniform_contents contents,
+ uint32_t data);
+void vir_schedule_instructions(struct v3d_compile *c);
+struct v3d_qpu_instr v3d_qpu_nop(void);
+
+struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst);
+struct qinst *vir_emit_nondef(struct v3d_compile *c, struct qinst *inst);
+void vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond);
+void vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf);
+void vir_set_unpack(struct qinst *inst, int src,
+ enum v3d_qpu_input_unpack unpack);
+
+struct qreg vir_get_temp(struct v3d_compile *c);
+void vir_calculate_live_intervals(struct v3d_compile *c);
+bool vir_has_implicit_uniform(struct qinst *inst);
+int vir_get_implicit_uniform_src(struct qinst *inst);
+int vir_get_non_sideband_nsrc(struct qinst *inst);
+int vir_get_nsrc(struct qinst *inst);
+bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst);
+bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op);
+bool vir_get_mul_op(struct qinst *inst, enum v3d_qpu_mul_op *op);
+bool vir_is_raw_mov(struct qinst *inst);
+bool vir_is_tex(struct qinst *inst);
+bool vir_is_add(struct qinst *inst);
+bool vir_is_mul(struct qinst *inst);
+bool vir_is_float_input(struct qinst *inst);
+bool vir_depends_on_flags(struct qinst *inst);
+bool vir_writes_r3(struct qinst *inst);
+bool vir_writes_r4(struct qinst *inst);
+struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
+uint8_t vir_channels_written(struct qinst *inst);
+
+void vir_dump(struct v3d_compile *c);
+void vir_dump_inst(struct v3d_compile *c, struct qinst *inst);
+
+void vir_validate(struct v3d_compile *c);
+
+void vir_optimize(struct v3d_compile *c);
+bool vir_opt_algebraic(struct v3d_compile *c);
+bool vir_opt_constant_folding(struct v3d_compile *c);
+bool vir_opt_copy_propagate(struct v3d_compile *c);
+bool vir_opt_dead_code(struct v3d_compile *c);
+bool vir_opt_peephole_sf(struct v3d_compile *c);
+bool vir_opt_small_immediates(struct v3d_compile *c);
+bool vir_opt_vpm(struct v3d_compile *c);
+void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
+void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
+void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
+void vir_lower_uniforms(struct v3d_compile *c);
+
+void v3d_vir_to_qpu(struct v3d_compile *c);
+uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
+void qpu_validate(struct v3d_compile *c);
+struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
+bool vir_init_reg_sets(struct v3d_compiler *compiler);
+
+void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf);
+
+/** Returns whether @contents is one of the per-sampler P0 config uniforms. */
+static inline bool
+quniform_contents_is_texture_p0(enum quniform_contents contents)
+{
+        int p0_index = (int)contents - (int)QUNIFORM_TEXTURE_CONFIG_P0_0;
+
+        return p0_index >= 0 && p0_index < V3D_MAX_TEXTURE_SAMPLERS;
+}
+
+/** Emits a uniform load of the 32-bit constant @ui. */
+static inline struct qreg
+vir_uniform_ui(struct v3d_compile *c, uint32_t ui)
+{
+ return vir_uniform(c, QUNIFORM_CONSTANT, ui);
+}
+
+/** Emits a uniform load of the float constant @f (as its bit pattern). */
+static inline struct qreg
+vir_uniform_f(struct v3d_compile *c, float f)
+{
+ return vir_uniform(c, QUNIFORM_CONSTANT, fui(f));
+}
+
+/* VIR_ALU0: declares vir_<name>() (allocating a new temp dest) and
+ * vir_<name>_dest() (writing a caller-provided dest) for 0-source ops.
+ */
+#define VIR_ALU0(name, vir_inst, op) \
+static inline struct qreg \
+vir_##name(struct v3d_compile *c) \
+{ \
+ return vir_emit_def(c, vir_inst(op, c->undef, \
+ c->undef, c->undef)); \
+} \
+static inline struct qinst * \
+vir_##name##_dest(struct v3d_compile *c, struct qreg dest) \
+{ \
+ return vir_emit_nondef(c, vir_inst(op, dest, \
+ c->undef, c->undef)); \
+}
+
+/* Same as VIR_ALU0, for 1-source ops. */
+#define VIR_ALU1(name, vir_inst, op) \
+static inline struct qreg \
+vir_##name(struct v3d_compile *c, struct qreg a) \
+{ \
+ return vir_emit_def(c, vir_inst(op, c->undef, \
+ a, c->undef)); \
+} \
+static inline struct qinst * \
+vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \
+ struct qreg a) \
+{ \
+ return vir_emit_nondef(c, vir_inst(op, dest, a, \
+ c->undef)); \
+}
+
+/* Same as VIR_ALU0, for 2-source ops. */
+#define VIR_ALU2(name, vir_inst, op) \
+static inline struct qreg \
+vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \
+{ \
+ return vir_emit_def(c, vir_inst(op, c->undef, a, b)); \
+} \
+static inline struct qinst * \
+vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \
+ struct qreg a, struct qreg b) \
+{ \
+ return vir_emit_nondef(c, vir_inst(op, dest, a, b)); \
+}
+
+/* Emit helpers for ops with no register destination, 1 and 2 sources. */
+#define VIR_NODST_1(name, vir_inst, op) \
+static inline struct qinst * \
+vir_##name(struct v3d_compile *c, struct qreg a) \
+{ \
+ return vir_emit_nondef(c, vir_inst(op, c->undef, \
+ a, c->undef)); \
+}
+
+#define VIR_NODST_2(name, vir_inst, op) \
+static inline struct qinst * \
+vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \
+{ \
+ return vir_emit_nondef(c, vir_inst(op, c->undef, \
+ a, b)); \
+}
+
+/* A/M variants pick the add or mul unit and its opcode namespace. */
+#define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name)
+#define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name)
+#define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name)
+#define VIR_M_ALU1(name) VIR_ALU1(name, vir_mul_inst, V3D_QPU_M_##name)
+#define VIR_A_ALU0(name) VIR_ALU0(name, vir_add_inst, V3D_QPU_A_##name)
+#define VIR_M_ALU0(name) VIR_ALU0(name, vir_mul_inst, V3D_QPU_M_##name)
+#define VIR_A_NODST_2(name) VIR_NODST_2(name, vir_add_inst, V3D_QPU_A_##name)
+#define VIR_M_NODST_2(name) VIR_NODST_2(name, vir_mul_inst, V3D_QPU_M_##name)
+#define VIR_A_NODST_1(name) VIR_NODST_1(name, vir_add_inst, V3D_QPU_A_##name)
+#define VIR_M_NODST_1(name) VIR_NODST_1(name, vir_mul_inst, V3D_QPU_M_##name)
+
+/* Instantiate vir_<OP>()/vir_<OP>_dest() emit helpers for each add-unit
+ * (VIR_A_*) and mul-unit (VIR_M_*) opcode used by the compiler.
+ */
+VIR_A_ALU2(FADD)
+VIR_A_ALU2(VFPACK)
+VIR_A_ALU2(FSUB)
+VIR_A_ALU2(FMIN)
+VIR_A_ALU2(FMAX)
+
+VIR_A_ALU2(ADD)
+VIR_A_ALU2(SUB)
+VIR_A_ALU2(SHL)
+VIR_A_ALU2(SHR)
+VIR_A_ALU2(ASR)
+VIR_A_ALU2(ROR)
+VIR_A_ALU2(MIN)
+VIR_A_ALU2(MAX)
+VIR_A_ALU2(UMIN)
+VIR_A_ALU2(UMAX)
+VIR_A_ALU2(AND)
+VIR_A_ALU2(OR)
+VIR_A_ALU2(XOR)
+VIR_A_ALU2(VADD)
+VIR_A_ALU2(VSUB)
+VIR_A_ALU1(NOT)
+VIR_A_ALU1(NEG)
+VIR_A_ALU1(FLAPUSH)
+VIR_A_ALU1(FLBPUSH)
+VIR_A_ALU1(FLBPOP)
+VIR_A_ALU1(SETMSF)
+VIR_A_ALU1(SETREVF)
+VIR_A_ALU1(TIDX)
+VIR_A_ALU1(EIDX)
+
+VIR_A_ALU0(FXCD)
+VIR_A_ALU0(XCD)
+VIR_A_ALU0(FYCD)
+VIR_A_ALU0(YCD)
+VIR_A_ALU0(MSF)
+VIR_A_ALU0(REVF)
+VIR_A_NODST_1(VPMSETUP)
+VIR_A_ALU2(FCMP)
+VIR_A_ALU2(VFMAX)
+
+VIR_A_ALU1(FROUND)
+VIR_A_ALU1(FTOIN)
+VIR_A_ALU1(FTRUNC)
+VIR_A_ALU1(FTOIZ)
+VIR_A_ALU1(FFLOOR)
+VIR_A_ALU1(FTOUZ)
+VIR_A_ALU1(FCEIL)
+VIR_A_ALU1(FTOC)
+
+VIR_A_ALU1(FDX)
+VIR_A_ALU1(FDY)
+
+VIR_A_ALU1(ITOF)
+VIR_A_ALU1(CLZ)
+VIR_A_ALU1(UTOF)
+
+VIR_M_ALU2(UMUL24)
+VIR_M_ALU2(FMUL)
+VIR_M_ALU2(SMUL24)
+VIR_M_NODST_2(MULTOP)
+
+VIR_M_ALU1(MOV)
+VIR_M_ALU1(FMOV)
+
+/** Emits a MOV of @src into @dest predicated on @cond. */
+static inline struct qinst *
+vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
+             struct qreg dest, struct qreg src)
+{
+        struct qinst *inst = vir_MOV_dest(c, dest, src);
+
+        vir_set_cond(inst, cond);
+        return inst;
+}
+
+/**
+ * Emits a conditional select: returns a temp holding @src0 in the channels
+ * where @cond passes and @src1 elsewhere.
+ */
+static inline struct qreg
+vir_SEL(struct v3d_compile *c, enum v3d_qpu_cond cond,
+        struct qreg src0, struct qreg src1)
+{
+        struct qreg result = vir_get_temp(c);
+
+        /* Unconditionally load the "else" value, then overwrite the
+         * channels where the condition passes.
+         */
+        vir_MOV_dest(c, result, src1);
+        vir_MOV_cond(c, cond, result, src0);
+
+        return result;
+}
+
+/** Emits a write of @val to the VPM magic register. */
+static inline void
+vir_VPM_WRITE(struct v3d_compile *c, struct qreg val)
+{
+ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
+}
+
+/** Emits an add-unit NOP (no sources, no destination). */
+static inline struct qinst *
+vir_NOP(struct v3d_compile *c)
+{
+ return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_NOP,
+ c->undef, c->undef, c->undef));
+}
+/*
+static inline struct qreg
+vir_LOAD_IMM(struct v3d_compile *c, uint32_t val)
+{
+ return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef,
+ vir_reg(QFILE_LOAD_IMM, val), c->undef));
+}
+
+static inline struct qreg
+vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val)
+{
+ return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef,
+ vir_reg(QFILE_LOAD_IMM, val),
+ c->undef));
+}
+static inline struct qreg
+vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val)
+{
+ return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef,
+ vir_reg(QFILE_LOAD_IMM, val),
+ c->undef));
+}
+*/
+
+/** Emits a branch instruction predicated on @cond. */
+static inline struct qinst *
+vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_cond cond)
+{
+ /* The actual uniform_data value will be set at scheduling time */
+ return vir_emit_nondef(c, vir_branch_inst(cond, vir_uniform_ui(c, 0)));
+}
+
+/* Iterate the CFG's basic blocks in (reverse) emission order. */
+#define vir_for_each_block(block, c) \
+ list_for_each_entry(struct qblock, block, &c->blocks, link)
+
+#define vir_for_each_block_rev(block, c) \
+ list_for_each_entry_rev(struct qblock, block, &c->blocks, link)
+
+/* Loop over the non-NULL members of the successors array. */
+#define vir_for_each_successor(succ, block) \
+ for (struct qblock *succ = block->successors[0]; \
+ succ != NULL; \
+ succ = (succ == block->successors[1] ? NULL : \
+ block->successors[1]))
+
+/* Iterate a single block's instructions; the _safe variant allows the
+ * current instruction to be removed during iteration.
+ */
+#define vir_for_each_inst(inst, block) \
+ list_for_each_entry(struct qinst, inst, &block->instructions, link)
+
+#define vir_for_each_inst_rev(inst, block) \
+ list_for_each_entry_rev(struct qinst, inst, &block->instructions, link)
+
+#define vir_for_each_inst_safe(inst, block) \
+ list_for_each_entry_safe(struct qinst, inst, &block->instructions, link)
+
+/* Iterate every instruction in the program, block by block. */
+#define vir_for_each_inst_inorder(inst, c) \
+ vir_for_each_block(_block, c) \
+ vir_for_each_inst(inst, _block)
+
+#endif /* V3D_COMPILER_H */
--- /dev/null
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/v3d_compiler.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io
+ * intrinsics into something amenable to the V3D architecture.
+ *
+ * Currently, it splits VS inputs and uniforms into scalars, drops any
+ * non-position outputs in coordinate shaders, and fixes up the addressing on
+ * indirect uniform loads. FS input and VS output scalarization is handled by
+ * nir_lower_io_to_scalar().
+ */
+
+/*
+ * Replaces @intr with a vecN combining the per-component SSA defs in
+ * @comps, rewriting all uses of the intrinsic's destination.
+ */
+static void
+replace_intrinsic_with_vec(nir_builder *b, nir_intrinsic_instr *intr,
+                           nir_ssa_def **comps)
+{
+        /* Batch things back together into a vector.  This will get split
+         * by the later ALU scalarization pass.
+         */
+        nir_ssa_def *combined = nir_vec(b, comps, intr->num_components);
+
+        /* Point all users of the old intrinsic at the reconstructed
+         * vector, then drop the intrinsic.
+         */
+        nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(combined));
+        nir_instr_remove(&intr->instr);
+}
+
+/*
+ * For VS compiles, removes stores to outputs that the paired fragment
+ * shader never reads (position and point size are always kept).
+ */
+static void
+v3d_nir_lower_output(struct v3d_compile *c, nir_builder *b,
+                     nir_intrinsic_instr *intr)
+{
+        nir_variable *output_var = NULL;
+        nir_foreach_variable(var, &c->s->outputs) {
+                if (var->data.driver_location == nir_intrinsic_base(intr)) {
+                        output_var = var;
+                        break;
+                }
+        }
+        assert(output_var);
+
+        /* Only VS compiles know the consuming FS's inputs. */
+        if (!c->vs_key)
+                return;
+
+        int slot = output_var->data.location;
+
+        /* Position and point size are consumed by fixed function. */
+        if (slot == VARYING_SLOT_POS || slot == VARYING_SLOT_PSIZ)
+                return;
+
+        for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
+                if (v3d_slot_get_slot(c->vs_key->fs_inputs[i]) == slot)
+                        return;
+        }
+
+        /* No FS input reads this slot, so the store is dead. */
+        nir_instr_remove(&intr->instr);
+}
+
+/*
+ * Splits a vector load_uniform into single-component scalar loads, and
+ * rescales its offsets from vec4 units to bytes.
+ */
+static void
+v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b,
+ nir_intrinsic_instr *intr)
+{
+ b->cursor = nir_before_instr(&intr->instr);
+
+ /* Generate scalar loads equivalent to the original vector. */
+ nir_ssa_def *dests[4];
+ for (unsigned i = 0; i < intr->num_components; i++) {
+ nir_intrinsic_instr *intr_comp =
+ nir_intrinsic_instr_create(c->s, intr->intrinsic);
+ intr_comp->num_components = 1;
+ nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL);
+
+ /* Rebase to the byte offset of this scalar component
+ * (the incoming base is in vec4 units, given the *16).
+ */
+ nir_intrinsic_set_base(intr_comp,
+ nir_intrinsic_base(intr) * 16 +
+ i * 4);
+
+ /* Convert the indirect uniform offset to bytes. If it happens
+ * to be a constant, constant-folding will clean up
+ * the shift for us.
+ */
+ intr_comp->src[0] =
+ nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
+ nir_imm_int(b, 4)));
+
+ dests[i] = &intr_comp->dest.ssa;
+
+ nir_builder_instr_insert(b, &intr_comp->instr);
+ }
+
+ replace_intrinsic_with_vec(b, intr, dests);
+}
+
+/* Dispatches one NIR instruction to the matching io lowering, if any. */
+static void
+v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
+                       struct nir_instr *instr)
+{
+        if (instr->type != nir_instr_type_intrinsic)
+                return;
+
+        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+        switch (intr->intrinsic) {
+        case nir_intrinsic_store_output:
+                v3d_nir_lower_output(c, b, intr);
+                break;
+
+        case nir_intrinsic_load_uniform:
+                v3d_nir_lower_uniform(c, b, intr);
+                break;
+
+        default:
+                /* load_input, load_user_clip_plane, and everything else
+                 * pass through unchanged.
+                 */
+                break;
+        }
+}
+
+/* Runs the io lowering over every instruction of one function impl. */
+static bool
+v3d_nir_lower_io_impl(struct v3d_compile *c, nir_function_impl *impl)
+{
+        nir_builder builder;
+        nir_builder_init(&builder, impl);
+
+        nir_foreach_block(block, impl) {
+                nir_foreach_instr_safe(instr, block)
+                        v3d_nir_lower_io_instr(c, &builder, instr);
+        }
+
+        /* Instructions are only removed/inserted, so the CFG is intact. */
+        nir_metadata_preserve(impl, nir_metadata_block_index |
+                              nir_metadata_dominance);
+
+        return true;
+}
+
+/* Entry point: lowers io intrinsics in every function of @s. */
+void
+v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
+{
+        nir_foreach_function(function, s) {
+                if (!function->impl)
+                        continue;
+
+                v3d_nir_lower_io_impl(c, function->impl);
+        }
+}
--- /dev/null
+/*
+ * Copyright © 2016-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3d_compiler.h"
+
+/*
+ * Returns how many register sources @inst reads, not counting any
+ * implicit sideband uniform.
+ */
+int
+vir_get_non_sideband_nsrc(struct qinst *inst)
+{
+        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
+                return 0;
+
+        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
+                /* A VIR instruction occupies one of the two units; the
+                 * other is a NOP.
+                 */
+                if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
+                        return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
+                return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
+        }
+
+        return 0;
+}
+
+/* Total source count, including the sideband uniform slot when present. */
+int
+vir_get_nsrc(struct qinst *inst)
+{
+        return vir_get_non_sideband_nsrc(inst) +
+               (vir_has_implicit_uniform(inst) ? 1 : 0);
+}
+
+/*
+ * Returns whether @inst consumes an implicit uniform from the stream:
+ * branches always do, as do TLBU writes and anything explicitly flagged.
+ */
+bool
+vir_has_implicit_uniform(struct qinst *inst)
+{
+        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
+                return true;
+
+        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
+                if (inst->dst.file == QFILE_TLBU)
+                        return true;
+                return inst->has_implicit_uniform;
+        }
+
+        return false;
+}
+
+/* The sideband uniform for textures gets stored after the normal ALU
+ * arguments.
+ *
+ * Returns the src[] index holding it (callers must only use this on
+ * instructions where vir_has_implicit_uniform() is true).
+ */
+int
+vir_get_implicit_uniform_src(struct qinst *inst)
+{
+ return vir_get_nsrc(inst) - 1;
+}
+
+/**
+ * Returns whether the instruction has any side effects that must be
+ * preserved.
+ */
+bool
+vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
+{
+ switch (inst->qpu.type) {
+ case V3D_QPU_INSTR_TYPE_BRANCH:
+ return true;
+ case V3D_QPU_INSTR_TYPE_ALU:
+ switch (inst->qpu.alu.add.op) {
+ case V3D_QPU_A_SETREVF:
+ case V3D_QPU_A_SETMSF:
+ case V3D_QPU_A_VPMSETUP:
+ return true;
+ default:
+ break;
+ }
+
+ switch (inst->qpu.alu.mul.op) {
+ case V3D_QPU_M_MULTOP:
+ return true;
+ default:
+ break;
+ }
+ }
+
+ /* NOTE(review): ldtmu presumably pops a TMU response, so the
+ * instruction can't be eliminated even if its result is unused --
+ * confirm against the QPU docs.
+ */
+ if (inst->qpu.sig.ldtmu)
+ return true;
+
+ return false;
+}
+
+/** Returns whether @inst's opcode consumes floating-point inputs. */
+bool
+vir_is_float_input(struct qinst *inst)
+{
+ /* XXX: More instrs */
+ switch (inst->qpu.type) {
+ case V3D_QPU_INSTR_TYPE_BRANCH:
+ return false;
+ case V3D_QPU_INSTR_TYPE_ALU:
+ switch (inst->qpu.alu.add.op) {
+ case V3D_QPU_A_FADD:
+ case V3D_QPU_A_FSUB:
+ case V3D_QPU_A_FMIN:
+ case V3D_QPU_A_FMAX:
+ case V3D_QPU_A_FTOIN:
+ return true;
+ default:
+ break;
+ }
+
+ switch (inst->qpu.alu.mul.op) {
+ case V3D_QPU_M_FMOV:
+ case V3D_QPU_M_VFMUL:
+ case V3D_QPU_M_FMUL:
+ return true;
+ default:
+ break;
+ }
+ }
+
+ return false;
+}
+
+/**
+ * Returns whether @inst is a MOV/FMOV that copies its source bits
+ * unmodified: no output pack, no input unpack, and no condition.
+ */
+bool
+vir_is_raw_mov(struct qinst *inst)
+{
+        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+            (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
+             inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
+                return false;
+        }
+
+        if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
+            inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
+                return false;
+        }
+
+        /* An input unpack (e.g. abs/negate or f16 lane selection, set via
+         * vir_set_unpack()) modifies the value, so such a MOV isn't a raw
+         * copy.  Previously this wasn't checked, so optimization passes
+         * could treat a modified value as an exact copy of its source.
+         */
+        if (inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
+            inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
+                return false;
+        }
+
+        if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
+            inst->qpu.flags.mc != V3D_QPU_COND_NONE)
+                return false;
+
+        return true;
+}
+
+bool
+vir_is_add(struct qinst *inst)
+{
+ /* True when the instruction is an ALU op doing work on the add channel. */
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+ return inst->qpu.alu.add.op != V3D_QPU_A_NOP;
+}
+
+bool
+vir_is_mul(struct qinst *inst)
+{
+ /* True when the instruction is an ALU op doing work on the mul channel. */
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+ return inst->qpu.alu.mul.op != V3D_QPU_M_NOP;
+}
+
+bool
+vir_is_tex(struct qinst *inst)
+{
+ /* A texture operation is recognized as a write to a TMU magic
+ * register address.
+ */
+ if (inst->dst.file != QFILE_MAGIC)
+ return false;
+
+ return v3d_qpu_magic_waddr_is_tmu(inst->dst.index);
+}
+
+bool
+vir_depends_on_flags(struct qinst *inst)
+{
+ /* Returns whether executing this instruction reads the condition
+ * flags, so flag-writing instructions before it must be preserved.
+ */
+ if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
+ return (inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS);
+ } else {
+ /* The instruction is predicated if *either* the add or the
+ * mul channel carries a condition. The previous "&&"
+ * required both to be set, which missed every instruction
+ * conditional on only one channel (the common case).
+ */
+ return (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
+ inst->qpu.flags.mc != V3D_QPU_COND_NONE);
+ }
+}
+
+bool
+vir_writes_r3(struct qinst *inst)
+{
+ /* Reading a varying or the VPM implicitly lands data in accumulator
+ * r3, so instructions with such sources clobber it.
+ * NOTE(review): assumed from the register-allocation usage of this
+ * helper -- confirm against the QPU register spec.
+ */
+ for (int i = 0; i < vir_get_nsrc(inst); i++) {
+ switch (inst->src[i].file) {
+ case QFILE_VARY:
+ case QFILE_VPM:
+ return true;
+ default:
+ break;
+ }
+ }
+
+ return false;
+}
+
+bool
+vir_writes_r4(struct qinst *inst)
+{
+ /* SFU operations (recip/rsqrt/exp/log/sin) return their result in
+ * accumulator r4, as does a TMU read (ldtmu).
+ */
+ switch (inst->dst.file) {
+ case QFILE_MAGIC:
+ switch (inst->dst.index) {
+ case V3D_QPU_WADDR_RECIP:
+ case V3D_QPU_WADDR_RSQRT:
+ case V3D_QPU_WADDR_EXP:
+ case V3D_QPU_WADDR_LOG:
+ case V3D_QPU_WADDR_SIN:
+ return true;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (inst->qpu.sig.ldtmu)
+ return true;
+
+ return false;
+}
+
+/* Sets the input unpack mode for source 0 or 1, routing it to whichever
+ * ALU channel (add or mul) this instruction executes on.
+ */
+void
+vir_set_unpack(struct qinst *inst, int src,
+ enum v3d_qpu_input_unpack unpack)
+{
+ assert(src == 0 || src == 1);
+
+ if (vir_is_add(inst)) {
+ if (src == 0)
+ inst->qpu.alu.add.a_unpack = unpack;
+ else
+ inst->qpu.alu.add.b_unpack = unpack;
+ } else {
+ assert(vir_is_mul(inst));
+ if (src == 0)
+ inst->qpu.alu.mul.a_unpack = unpack;
+ else
+ inst->qpu.alu.mul.b_unpack = unpack;
+ }
+}
+
+/* Makes the instruction conditional on the given flags condition, on its
+ * executing channel.
+ */
+void
+vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
+{
+ if (vir_is_add(inst)) {
+ inst->qpu.flags.ac = cond;
+ } else {
+ assert(vir_is_mul(inst));
+ inst->qpu.flags.mc = cond;
+ }
+}
+
+/* Makes the instruction update the flags with the given pf mode, on its
+ * executing channel.
+ */
+void
+vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf)
+{
+ if (vir_is_add(inst)) {
+ inst->qpu.flags.apf = pf;
+ } else {
+ assert(vir_is_mul(inst));
+ inst->qpu.flags.mpf = pf;
+ }
+}
+
+#if 0
+uint8_t
+vir_channels_written(struct qinst *inst)
+{
+ if (vir_is_mul(inst)) {
+ switch (inst->dst.pack) {
+ case QPU_PACK_MUL_NOP:
+ case QPU_PACK_MUL_8888:
+ return 0xf;
+ case QPU_PACK_MUL_8A:
+ return 0x1;
+ case QPU_PACK_MUL_8B:
+ return 0x2;
+ case QPU_PACK_MUL_8C:
+ return 0x4;
+ case QPU_PACK_MUL_8D:
+ return 0x8;
+ }
+ } else {
+ switch (inst->dst.pack) {
+ case QPU_PACK_A_NOP:
+ case QPU_PACK_A_8888:
+ case QPU_PACK_A_8888_SAT:
+ case QPU_PACK_A_32_SAT:
+ return 0xf;
+ case QPU_PACK_A_8A:
+ case QPU_PACK_A_8A_SAT:
+ return 0x1;
+ case QPU_PACK_A_8B:
+ case QPU_PACK_A_8B_SAT:
+ return 0x2;
+ case QPU_PACK_A_8C:
+ case QPU_PACK_A_8C_SAT:
+ return 0x4;
+ case QPU_PACK_A_8D:
+ case QPU_PACK_A_8D_SAT:
+ return 0x8;
+ case QPU_PACK_A_16A:
+ case QPU_PACK_A_16A_SAT:
+ return 0x3;
+ case QPU_PACK_A_16B:
+ case QPU_PACK_A_16B_SAT:
+ return 0xc;
+ }
+ }
+ unreachable("Bad pack field");
+}
+#endif
+
+/* Allocates a fresh temporary register, growing the SSA-style defs[]
+ * tracking array (doubling, minimum 16 entries) as needed.
+ */
+struct qreg
+vir_get_temp(struct v3d_compile *c)
+{
+ struct qreg reg;
+
+ reg.file = QFILE_TEMP;
+ reg.index = c->num_temps++;
+
+ if (c->num_temps > c->defs_array_size) {
+ uint32_t old_size = c->defs_array_size;
+ c->defs_array_size = MAX2(old_size * 2, 16);
+ c->defs = reralloc(c, c->defs, struct qinst *,
+ c->defs_array_size);
+ /* New tail entries must read as "no def yet". */
+ memset(&c->defs[old_size], 0,
+ sizeof(c->defs[0]) * (c->defs_array_size - old_size));
+ }
+
+ return reg;
+}
+
+/* Creates (but does not emit) an add-channel ALU instruction.
+ * Caller owns the malloc'd qinst; vir_remove_instruction() frees it.
+ */
+struct qinst *
+vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
+{
+ struct qinst *inst = calloc(1, sizeof(*inst));
+
+ inst->qpu = v3d_qpu_nop();
+ inst->qpu.alu.add.op = op;
+
+ inst->dst = dst;
+ inst->src[0] = src0;
+ inst->src[1] = src1;
+ /* ~0 == no uniform stream slot associated yet. */
+ inst->uniform = ~0;
+
+ return inst;
+}
+
+/* Creates (but does not emit) a mul-channel ALU instruction. */
+struct qinst *
+vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
+{
+ struct qinst *inst = calloc(1, sizeof(*inst));
+
+ inst->qpu = v3d_qpu_nop();
+ inst->qpu.alu.mul.op = op;
+
+ inst->dst = dst;
+ inst->src[0] = src0;
+ inst->src[1] = src1;
+ inst->uniform = ~0;
+
+ return inst;
+}
+
+/* Creates (but does not emit) a conditional branch instruction with
+ * relative destination addressing; the actual target offset is filled in
+ * later from the uniform stream.
+ */
+struct qinst *
+vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src)
+{
+ struct qinst *inst = calloc(1, sizeof(*inst));
+
+ inst->qpu = v3d_qpu_nop();
+ inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
+ inst->qpu.branch.cond = cond;
+ inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
+ inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
+ /* ub + bdu: also load the branch uniform, relative. */
+ inst->qpu.branch.ub = true;
+ inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
+
+ inst->dst = vir_reg(QFILE_NULL, 0);
+ inst->src[0] = src;
+ inst->uniform = ~0;
+
+ return inst;
+}
+
+/* Appends the instruction to the current block, tracking VPM output size
+ * for the shader record.
+ */
+static void
+vir_emit(struct v3d_compile *c, struct qinst *inst)
+{
+ list_addtail(&inst->link, &c->cur_block->instructions);
+
+ if (inst->dst.file == QFILE_MAGIC &&
+ inst->dst.index == V3D_QPU_WADDR_VPM)
+ c->num_vpm_writes++;
+}
+
+/* Updates inst to write to a new temporary, emits it, and notes the def. */
+struct qreg
+vir_emit_def(struct v3d_compile *c, struct qinst *inst)
+{
+ assert(inst->dst.file == QFILE_NULL);
+
+ inst->dst = vir_get_temp(c);
+
+ if (inst->dst.file == QFILE_TEMP)
+ c->defs[inst->dst.index] = inst;
+
+ vir_emit(c, inst);
+
+ return inst->dst;
+}
+
+/* Emits an instruction with a caller-chosen destination; since the temp
+ * (if any) now has multiple/unknown writers, clear its def entry.
+ */
+struct qinst *
+vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
+{
+ if (inst->dst.file == QFILE_TEMP)
+ c->defs[inst->dst.index] = NULL;
+
+ vir_emit(c, inst);
+
+ return inst;
+}
+
+/* Allocates an empty CFG block (ralloc'd under the compile context). */
+struct qblock *
+vir_new_block(struct v3d_compile *c)
+{
+ struct qblock *block = rzalloc(c, struct qblock);
+
+ list_inithead(&block->instructions);
+
+ block->predecessors = _mesa_set_create(block,
+ _mesa_hash_pointer,
+ _mesa_key_pointer_equal);
+
+ block->index = c->next_block_index++;
+
+ return block;
+}
+
+/* Makes 'block' the current emission target and links it into the
+ * ordered block list.
+ */
+void
+vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
+{
+ c->cur_block = block;
+ list_addtail(&block->link, &c->blocks);
+}
+
+struct qblock *
+vir_entry_block(struct v3d_compile *c)
+{
+ return list_first_entry(&c->blocks, struct qblock, link);
+}
+
+struct qblock *
+vir_exit_block(struct v3d_compile *c)
+{
+ return list_last_entry(&c->blocks, struct qblock, link);
+}
+
+/* Records a CFG edge. A block has at most two successors (fallthrough
+ * and branch target); the assert enforces that limit.
+ */
+void
+vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
+{
+ _mesa_set_add(successor->predecessors, predecessor);
+ if (predecessor->successors[0]) {
+ assert(!predecessor->successors[1]);
+ predecessor->successors[1] = successor;
+ } else {
+ predecessor->successors[0] = successor;
+ }
+}
+
+/* Creates the long-lived, screen-level compiler state (device info plus
+ * register-allocation sets). Returns NULL on failure.
+ */
+const struct v3d_compiler *
+v3d_compiler_init(const struct v3d_device_info *devinfo)
+{
+ struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
+ if (!compiler)
+ return NULL;
+
+ compiler->devinfo = devinfo;
+
+ if (!vir_init_reg_sets(compiler)) {
+ ralloc_free(compiler);
+ return NULL;
+ }
+
+ return compiler;
+}
+
+void
+v3d_compiler_free(const struct v3d_compiler *compiler)
+{
+ /* Cast away const: the compiler object is ours to free. */
+ ralloc_free((void *)compiler);
+}
+
+/* Creates the per-compile state. Clones the NIR shader so lowering
+ * passes can mutate it freely without affecting the caller's copy.
+ */
+static struct v3d_compile *
+vir_compile_init(const struct v3d_compiler *compiler,
+ struct v3d_key *key,
+ nir_shader *s,
+ int program_id, int variant_id)
+{
+ struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
+
+ c->compiler = compiler;
+ c->devinfo = compiler->devinfo;
+ c->key = key;
+ c->program_id = program_id;
+ c->variant_id = variant_id;
+
+ s = nir_shader_clone(c, s);
+ c->s = s;
+
+ list_inithead(&c->blocks);
+ vir_set_emit_block(c, vir_new_block(c));
+
+ /* -1 == "this output is not present". */
+ c->output_position_index = -1;
+ c->output_point_size_index = -1;
+ c->output_sample_mask_index = -1;
+
+ c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
+ _mesa_key_pointer_equal);
+
+ return c;
+}
+
+/* Early NIR lowering shared by all stages: texture projection and
+ * swizzle lowering driven by the shader key.
+ */
+static void
+v3d_lower_nir(struct v3d_compile *c)
+{
+ struct nir_lower_tex_options tex_options = {
+ .lower_rect = false, /* XXX */
+ .lower_txp = ~0,
+ /* Apply swizzles to all samplers. */
+ .swizzle_result = ~0,
+ };
+
+ /* Lower the format swizzle and (for 32-bit returns)
+ * ARB_texture_swizzle-style swizzle.
+ */
+ for (int i = 0; i < ARRAY_SIZE(c->key->tex); i++) {
+ for (int j = 0; j < 4; j++)
+ tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
+ }
+
+ NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
+}
+
+/* Late NIR lowering, run after stage-specific passes but before
+ * optimization and VIR translation.
+ */
+static void
+v3d_lower_nir_late(struct v3d_compile *c)
+{
+ NIR_PASS_V(c->s, v3d_nir_lower_io, c);
+ NIR_PASS_V(c->s, nir_lower_idiv);
+}
+
+/* Copies the compile-time uniform stream contents/data into the
+ * prog_data that outlives the compile (ralloc'd under prog_data).
+ */
+static void
+v3d_set_prog_data_uniforms(struct v3d_compile *c,
+ struct v3d_prog_data *prog_data)
+{
+ int count = c->num_uniforms;
+ struct v3d_uniform_list *ulist = &prog_data->uniforms;
+
+ ulist->count = count;
+ ulist->data = ralloc_array(prog_data, uint32_t, count);
+ memcpy(ulist->data, c->uniform_data,
+ count * sizeof(*ulist->data));
+ ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
+ memcpy(ulist->contents, c->uniform_contents,
+ count * sizeof(*ulist->contents));
+}
+
+/* Copy the compiler UBO range state to the compiled shader, dropping out
+ * arrays that were never referenced by an indirect load.
+ *
+ * (Note that QIR dead code elimination of an array access still leaves that
+ * array alive, though)
+ */
+static void
+v3d_set_prog_data_ubo(struct v3d_compile *c,
+ struct v3d_prog_data *prog_data)
+{
+ if (!c->num_ubo_ranges)
+ return;
+
+ prog_data->num_ubo_ranges = 0;
+ /* Allocated for the worst case; only used ranges are copied in. */
+ prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range,
+ c->num_ubo_ranges);
+ for (int i = 0; i < c->num_ubo_ranges; i++) {
+ if (!c->ubo_range_used[i])
+ continue;
+
+ struct v3d_ubo_range *range = &c->ubo_ranges[i];
+ prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range;
+ prog_data->ubo_size += range->size;
+ }
+
+ if (prog_data->ubo_size) {
+ if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
+ fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id,
+ prog_data->ubo_size / 4);
+ }
+ }
+}
+
+/* Fills the stage-independent parts of prog_data. */
+static void
+v3d_set_prog_data(struct v3d_compile *c,
+ struct v3d_prog_data *prog_data)
+{
+ v3d_set_prog_data_uniforms(c, prog_data);
+ v3d_set_prog_data_ubo(c, prog_data);
+}
+
+/* Hands ownership of the generated QPU instructions to the caller
+ * (malloc'd; caller frees) and destroys the compile context on every
+ * path. Returns NULL on allocation failure.
+ */
+static uint64_t *
+v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
+{
+ *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
+
+ uint64_t *qpu_insts = malloc(*final_assembly_size);
+ if (!qpu_insts) {
+ /* Previously the compile context leaked on this path. */
+ vir_compile_destroy(c);
+ return NULL;
+ }
+
+ memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
+
+ vir_compile_destroy(c);
+
+ return qpu_insts;
+}
+
+/* Compiles a vertex shader variant to QPU code. Returns malloc'd
+ * instructions (caller frees) and fills prog_data and
+ * *final_assembly_size.
+ */
+uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler,
+ struct v3d_vs_key *key,
+ struct v3d_vs_prog_data *prog_data,
+ nir_shader *s,
+ int program_id, int variant_id,
+ uint32_t *final_assembly_size)
+{
+ struct v3d_compile *c = vir_compile_init(compiler, &key->base, s,
+ program_id, variant_id);
+
+ c->vs_key = key;
+
+ v3d_lower_nir(c);
+
+ if (key->clamp_color)
+ NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);
+
+ if (key->base.ucp_enables) {
+ NIR_PASS_V(c->s, nir_lower_clip_vs, key->base.ucp_enables);
+ NIR_PASS_V(c->s, nir_lower_io_to_scalar,
+ nir_var_shader_out);
+ }
+
+ /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
+ NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
+
+ v3d_lower_nir_late(c);
+ v3d_optimize_nir(c->s);
+ NIR_PASS_V(c->s, nir_convert_from_ssa, true);
+
+ v3d_nir_to_vir(c);
+
+ v3d_set_prog_data(c, &prog_data->base);
+
+ prog_data->base.num_inputs = c->num_inputs;
+
+ /* The vertex data gets format converted by the VPM so that
+ * each attribute channel takes up a VPM column. Precompute
+ * the sizes for the shader record.
+ */
+ for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
+ prog_data->vattr_sizes[i] = c->vattr_sizes[i];
+ prog_data->vpm_input_size += c->vattr_sizes[i];
+ }
+
+ /* Input/output segment size are in 8x32-bit multiples. */
+ prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
+ prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8;
+
+ prog_data->uses_vid = (s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_VERTEX_ID));
+ prog_data->uses_iid = (s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_INSTANCE_ID));
+
+ return v3d_return_qpu_insts(c, final_assembly_size);
+}
+
+/* Records FS input slot layout and marks which inputs are colors (for
+ * two-sided lighting) or flat shaded.
+ */
+static void
+v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
+ struct v3d_fs_prog_data *prog_data)
+{
+ prog_data->base.num_inputs = c->num_inputs;
+ memcpy(prog_data->input_slots, c->input_slots,
+ c->num_inputs * sizeof(*c->input_slots));
+
+ for (int i = 0; i < c->num_inputs; i++) {
+ struct v3d_varying_slot v3d_slot = c->input_slots[i];
+ uint8_t slot = v3d_slot_get_slot(v3d_slot);
+
+ if (slot == VARYING_SLOT_COL0 ||
+ slot == VARYING_SLOT_COL1 ||
+ slot == VARYING_SLOT_BFC0 ||
+ slot == VARYING_SLOT_BFC1) {
+ BITSET_SET(prog_data->color_inputs, i);
+ }
+
+ if (BITSET_TEST(c->flat_shade_flags, i))
+ BITSET_SET(prog_data->flat_shade_flags, i);
+ }
+}
+
+/* Compiles a fragment shader variant to QPU code. Returns malloc'd
+ * instructions (caller frees) and fills prog_data and
+ * *final_assembly_size.
+ */
+uint64_t *v3d_compile_fs(const struct v3d_compiler *compiler,
+ struct v3d_fs_key *key,
+ struct v3d_fs_prog_data *prog_data,
+ nir_shader *s,
+ int program_id, int variant_id,
+ uint32_t *final_assembly_size)
+{
+ struct v3d_compile *c = vir_compile_init(compiler, &key->base, s,
+ program_id, variant_id);
+
+ c->fs_key = key;
+
+ v3d_lower_nir(c);
+
+ if (key->light_twoside)
+ NIR_PASS_V(c->s, nir_lower_two_sided_color);
+
+ if (key->clamp_color)
+ NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);
+
+ if (key->alpha_test) {
+ NIR_PASS_V(c->s, nir_lower_alpha_test, key->alpha_test_func,
+ false);
+ }
+
+ if (key->base.ucp_enables)
+ NIR_PASS_V(c->s, nir_lower_clip_fs, key->base.ucp_enables);
+
+ /* Note: FS input scalarizing must happen after
+ * nir_lower_two_sided_color, which only handles a vec4 at a time.
+ */
+ NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
+
+ v3d_lower_nir_late(c);
+ v3d_optimize_nir(c->s);
+ NIR_PASS_V(c->s, nir_convert_from_ssa, true);
+
+ v3d_nir_to_vir(c);
+
+ v3d_set_prog_data(c, &prog_data->base);
+ v3d_set_fs_prog_data_inputs(c, prog_data);
+ if (c->s->info.outputs_written & (1 << FRAG_RESULT_DEPTH))
+ prog_data->writes_z = true;
+
+ return v3d_return_qpu_insts(c, final_assembly_size);
+}
+
+/* Unlinks and frees an instruction, clearing its def-tracking entry so
+ * later lookups don't see a dangling pointer.
+ */
+void
+vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
+{
+ if (qinst->dst.file == QFILE_TEMP)
+ c->defs[qinst->dst.index] = NULL;
+
+ list_del(&qinst->link);
+ free(qinst);
+}
+
+/* Chases a temp back through raw movs to its original source.
+ * Currently a no-op pending the VIR port (see the XXX'd QIR body below).
+ */
+struct qreg
+vir_follow_movs(struct v3d_compile *c, struct qreg reg)
+{
+ /* XXX
+ int pack = reg.pack;
+
+ while (reg.file == QFILE_TEMP &&
+ c->defs[reg.index] &&
+ (c->defs[reg.index]->op == QOP_MOV ||
+ c->defs[reg.index]->op == QOP_FMOV) &&
+ !c->defs[reg.index]->dst.pack &&
+ !c->defs[reg.index]->src[0].pack) {
+ reg = c->defs[reg.index]->src[0];
+ }
+
+ reg.pack = pack;
+ */
+ return reg;
+}
+
+/* Frees the compile context. Instructions are malloc'd (not ralloc'd
+ * under c), so they must be freed individually before ralloc_free
+ * releases everything else.
+ */
+void
+vir_compile_destroy(struct v3d_compile *c)
+{
+ vir_for_each_block(block, c) {
+ while (!list_empty(&block->instructions)) {
+ struct qinst *qinst =
+ list_first_entry(&block->instructions,
+ struct qinst, link);
+ vir_remove_instruction(c, qinst);
+ }
+ }
+
+ ralloc_free(c);
+}
+
+/* Returns a QFILE_UNIF register for the given uniform contents/data,
+ * deduplicating against already-recorded uniforms (O(n) scan) and
+ * growing the uniform arrays as needed.
+ */
+struct qreg
+vir_uniform(struct v3d_compile *c,
+ enum quniform_contents contents,
+ uint32_t data)
+{
+ for (int i = 0; i < c->num_uniforms; i++) {
+ if (c->uniform_contents[i] == contents &&
+ c->uniform_data[i] == data) {
+ return vir_reg(QFILE_UNIF, i);
+ }
+ }
+
+ uint32_t uniform = c->num_uniforms++;
+
+ if (uniform >= c->uniform_array_size) {
+ /* Double the arrays, starting from at least 16 entries. */
+ c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
+ c->uniform_array_size * 2);
+
+ c->uniform_data = reralloc(c, c->uniform_data,
+ uint32_t,
+ c->uniform_array_size);
+ c->uniform_contents = reralloc(c, c->uniform_contents,
+ enum quniform_contents,
+ c->uniform_array_size);
+ }
+
+ c->uniform_contents[uniform] = contents;
+ c->uniform_data[uniform] = data;
+
+ return vir_reg(QFILE_UNIF, uniform);
+}
+
+/* Makes the flags get updated (pf mode) based on 'src'. If the last
+ * emitted instruction already defines src, tag it directly; otherwise
+ * emit a MOV of src to null and tag that.
+ */
+void
+vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf)
+{
+ struct qinst *last_inst = NULL;
+
+ if (!list_empty(&c->cur_block->instructions))
+ last_inst = (struct qinst *)c->cur_block->instructions.prev;
+
+ if (src.file != QFILE_TEMP ||
+ !c->defs[src.index] ||
+ last_inst != c->defs[src.index]) {
+ /* XXX: Make the MOV be the appropriate type */
+ last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src);
+ /* NOTE(review): the assignment above is immediately
+ * overwritten by re-reading the block tail; one of the two
+ * looks redundant -- confirm vir_MOV_dest's return type and
+ * that it appends to cur_block.
+ */
+ last_inst = (struct qinst *)c->cur_block->instructions.prev;
+ }
+
+ vir_set_pf(last_inst, pf);
+}
+
+/* Runs one optimization pass, accumulating into 'progress' and printing
+ * per-pass debug when enabled. Multi-statement macro wrapped in
+ * do/while(0) so it behaves as a single statement.
+ */
+#define OPTPASS(func) \
+ do { \
+ bool stage_progress = func(c); \
+ if (stage_progress) { \
+ progress = true; \
+ if (print_opt_debug) { \
+ fprintf(stderr, \
+ "VIR opt pass %2d: %s progress\n", \
+ pass, #func); \
+ } \
+ /*XXX vir_validate(c);*/ \
+ } \
+ } while (0)
+
+/* Iterates the VIR-level optimizations to a fixed point. */
+void
+vir_optimize(struct v3d_compile *c)
+{
+ bool print_opt_debug = false;
+ int pass = 1;
+
+ while (true) {
+ bool progress = false;
+
+ OPTPASS(vir_opt_copy_propagate);
+ OPTPASS(vir_opt_dead_code);
+
+ if (!progress)
+ break;
+
+ pass++;
+ }
+}
+
+/* Returns a stage name for debug output, distinguishing the coordinate
+ * variant of the VS (binning pass) from the regular one.
+ */
+const char *
+vir_get_stage_name(struct v3d_compile *c)
+{
+ if (c->vs_key && c->vs_key->is_coord)
+ return "MESA_SHADER_COORD";
+ else
+ return gl_shader_stage_name(c->s->stage);
+}
--- /dev/null
+/*
+ * Copyright © 2016-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3d_compiler.h"
+
+/* Prints a human-readable form of a VIR register to stderr, decoding
+ * uniform contents where known.
+ */
+static void
+vir_print_reg(struct v3d_compile *c, struct qreg reg)
+{
+ static const char *files[] = {
+ [QFILE_TEMP] = "t",
+ [QFILE_VARY] = "v",
+ [QFILE_UNIF] = "u",
+ [QFILE_TLB] = "tlb",
+ [QFILE_TLBU] = "tlbu",
+ };
+ static const char *quniform_names[] = {
+ [QUNIFORM_VIEWPORT_X_SCALE] = "vp_x_scale",
+ [QUNIFORM_VIEWPORT_Y_SCALE] = "vp_y_scale",
+ [QUNIFORM_VIEWPORT_Z_OFFSET] = "vp_z_offset",
+ [QUNIFORM_VIEWPORT_Z_SCALE] = "vp_z_scale",
+ };
+
+ switch (reg.file) {
+
+ case QFILE_NULL:
+ fprintf(stderr, "null");
+ break;
+
+ case QFILE_LOAD_IMM:
+ fprintf(stderr, "0x%08x (%f)", reg.index, uif(reg.index));
+ break;
+
+ case QFILE_REG:
+ fprintf(stderr, "rf%d", reg.index);
+ break;
+
+ case QFILE_MAGIC:
+ fprintf(stderr, "%s", v3d_qpu_magic_waddr_name(reg.index));
+ break;
+
+ case QFILE_SMALL_IMM:
+ /* Small immediates encode either a small int or a float. */
+ if ((int)reg.index >= -16 && (int)reg.index <= 15)
+ fprintf(stderr, "%d", reg.index);
+ else
+ fprintf(stderr, "%f", uif(reg.index));
+ break;
+
+ case QFILE_VPM:
+ fprintf(stderr, "vpm%d.%d",
+ reg.index / 4, reg.index % 4);
+ break;
+
+ case QFILE_TLB:
+ fprintf(stderr, "%s", files[reg.file]);
+ break;
+
+ case QFILE_UNIF: {
+ enum quniform_contents contents = c->uniform_contents[reg.index];
+
+ fprintf(stderr, "%s%d", files[reg.file], reg.index);
+
+ /* Annotate the uniform with what it holds. */
+ switch (contents) {
+ case QUNIFORM_CONSTANT:
+ fprintf(stderr, " (0x%08x / %f)",
+ c->uniform_data[reg.index],
+ uif(c->uniform_data[reg.index]));
+ break;
+
+ case QUNIFORM_UNIFORM:
+ fprintf(stderr, " (push[%d])",
+ c->uniform_data[reg.index]);
+ break;
+
+ case QUNIFORM_TEXTURE_CONFIG_P1:
+ fprintf(stderr, " (tex[%d].p1)",
+ c->uniform_data[reg.index]);
+ break;
+
+ case QUNIFORM_TEXTURE_WIDTH:
+ fprintf(stderr, " (tex[%d].width)",
+ c->uniform_data[reg.index]);
+ break;
+ case QUNIFORM_TEXTURE_HEIGHT:
+ fprintf(stderr, " (tex[%d].height)",
+ c->uniform_data[reg.index]);
+ break;
+ case QUNIFORM_TEXTURE_DEPTH:
+ fprintf(stderr, " (tex[%d].depth)",
+ c->uniform_data[reg.index]);
+ break;
+ case QUNIFORM_TEXTURE_ARRAY_SIZE:
+ fprintf(stderr, " (tex[%d].array_size)",
+ c->uniform_data[reg.index]);
+ break;
+ case QUNIFORM_TEXTURE_LEVELS:
+ fprintf(stderr, " (tex[%d].levels)",
+ c->uniform_data[reg.index]);
+ break;
+
+ case QUNIFORM_UBO_ADDR:
+ fprintf(stderr, " (ubo[%d])",
+ c->uniform_data[reg.index]);
+ break;
+
+ default:
+ if (quniform_contents_is_texture_p0(contents)) {
+ fprintf(stderr, " (tex[%d].p0: 0x%08x)",
+ contents - QUNIFORM_TEXTURE_CONFIG_P0_0,
+ c->uniform_data[reg.index]);
+ } else if (contents < ARRAY_SIZE(quniform_names)) {
+ fprintf(stderr, " (%s)",
+ quniform_names[contents]);
+ } else {
+ fprintf(stderr, " (%d / 0x%08x)", contents,
+ c->uniform_data[reg.index]);
+ }
+ }
+
+ break;
+ }
+
+ default:
+ fprintf(stderr, "%s%d", files[reg.file], reg.index);
+ break;
+ }
+}
+
+/* Prints the instruction's signal bits (thrsw/ldvary/...), if any. */
+static void
+vir_dump_sig(struct v3d_compile *c, struct qinst *inst)
+{
+ struct v3d_qpu_sig *sig = &inst->qpu.sig;
+
+ if (sig->thrsw)
+ fprintf(stderr, "; thrsw");
+ if (sig->ldvary)
+ fprintf(stderr, "; ldvary");
+ if (sig->ldvpm)
+ fprintf(stderr, "; ldvpm");
+ if (sig->ldtmu)
+ fprintf(stderr, "; ldtmu");
+ if (sig->ldunif)
+ fprintf(stderr, "; ldunif");
+ if (sig->wrtmuc)
+ fprintf(stderr, "; wrtmuc");
+}
+
+/* Prints an ALU instruction: op name, condition/flag suffixes, dst with
+ * output pack, then sources with their unpack modes.
+ */
+static void
+vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
+{
+ struct v3d_qpu_instr *instr = &inst->qpu;
+ int nsrc = vir_get_non_sideband_nsrc(inst);
+ int sideband_nsrc = vir_get_nsrc(inst);
+ enum v3d_qpu_input_unpack unpack[2];
+
+ /* A VIR instruction uses either the add or the mul channel. */
+ if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) {
+ fprintf(stderr, "%s", v3d_qpu_add_op_name(instr->alu.add.op));
+ fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.ac));
+ fprintf(stderr, "%s", v3d_qpu_pf_name(instr->flags.apf));
+ fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.auf));
+ fprintf(stderr, " ");
+
+ vir_print_reg(c, inst->dst);
+ fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
+
+ unpack[0] = instr->alu.add.a_unpack;
+ unpack[1] = instr->alu.add.b_unpack;
+ } else {
+ fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op));
+ fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc));
+ fprintf(stderr, "%s", v3d_qpu_pf_name(instr->flags.mpf));
+ fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.muf));
+ fprintf(stderr, " ");
+
+ vir_print_reg(c, inst->dst);
+ fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
+
+ unpack[0] = instr->alu.mul.a_unpack;
+ unpack[1] = instr->alu.mul.b_unpack;
+ }
+
+ for (int i = 0; i < sideband_nsrc; i++) {
+ fprintf(stderr, ", ");
+ vir_print_reg(c, inst->src[i]);
+ /* Sideband (implicit uniform) sources have no unpack mode. */
+ if (i < nsrc)
+ fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i]));
+ }
+
+ vir_dump_sig(c, inst);
+}
+
+/* Prints one VIR instruction (ALU or branch) to stderr, no newline. */
+void
+vir_dump_inst(struct v3d_compile *c, struct qinst *inst)
+{
+ struct v3d_qpu_instr *instr = &inst->qpu;
+
+ switch (inst->qpu.type) {
+ case V3D_QPU_INSTR_TYPE_ALU:
+ vir_dump_alu(c, inst);
+ break;
+ case V3D_QPU_INSTR_TYPE_BRANCH:
+ fprintf(stderr, "b");
+ if (instr->branch.ub)
+ fprintf(stderr, "u");
+
+ fprintf(stderr, "%s",
+ v3d_qpu_branch_cond_name(instr->branch.cond));
+ fprintf(stderr, "%s", v3d_qpu_msfign_name(instr->branch.msfign));
+
+ /* Branch destination, by addressing mode. */
+ switch (instr->branch.bdi) {
+ case V3D_QPU_BRANCH_DEST_ABS:
+ fprintf(stderr, " zero_addr+0x%08x", instr->branch.offset);
+ break;
+
+ case V3D_QPU_BRANCH_DEST_REL:
+ fprintf(stderr, " %d", instr->branch.offset);
+ break;
+
+ case V3D_QPU_BRANCH_DEST_LINK_REG:
+ fprintf(stderr, " lri");
+ break;
+
+ case V3D_QPU_BRANCH_DEST_REGFILE:
+ fprintf(stderr, " rf%d", instr->branch.raddr_a);
+ break;
+ }
+
+ /* Uniform-branch destination, when enabled. */
+ if (instr->branch.ub) {
+ switch (instr->branch.bdu) {
+ case V3D_QPU_BRANCH_DEST_ABS:
+ fprintf(stderr, ", a:unif");
+ break;
+
+ case V3D_QPU_BRANCH_DEST_REL:
+ fprintf(stderr, ", r:unif");
+ break;
+
+ case V3D_QPU_BRANCH_DEST_LINK_REG:
+ fprintf(stderr, ", lri");
+ break;
+
+ case V3D_QPU_BRANCH_DEST_REGFILE:
+ fprintf(stderr, ", rf%d", instr->branch.raddr_a);
+ break;
+ }
+ }
+
+ if (vir_has_implicit_uniform(inst)) {
+ fprintf(stderr, " ");
+ vir_print_reg(c, inst->src[vir_get_implicit_uniform_src(inst)]);
+ }
+
+ break;
+ }
+}
+
+/* Dumps the whole program, block by block, annotating each instruction
+ * with the temps whose live ranges start (S) or end (E) there when
+ * liveness info has been computed, and printing CFG successor edges.
+ */
+void
+vir_dump(struct v3d_compile *c)
+{
+ int ip = 0;
+
+ vir_for_each_block(block, c) {
+ fprintf(stderr, "BLOCK %d:\n", block->index);
+ vir_for_each_inst(inst, block) {
+ if (c->temp_start) {
+ bool first = true;
+
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] != ip)
+ continue;
+
+ if (first) {
+ first = false;
+ } else {
+ fprintf(stderr, ", ");
+ }
+ fprintf(stderr, "S%4d", i);
+ }
+
+ /* Pad so the instruction column lines up. */
+ if (first)
+ fprintf(stderr, " ");
+ else
+ fprintf(stderr, " ");
+ }
+
+ if (c->temp_end) {
+ bool first = true;
+
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_end[i] != ip)
+ continue;
+
+ if (first) {
+ first = false;
+ } else {
+ fprintf(stderr, ", ");
+ }
+ fprintf(stderr, "E%4d", i);
+ }
+
+ if (first)
+ fprintf(stderr, " ");
+ else
+ fprintf(stderr, " ");
+ }
+
+ vir_dump_inst(c, inst);
+ fprintf(stderr, "\n");
+ ip++;
+ }
+ if (block->successors[1]) {
+ fprintf(stderr, "-> BLOCK %d, %d\n",
+ block->successors[0]->index,
+ block->successors[1]->index);
+ } else if (block->successors[0]) {
+ fprintf(stderr, "-> BLOCK %d\n",
+ block->successors[0]->index);
+ }
+ }
+}
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ * Copyright © 2016 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define MAX_INSTRUCTION (1 << 30)
+
+#include "util/ralloc.h"
+#include "util/register_allocate.h"
+#include "v3d_compiler.h"
+
+/* Tracks partial (conditional or per-channel packed) writes to one temp
+ * so a sequence of partial writes can be recognized as a full def.
+ */
+struct partial_update_state {
+ /* Last writer of each 8-bit channel. */
+ struct qinst *insts[4];
+ /* Bitmask of channels fully written so far. */
+ uint8_t channels;
+};
+
+/* Hash/compare callbacks for a hash table keyed on int (temp index). */
+static uint32_t
+int_hash(const void *key)
+{
+ return _mesa_hash_data(key, sizeof(int));
+}
+
+static bool
+int_compare(const void *key1, const void *key2)
+{
+ return *(const int *)key1 == *(const int *)key2;
+}
+
+/* Maps a register to its liveness variable: the temp index, or -1 for
+ * non-temp files (which aren't tracked).
+ */
+static int
+vir_reg_to_var(struct qreg reg)
+{
+ if (reg.file == QFILE_TEMP)
+ return reg.index;
+
+ return -1;
+}
+
+/* Records a read of 'src' at instruction 'ip': extends the temp's
+ * start/end range and updates the block's use[] set.
+ */
+static void
+vir_setup_use(struct v3d_compile *c, struct qblock *block, int ip,
+ struct qreg src)
+{
+ int var = vir_reg_to_var(src);
+ if (var == -1)
+ return;
+
+ c->temp_start[var] = MIN2(c->temp_start[var], ip);
+ c->temp_end[var] = MAX2(c->temp_end[var], ip);
+
+ /* The use[] bitset marks when the block makes
+ * use of a variable without having completely
+ * defined that variable within the block.
+ */
+ if (!BITSET_TEST(block->def, var))
+ BITSET_SET(block->use, var);
+}
+
+/* Looks up (or lazily creates) the partial-write tracking state for
+ * inst's destination temp. Keyed by &inst->dst.index, which stays valid
+ * because instructions outlive the per-block hash table contents.
+ */
+static struct partial_update_state *
+get_partial_update_state(struct hash_table *partial_update_ht,
+ struct qinst *inst)
+{
+ struct hash_entry *entry =
+ _mesa_hash_table_search(partial_update_ht,
+ &inst->dst.index);
+ if (entry)
+ return entry->data;
+
+ struct partial_update_state *state =
+ rzalloc(partial_update_ht, struct partial_update_state);
+
+ _mesa_hash_table_insert(partial_update_ht, &inst->dst.index, state);
+
+ return state;
+}
+
+/* Records a write of inst's destination at 'ip', deciding whether it
+ * fully defines the temp within the block (screening off earlier
+ * blocks' values) or is only a partial (conditional/packed) update.
+ */
+static void
+vir_setup_def(struct v3d_compile *c, struct qblock *block, int ip,
+ struct hash_table *partial_update_ht, struct qinst *inst)
+{
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+ return;
+
+ /* The def[] bitset marks when an initialization in a
+ * block completely screens off previous updates of
+ * that variable.
+ */
+ int var = vir_reg_to_var(inst->dst);
+ if (var == -1)
+ return;
+
+ c->temp_start[var] = MIN2(c->temp_start[var], ip);
+ c->temp_end[var] = MAX2(c->temp_end[var], ip);
+
+ /* If we've already tracked this as a def, or already used it within
+ * the block, there's nothing to do.
+ */
+ if (BITSET_TEST(block->use, var) || BITSET_TEST(block->def, var))
+ return;
+
+ /* Easy, common case: unconditional full register update.
+ *
+ * We treat conditioning on the exec mask as the same as not being
+ * conditional. This makes sure that if the register gets set on
+ * either side of an if, it is treated as being screened off before
+ * the if. Otherwise, if there was no intervening def, its live
+ * interval doesn't extend back to the start of he program, and if too
+ * many registers did that we'd fail to register allocate.
+ */
+ if (((inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
+ inst->qpu.flags.mc == V3D_QPU_COND_NONE) ||
+ inst->cond_is_exec_mask) &&
+ inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE &&
+ inst->qpu.alu.mul.output_pack == V3D_QPU_PACK_NONE) {
+ BITSET_SET(block->def, var);
+ return;
+ }
+
+ /* Finally, look at the condition code and packing and mark it as a
+ * def. We need to make sure that we understand sequences
+ * instructions like:
+ *
+ * mov.zs t0, t1
+ * mov.zc t0, t2
+ *
+ * or:
+ *
+ * mmov t0.8a, t1
+ * mmov t0.8b, t2
+ * mmov t0.8c, t3
+ * mmov t0.8d, t4
+ *
+ * as defining the temp within the block, because otherwise dst's live
+ * range will get extended up the control flow to the top of the
+ * program.
+ */
+ struct partial_update_state *state =
+ get_partial_update_state(partial_update_ht, inst);
+ uint8_t mask = 0xf; /* XXX vir_channels_written(inst); */
+
+ if (inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
+ inst->qpu.flags.mc == V3D_QPU_COND_NONE) {
+ state->channels |= mask;
+ } else {
+ for (int i = 0; i < 4; i++) {
+ if (!(mask & (1 << i)))
+ continue;
+
+ /* XXXif (state->insts[i] &&
+ state->insts[i]->cond ==
+ qpu_cond_complement(inst->cond))
+ state->channels |= 1 << i;
+ else
+ */
+ state->insts[i] = inst;
+ }
+ }
+
+ /* Once every channel has been fully written, it's a real def. */
+ if (state->channels == 0xf)
+ BITSET_SET(block->def, var);
+}
+
+/* Invalidates tracked conditional writers when the flags change, since a
+ * later conditional write no longer pairs with them.
+ */
+static void
+sf_state_clear(struct hash_table *partial_update_ht)
+{
+ struct hash_entry *entry;
+
+ hash_table_foreach(partial_update_ht, entry) {
+ struct partial_update_state *state = entry->data;
+
+ for (int i = 0; i < 4; i++) {
+ if (state->insts[i] &&
+ (state->insts[i]->qpu.flags.ac != V3D_QPU_COND_NONE ||
+ state->insts[i]->qpu.flags.mc != V3D_QPU_COND_NONE))
+ state->insts[i] = NULL;
+ }
+ }
+}
+
+/* Sets up the def/use arrays for when variables are used-before-defined or
+ * defined-before-used in the block.
+ *
+ * Also initializes the temp_start/temp_end to cover just the instruction IPs
+ * where the variable is used, which will be extended later in
+ * vir_compute_start_end().
+ */
+static void
+vir_setup_def_use(struct v3d_compile *c)
+{
+ struct hash_table *partial_update_ht =
+ _mesa_hash_table_create(c, int_hash, int_compare);
+ int ip = 0;
+
+ vir_for_each_block(block, c) {
+ block->start_ip = ip;
+
+ /* Partial-write tracking is per-block. */
+ _mesa_hash_table_clear(partial_update_ht, NULL);
+
+ vir_for_each_inst(inst, block) {
+ for (int i = 0; i < vir_get_nsrc(inst); i++)
+ vir_setup_use(c, block, ip, inst->src[i]);
+
+ vir_setup_def(c, block, ip, partial_update_ht, inst);
+
+ if (false /* XXX inst->uf */)
+ sf_state_clear(partial_update_ht);
+
+ /* Payload registers: r0/1/2 contain W, centroid W,
+ * and Z at program start. Register allocation will
+ * force their nodes to R0/1/2.
+ *
+ * Only temps have entries in temp_start[], so guard
+ * on dst.file: indexing the array with a non-temp
+ * dst index was an out-of-bounds write.
+ */
+ if (inst->dst.file == QFILE_TEMP &&
+ inst->src[0].file == QFILE_REG) {
+ switch (inst->src[0].index) {
+ case 0:
+ case 1:
+ case 2:
+ c->temp_start[inst->dst.index] = 0;
+ break;
+ }
+ }
+
+ ip++;
+ }
+ block->end_ip = ip;
+ }
+
+ _mesa_hash_table_destroy(partial_update_ht, NULL);
+}
+
+/* Performs one backwards pass of live-variable dataflow across the CFG.
+ *
+ * Returns true if any live_in/live_out set changed, meaning the caller
+ * should run another pass until a fixed point is reached.
+ */
+static bool
+vir_live_variables_dataflow(struct v3d_compile *c, int bitset_words)
+{
+        bool cont = false;
+
+        vir_for_each_block_rev(block, c) {
+                /* Update live_out: Any successor using the variable
+                 * on entrance needs us to have the variable live on
+                 * exit.
+                 */
+                vir_for_each_successor(succ, block) {
+                        for (int i = 0; i < bitset_words; i++) {
+                                BITSET_WORD new_live_out = (succ->live_in[i] &
+                                                            ~block->live_out[i]);
+                                if (new_live_out) {
+                                        block->live_out[i] |= new_live_out;
+                                        cont = true;
+                                }
+                        }
+                }
+
+                /* Update live_in: used within the block, or live at its
+                 * exit without being defined inside it.
+                 */
+                for (int i = 0; i < bitset_words; i++) {
+                        BITSET_WORD new_live_in = (block->use[i] |
+                                                   (block->live_out[i] &
+                                                    ~block->def[i]));
+                        if (new_live_in & ~block->live_in[i]) {
+                                block->live_in[i] |= new_live_in;
+                                cont = true;
+                        }
+                }
+        }
+
+        return cont;
+}
+
+/**
+ * Extend the start/end ranges for each variable to account for the
+ * new information calculated from control flow.
+ */
+static void
+vir_compute_start_end(struct v3d_compile *c, int num_vars)
+{
+        vir_for_each_block(block, c) {
+                for (int var = 0; var < num_vars; var++) {
+                        /* Live at block entry: the range must reach the
+                         * block's first instruction.
+                         */
+                        if (BITSET_TEST(block->live_in, var)) {
+                                if (c->temp_start[var] > block->start_ip)
+                                        c->temp_start[var] = block->start_ip;
+                                if (c->temp_end[var] < block->start_ip)
+                                        c->temp_end[var] = block->start_ip;
+                        }
+
+                        /* Live at block exit: the range must reach past the
+                         * block's last instruction.
+                         */
+                        if (BITSET_TEST(block->live_out, var)) {
+                                if (c->temp_start[var] > block->end_ip)
+                                        c->temp_start[var] = block->end_ip;
+                                if (c->temp_end[var] < block->end_ip)
+                                        c->temp_end[var] = block->end_ip;
+                        }
+                }
+        }
+}
+
+/* Computes c->temp_start/temp_end for every temp: the instruction-IP range
+ * over which the temp is live, including extension across control flow.
+ */
+void
+vir_calculate_live_intervals(struct v3d_compile *c)
+{
+        int bitset_words = BITSET_WORDS(c->num_temps);
+
+        /* If we called this function more than once, then we should be
+         * freeing the previous arrays.
+         */
+        assert(!c->temp_start);
+
+        c->temp_start = rzalloc_array(c, int, c->num_temps);
+        c->temp_end = rzalloc_array(c, int, c->num_temps);
+
+        /* Start with inverted (empty) ranges, which the per-use updates and
+         * vir_compute_start_end() will then widen.
+         */
+        for (int i = 0; i < c->num_temps; i++) {
+                c->temp_start[i] = MAX_INSTRUCTION;
+                c->temp_end[i] = -1;
+        }
+
+        vir_for_each_block(block, c) {
+                block->def = rzalloc_array(c, BITSET_WORD, bitset_words);
+                block->use = rzalloc_array(c, BITSET_WORD, bitset_words);
+                block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words);
+                block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words);
+        }
+
+        vir_setup_def_use(c);
+
+        /* Iterate the backwards dataflow until it reaches a fixed point. */
+        while (vir_live_variables_dataflow(c, bitset_words))
+                ;
+
+        vir_compute_start_end(c, c->num_temps);
+}
--- /dev/null
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file v3d_vir_lower_uniforms.c
+ *
+ * This is the pre-code-generation pass for fixing up instructions that try to
+ * read from multiple uniform values.
+ */
+
+#include "v3d_compiler.h"
+#include "util/hash_table.h"
+#include "util/u_math.h"
+
+/* Identity hash: the key itself already encodes the uniform index + 1. */
+static inline uint32_t
+index_hash(const void *key)
+{
+        return (uint32_t)(uintptr_t)key;
+}
+
+/* Keys are compared by identity, since they directly encode the index. */
+static inline bool
+index_compare(const void *a, const void *b)
+{
+        return (uintptr_t)a == (uintptr_t)b;
+}
+
+/* Increments the reference count for reg's uniform in ht, inserting it with
+ * a count of 1 on first sight.
+ *
+ * The key is offset by 1 so that uniform index 0 doesn't become a NULL key.
+ * The count is stored in the data pointer; go through uintptr_t rather than
+ * doing arithmetic on void * (a GNU extension, not standard C).
+ */
+static void
+add_uniform(struct hash_table *ht, struct qreg reg)
+{
+        void *key = (void *)(uintptr_t)(reg.index + 1);
+        struct hash_entry *entry = _mesa_hash_table_search(ht, key);
+
+        if (entry)
+                entry->data = (void *)((uintptr_t)entry->data + 1);
+        else
+                _mesa_hash_table_insert(ht, key, (void *)(uintptr_t)1);
+}
+
+/* Decrements the reference count for reg's uniform in ht, removing the
+ * entry once the count reaches zero.
+ *
+ * Mirrors add_uniform(): the count lives in the data pointer, manipulated
+ * via uintptr_t instead of non-standard void * arithmetic, and compared
+ * against 0 rather than NULL.
+ */
+static void
+remove_uniform(struct hash_table *ht, struct qreg reg)
+{
+        void *key = (void *)(uintptr_t)(reg.index + 1);
+        struct hash_entry *entry = _mesa_hash_table_search(ht, key);
+
+        assert(entry);
+
+        uintptr_t count = (uintptr_t)entry->data - 1;
+        if (count == 0)
+                _mesa_hash_table_remove(ht, entry);
+        else
+                entry->data = (void *)count;
+}
+
+/* Returns whether src i of inst is a uniform that this pass may move into a
+ * temp (implicit uniform operands must stay where they are).
+ */
+static bool
+is_lowerable_uniform(struct qinst *inst, int i)
+{
+        if (inst->src[i].file != QFILE_UNIF)
+                return false;
+
+        if (!vir_has_implicit_uniform(inst))
+                return true;
+
+        return i != vir_get_implicit_uniform_src(inst);
+}
+
+/* Returns the number of different uniform values referenced by the
+ * instruction (each distinct uniform index is counted once, however many
+ * sources reference it).
+ */
+static uint32_t
+vir_get_instruction_uniform_count(struct qinst *inst)
+{
+        uint32_t count = 0;
+
+        for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                if (inst->src[i].file != QFILE_UNIF)
+                        continue;
+
+                /* Only count the first occurrence of each index. */
+                bool seen_before = false;
+                for (int j = 0; j < i && !seen_before; j++) {
+                        seen_before = (inst->src[j].file == QFILE_UNIF &&
+                                       inst->src[j].index == inst->src[i].index);
+                }
+
+                if (!seen_before)
+                        count++;
+        }
+
+        return count;
+}
+
+/* Rewrites the program so that no instruction reads more than one distinct
+ * uniform value: the most-referenced uniform is MOVed into a temp at the
+ * top of each block needing it, and those instructions' sources are
+ * redirected to the temp.  Repeats until no instruction needs lowering.
+ */
+void
+vir_lower_uniforms(struct v3d_compile *c)
+{
+        struct hash_table *ht =
+                _mesa_hash_table_create(c, index_hash, index_compare);
+
+        /* Walk the instruction list, finding which instructions have more
+         * than one uniform referenced, and add those uniform values to the
+         * ht.
+         */
+        vir_for_each_inst_inorder(inst, c) {
+                uint32_t nsrc = vir_get_nsrc(inst);
+
+                if (vir_get_instruction_uniform_count(inst) <= 1)
+                        continue;
+
+                for (int i = 0; i < nsrc; i++) {
+                        if (is_lowerable_uniform(inst, i))
+                                add_uniform(ht, inst->src[i]);
+                }
+        }
+
+        /* Each iteration lowers one uniform and drops references from the
+         * ht as instructions become satisfied, so the table shrinks to
+         * empty and the loop terminates.
+         */
+        while (ht->entries) {
+                /* Find the most commonly used uniform in instructions that
+                 * need a uniform lowered.
+                 */
+                uint32_t max_count = 0;
+                uint32_t max_index = 0;
+                struct hash_entry *entry;
+                hash_table_foreach(ht, entry) {
+                        uint32_t count = (uintptr_t)entry->data;
+                        uint32_t index = (uintptr_t)entry->key - 1;
+                        if (count > max_count) {
+                                max_count = count;
+                                max_index = index;
+                        }
+                }
+
+                struct qreg unif = vir_reg(QFILE_UNIF, max_index);
+
+                /* Now, find the instructions using this uniform and make them
+                 * reference a temp instead.
+                 */
+                vir_for_each_block(block, c) {
+                        struct qinst *mov = NULL;
+
+                        vir_for_each_inst(inst, block) {
+                                uint32_t nsrc = vir_get_nsrc(inst);
+
+                                uint32_t count = vir_get_instruction_uniform_count(inst);
+
+                                if (count <= 1)
+                                        continue;
+
+                                /* If the block doesn't have a load of the
+                                 * uniform yet, add it. We could potentially
+                                 * do better and CSE MOVs from multiple blocks
+                                 * into dominating blocks, except that may
+                                 * cause troubles for register allocation.
+                                 */
+                                if (!mov) {
+                                        mov = vir_mul_inst(V3D_QPU_M_MOV,
+                                                           vir_get_temp(c),
+                                                           unif, c->undef);
+                                        /* list_add() puts the MOV at the
+                                         * head of the block, before any
+                                         * user of the temp.
+                                         */
+                                        list_add(&mov->link,
+                                                 &block->instructions);
+                                        c->defs[mov->dst.index] = mov;
+                                }
+
+                                bool removed = false;
+                                for (int i = 0; i < nsrc; i++) {
+                                        if (is_lowerable_uniform(inst, i) &&
+                                            inst->src[i].index == max_index) {
+                                                inst->src[i].file =
+                                                        mov->dst.file;
+                                                inst->src[i].index =
+                                                        mov->dst.index;
+                                                remove_uniform(ht, unif);
+                                                removed = true;
+                                        }
+                                }
+                                /* All replaced sources were the same
+                                 * uniform, so the distinct count drops by
+                                 * exactly one.
+                                 */
+                                if (removed)
+                                        count--;
+
+                                /* If the instruction doesn't need lowering any more,
+                                 * then drop it from the list.
+                                 */
+                                if (count <= 1) {
+                                        for (int i = 0; i < nsrc; i++) {
+                                                if (is_lowerable_uniform(inst, i))
+                                                        remove_uniform(ht, inst->src[i]);
+                                        }
+                                }
+                        }
+                }
+        }
+
+        _mesa_hash_table_destroy(ht, NULL);
+}
--- /dev/null
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file v3d_opt_copy_propagation.c
+ *
+ * This implements simple copy propagation for VIR without control flow.
+ *
+ * For each temp, it keeps a qreg of which source it was MOVed from, if it
+ * was. If we see that used later, we can just reuse the source value, since
+ * we know we don't have control flow, and we have SSA for our values so
+ * there's no killing to worry about.
+ */
+
+#include "v3d_compiler.h"
+
+/* Returns whether inst is a plain, unconditional, pack-free MOV/FMOV from a
+ * temp or uniform into a temp -- i.e. a copy whose source can be propagated
+ * into readers of the destination.
+ */
+static bool
+is_copy_mov(struct qinst *inst)
+{
+        if (!inst)
+                return false;
+
+        /* Only MOV/FMOV on the mul channel qualify. */
+        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+            (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
+             inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
+                return false;
+        }
+
+        if (inst->dst.file != QFILE_TEMP)
+                return false;
+
+        if (inst->src[0].file != QFILE_TEMP &&
+            inst->src[0].file != QFILE_UNIF) {
+                return false;
+        }
+
+        /* An output pack would alter the value, so it's not a raw copy. */
+        if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
+            inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
+                return false;
+        }
+
+        /* A conditional MOV may leave some channels unwritten. */
+        if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
+            inst->qpu.flags.mc != V3D_QPU_COND_NONE) {
+                return false;
+        }
+
+        /* NOTE(review): the QFILE_MAGIC/QFILE_REG cases below look
+         * unreachable, since src[0] was restricted to TEMP/UNIF above --
+         * confirm whether that earlier check should also allow these files.
+         */
+        switch (inst->src[0].file) {
+        case QFILE_MAGIC:
+                /* No copy propagating from R3/R4/R5 -- the MOVs from those
+                 * are there to register allocate values produced into R3/4/5
+                 * to other regs (though hopefully r3/4/5).
+                 */
+                switch (inst->src[0].index) {
+                case V3D_QPU_WADDR_R3:
+                case V3D_QPU_WADDR_R4:
+                case V3D_QPU_WADDR_R5:
+                        return false;
+                default:
+                        break;
+                }
+                break;
+
+        case QFILE_REG:
+                switch (inst->src[0].index) {
+                case 0:
+                case 1:
+                case 2:
+                        /* MOVs from rf0/1/2 are only to track the live
+                         * intervals for W/centroid W/Z.
+                         */
+                        return false;
+                }
+                break;
+
+        default:
+                break;
+        }
+
+        return true;
+}
+
+/* Returns whether operand chan (0 = a, 1 = b) of inst carries an input
+ * unpack, looking at the add or mul channel as appropriate.
+ */
+static bool
+vir_has_unpack(struct qinst *inst, int chan)
+{
+        assert(chan == 0 || chan == 1);
+
+        if (vir_is_add(inst)) {
+                return (chan == 0 ?
+                        inst->qpu.alu.add.a_unpack :
+                        inst->qpu.alu.add.b_unpack) != V3D_QPU_UNPACK_NONE;
+        } else {
+                return (chan == 0 ?
+                        inst->qpu.alu.mul.a_unpack :
+                        inst->qpu.alu.mul.b_unpack) != V3D_QPU_UNPACK_NONE;
+        }
+}
+
+/* Tries to replace inst's temp sources with the sources of the copy MOVs
+ * that defined them.  Returns true if any source was rewritten.
+ */
+static bool
+try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
+{
+        bool debug = false;
+        bool progress = false;
+
+        for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                if (inst->src[i].file != QFILE_TEMP)
+                        continue;
+
+                /* We have two ways of finding MOVs we can copy propagate
+                 * from. One is if it's an SSA def: then we can reuse it from
+                 * any block in the program, as long as its source is also an
+                 * SSA def. Alternatively, if it's in the "movs" array
+                 * tracked within the block, then we know the sources for it
+                 * haven't been changed since we saw the instruction within
+                 * our block.
+                 */
+                struct qinst *mov = movs[inst->src[i].index];
+                if (!mov) {
+                        if (!is_copy_mov(c->defs[inst->src[i].index]))
+                                continue;
+                        mov = c->defs[inst->src[i].index];
+
+                        /* The MOV's own source must be SSA too for a
+                         * cross-block reuse to be safe.
+                         */
+                        if (mov->src[0].file == QFILE_TEMP &&
+                            !c->defs[mov->src[0].index])
+                                continue;
+                }
+
+                if (vir_has_unpack(mov, 0)) {
+                        /* Make sure that the meaning of the unpack
+                         * would be the same between the two
+                         * instructions.
+                         */
+                        if (vir_is_float_input(inst) !=
+                            vir_is_float_input(mov)) {
+                                continue;
+                        }
+                        /* No composing the unpacks. */
+                        if (vir_has_unpack(inst, i))
+                                continue;
+                }
+
+                if (debug) {
+                        fprintf(stderr, "Copy propagate: ");
+                        vir_dump_inst(c, inst);
+                        fprintf(stderr, "\n");
+                }
+
+                inst->src[i] = mov->src[0];
+                if (vir_has_unpack(mov, 0)) {
+                        /* is_copy_mov() guarantees the copy is on the mul
+                         * channel, so its unpack is in mul.a_unpack.
+                         */
+                        enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack;
+
+                        vir_set_unpack(inst, i, unpack);
+                }
+
+                if (debug) {
+                        fprintf(stderr, "to: ");
+                        vir_dump_inst(c, inst);
+                        fprintf(stderr, "\n");
+                }
+
+                progress = true;
+        }
+
+        return progress;
+}
+
+/* Invalidates tracked MOVs whose destination temp, or whose source temp,
+ * is overwritten by inst.
+ */
+static void
+apply_kills(struct v3d_compile *c, struct qinst **movs, struct qinst *inst)
+{
+        if (inst->dst.file != QFILE_TEMP)
+                return;
+
+        uint32_t killed = inst->dst.index;
+
+        for (int i = 0; i < c->num_temps; i++) {
+                struct qinst *mov = movs[i];
+
+                if (!mov)
+                        continue;
+
+                if (mov->dst.index == killed ||
+                    (mov->src[0].file == QFILE_TEMP &&
+                     mov->src[0].index == killed)) {
+                        movs[i] = NULL;
+                }
+        }
+}
+
+/* Runs copy propagation over every block.  Returns true on any change. */
+bool
+vir_opt_copy_propagate(struct v3d_compile *c)
+{
+        bool progress = false;
+        struct qinst **movs = ralloc_array(c, struct qinst *, c->num_temps);
+
+        if (!movs)
+                return false;
+
+        vir_for_each_block(block, c) {
+                /* The MOVs array tracks only available movs within the
+                 * block, so reset it for each one.
+                 */
+                memset(movs, 0, sizeof(movs[0]) * c->num_temps);
+
+                vir_for_each_inst(inst, block) {
+                        if (try_copy_prop(c, inst, movs))
+                                progress = true;
+
+                        apply_kills(c, movs, inst);
+
+                        if (is_copy_mov(inst))
+                                movs[inst->dst.index] = inst;
+                }
+        }
+
+        ralloc_free(movs);
+
+        return progress;
+}
--- /dev/null
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file v3d_opt_dead_code.c
+ *
+ * This is a simple dead code eliminator for SSA values in VIR.
+ *
+ * It walks all the instructions finding what temps are used, then walks again
+ * to remove instructions writing unused temps.
+ *
+ * This is an inefficient implementation if you have long chains of
+ * instructions where the entire chain is dead, but we expect those to have
+ * been eliminated at the NIR level, and here we're just cleaning up small
+ * problems produced by NIR->VIR.
+ */
+
+#include "v3d_compiler.h"
+
+static bool debug;
+
+/* Removes inst from the program, dumping it first when debug is on. */
+static void
+dce(struct v3d_compile *c, struct qinst *inst)
+{
+        if (debug) {
+                fprintf(stderr, "Removing: ");
+                vir_dump_inst(c, inst);
+                fprintf(stderr, "\n");
+        }
+
+        /* Callers must never remove an instruction that updates flags. */
+        assert(inst->qpu.flags.apf == V3D_QPU_PF_NONE);
+        assert(inst->qpu.flags.mpf == V3D_QPU_PF_NONE);
+
+        vir_remove_instruction(c, inst);
+}
+
+/* Returns whether inst has source reads with side effects (VPM/varying FIFO
+ * consumption) that prevent removing the instruction even when its result
+ * is unused.
+ */
+static bool
+has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst)
+{
+        for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                if (inst->src[i].file == QFILE_VPM) {
+                        /* Instance ID, Vertex ID: Should have been removed at
+                         * the NIR level
+                         */
+                        if (inst->src[i].index == ~0)
+                                return true;
+
+                        uint32_t attr = inst->src[i].index / 4;
+                        uint32_t offset = inst->src[i].index % 4;
+
+                        if (c->vattr_sizes[attr] != offset)
+                                return true;
+
+                        /* Can't get rid of the last VPM read, or the
+                         * simulator (at least) throws an error.
+                         *
+                         * (Use a distinct counter variable here so we don't
+                         * shadow the outer source index "i".)
+                         */
+                        uint32_t total_size = 0;
+                        for (uint32_t a = 0; a < ARRAY_SIZE(c->vattr_sizes); a++)
+                                total_size += c->vattr_sizes[a];
+                        if (total_size == 1)
+                                return true;
+                }
+
+                /* Dead code removal of varyings is tricky, so just assert
+                 * that it all happened at the NIR level.
+                 */
+                if (inst->src[i].file == QFILE_VARY)
+                        return true;
+        }
+
+        return false;
+}
+
+/* Removes instructions whose results are unused (or strips their unused
+ * destinations when the instruction itself must stay).  Returns true if
+ * any progress was made.
+ */
+bool
+vir_opt_dead_code(struct v3d_compile *c)
+{
+        bool progress = false;
+        bool *used = calloc(c->num_temps, sizeof(bool));
+
+        /* Don't dereference NULL if the allocation failed; just report no
+         * progress and let compilation continue unoptimized.
+         */
+        if (!used)
+                return false;
+
+        /* First pass: mark every temp that some instruction reads. */
+        vir_for_each_inst_inorder(inst, c) {
+                for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                        if (inst->src[i].file == QFILE_TEMP)
+                                used[inst->src[i].index] = true;
+                }
+        }
+
+        /* Second pass: drop instructions writing unused temps (or no
+         * destination at all).
+         */
+        vir_for_each_block(block, c) {
+                vir_for_each_inst_safe(inst, block) {
+                        if (inst->dst.file != QFILE_NULL &&
+                            !(inst->dst.file == QFILE_TEMP &&
+                              !used[inst->dst.index])) {
+                                continue;
+                        }
+
+                        if (vir_has_side_effects(c, inst))
+                                continue;
+
+                        if (inst->qpu.flags.apf != V3D_QPU_PF_NONE ||
+                            inst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
+                            has_nonremovable_reads(c, inst)) {
+                                /* If we can't remove the instruction, but we
+                                 * don't need its destination value, just
+                                 * remove the destination. The register
+                                 * allocator would trivially color it and it
+                                 * wouldn't cause any register pressure, but
+                                 * it's nicer to read the VIR code without
+                                 * unused destination regs.
+                                 */
+                                if (inst->dst.file == QFILE_TEMP) {
+                                        if (debug) {
+                                                fprintf(stderr,
+                                                        "Removing dst from: ");
+                                                vir_dump_inst(c, inst);
+                                                fprintf(stderr, "\n");
+                                        }
+                                        c->defs[inst->dst.index] = NULL;
+                                        inst->dst.file = QFILE_NULL;
+                                        progress = true;
+                                }
+                                continue;
+                        }
+
+                        /* Account for this instruction's removed VPM reads
+                         * in the attribute sizes.
+                         */
+                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                                if (inst->src[i].file != QFILE_VPM)
+                                        continue;
+                                uint32_t attr = inst->src[i].index / 4;
+                                uint32_t offset = (inst->src[i].index % 4);
+
+                                if (c->vattr_sizes[attr] == offset) {
+                                        c->num_inputs--;
+                                        c->vattr_sizes[attr]--;
+                                }
+                        }
+
+                        dce(c, inst);
+                        progress = true;
+                }
+        }
+
+        free(used);
+
+        return progress;
+}
--- /dev/null
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/ralloc.h"
+#include "util/register_allocate.h"
+#include "v3d_compiler.h"
+
+#define QPU_R(i) { .magic = false, .index = i }
+
+#define ACC_INDEX 0
+#define ACC_COUNT 5
+#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
+#define PHYS_COUNT 64
+
+/* Builds the compiler-wide RA register set and the per-threading-mode
+ * register classes.  Returns false on allocation failure.
+ */
+bool
+vir_init_reg_sets(struct v3d_compiler *compiler)
+{
+        compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
+                                          true);
+        if (!compiler->regs)
+                return false;
+
+        /* Allocate 3 regfile classes, for the ways the physical register file
+         * can be divided up for fragment shader threading.
+         */
+        for (int threads = 0; threads < 3; threads++) {
+                compiler->reg_class[threads] =
+                        ra_alloc_reg_class(compiler->regs);
+
+                /* Each extra thread level halves the physical registers
+                 * available per thread (PHYS_COUNT >> threads).
+                 */
+                for (int i = PHYS_INDEX;
+                     i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
+                        ra_class_add_reg(compiler->regs,
+                                         compiler->reg_class[threads], i);
+                }
+
+                /* The accumulators are included in every class. */
+                for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) {
+                        ra_class_add_reg(compiler->regs,
+                                         compiler->reg_class[threads], i);
+                }
+        }
+
+        ra_set_finalize(compiler->regs, NULL);
+
+        return true;
+}
+
+struct node_to_temp_map {
+        /* Temp index in the VIR QFILE_TEMP namespace. */
+        uint32_t temp;
+        /* Live-range length (temp_end - temp_start), used to order temps
+         * when assigning them to RA nodes.
+         */
+        uint32_t priority;
+};
+
+/* qsort comparator ordering node_to_temp_map entries by ascending priority.
+ *
+ * Compare explicitly rather than returning a->priority - b->priority: the
+ * fields are uint32_t, so the subtraction would be done unsigned and then
+ * converted to int, producing a wrong ordering whenever the difference
+ * exceeds INT_MAX.
+ */
+static int
+node_to_temp_priority(const void *in_a, const void *in_b)
+{
+        const struct node_to_temp_map *a = in_a;
+        const struct node_to_temp_map *b = in_b;
+
+        if (a->priority < b->priority)
+                return -1;
+        if (a->priority > b->priority)
+                return 1;
+        return 0;
+}
+
+#define CLASS_BIT_PHYS (1 << 0)
+#define CLASS_BIT_R0_R2 (1 << 1)
+#define CLASS_BIT_R3 (1 << 2)
+#define CLASS_BIT_R4 (1 << 3)
+
+/**
+ * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
+ *
+ * The return value should be freed by the caller.  On allocation failure,
+ * c->failed is set and NULL is returned.
+ */
+struct qpu_reg *
+v3d_register_allocate(struct v3d_compile *c)
+{
+        /* NOTE(review): these are VLAs sized by num_temps -- assumes shader
+         * temp counts stay small enough for the stack; confirm.
+         */
+        struct node_to_temp_map map[c->num_temps];
+        uint32_t temp_to_node[c->num_temps];
+        uint8_t class_bits[c->num_temps];
+        struct qpu_reg *temp_registers = calloc(c->num_temps,
+                                                sizeof(*temp_registers));
+        int acc_nodes[ACC_COUNT];
+
+        struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
+                                                         c->num_temps +
+                                                         ARRAY_SIZE(acc_nodes));
+
+        /* Make some fixed nodes for the accumulators, which we will need to
+         * interfere with when ops have implied r3/r4 writes or for the thread
+         * switches. We could represent these as classes for the nodes to
+         * live in, but the classes take up a lot of memory to set up, so we
+         * don't want to make too many.
+         */
+        for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
+                acc_nodes[i] = c->num_temps + i;
+                ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
+        }
+
+        /* Compute the live ranges so we can figure out interference. */
+        vir_calculate_live_intervals(c);
+
+        /* Order temps by live-range length so that node number correlates
+         * with range length.
+         */
+        for (uint32_t i = 0; i < c->num_temps; i++) {
+                map[i].temp = i;
+                map[i].priority = c->temp_end[i] - c->temp_start[i];
+        }
+        qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
+        for (uint32_t i = 0; i < c->num_temps; i++) {
+                temp_to_node[map[i].temp] = i;
+        }
+
+        /* Figure out our register classes and preallocated registers. We
+         * start with any temp being able to be in any file, then instructions
+         * incrementally remove bits that the temp definitely can't be in.
+         *
+         * NOTE(review): class_bits is currently only consumed by the #if 0
+         * THRSW code below; it is kept for when that gets enabled.
+         */
+        memset(class_bits,
+               CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4,
+               sizeof(class_bits));
+
+        int ip = 0;
+        vir_for_each_inst_inorder(inst, c) {
+                /* If the instruction writes r3/r4 (and optionally moves its
+                 * result to a temp), nothing else can be stored in r3/r4 across
+                 * it.
+                 */
+                if (vir_writes_r3(inst)) {
+                        for (int i = 0; i < c->num_temps; i++) {
+                                if (c->temp_start[i] < ip &&
+                                    c->temp_end[i] > ip) {
+                                        ra_add_node_interference(g,
+                                                                 temp_to_node[i],
+                                                                 acc_nodes[3]);
+                                }
+                        }
+                }
+                if (vir_writes_r4(inst)) {
+                        for (int i = 0; i < c->num_temps; i++) {
+                                if (c->temp_start[i] < ip &&
+                                    c->temp_end[i] > ip) {
+                                        ra_add_node_interference(g,
+                                                                 temp_to_node[i],
+                                                                 acc_nodes[4]);
+                                }
+                        }
+                }
+
+                if (inst->src[0].file == QFILE_REG) {
+                        switch (inst->src[0].index) {
+                        case 0:
+                        case 1:
+                        case 2:
+                                /* Payload setup instructions: Force allocate
+                                 * the dst to the given register (so the MOV
+                                 * will disappear).
+                                 */
+                                assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
+                                assert(inst->dst.file == QFILE_TEMP);
+                                ra_set_node_reg(g,
+                                                temp_to_node[inst->dst.index],
+                                                PHYS_INDEX +
+                                                inst->src[0].index);
+                                break;
+                        }
+                }
+
+#if 0
+                switch (inst->op) {
+                case QOP_THRSW:
+                        /* All accumulators are invalidated across a thread
+                         * switch.
+                         */
+                        for (int i = 0; i < c->num_temps; i++) {
+                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
+                                        class_bits[i] &= ~(CLASS_BIT_R0_R3 |
+                                                           CLASS_BIT_R4);
+                        }
+                        break;
+
+                default:
+                        break;
+                }
+#endif
+
+                ip++;
+        }
+
+        /* All temps use the class for the current threading mode. */
+        for (uint32_t i = 0; i < c->num_temps; i++) {
+                ra_set_node_class(g, temp_to_node[i],
+                                  c->compiler->reg_class[c->fs_threaded]);
+        }
+
+        /* Any two temps with overlapping live ranges interfere. */
+        for (uint32_t i = 0; i < c->num_temps; i++) {
+                for (uint32_t j = i + 1; j < c->num_temps; j++) {
+                        if (!(c->temp_start[i] >= c->temp_end[j] ||
+                              c->temp_start[j] >= c->temp_end[i])) {
+                                ra_add_node_interference(g,
+                                                         temp_to_node[i],
+                                                         temp_to_node[j]);
+                        }
+                }
+        }
+
+        bool ok = ra_allocate(g);
+        if (!ok) {
+                /* When threaded, the caller may retry unthreaded, so only
+                 * complain when there is no fallback.
+                 */
+                if (!c->fs_threaded) {
+                        fprintf(stderr, "Failed to register allocate:\n");
+                        vir_dump(c);
+                }
+
+                c->failed = true;
+                free(temp_registers);
+                return NULL;
+        }
+
+        /* Translate RA's node assignments back into QPU register encodings:
+         * low indices are the accumulators (magic waddrs), the rest the
+         * physical register file.
+         */
+        for (uint32_t i = 0; i < c->num_temps; i++) {
+                int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
+                if (ra_reg < PHYS_INDEX) {
+                        temp_registers[i].magic = true;
+                        temp_registers[i].index = (V3D_QPU_WADDR_R0 +
+                                                   ra_reg - ACC_INDEX);
+                } else {
+                        temp_registers[i].magic = false;
+                        temp_registers[i].index = ra_reg - PHYS_INDEX;
+                }
+
+                /* If the value's never used, just write to the NOP register
+                 * for clarity in debug output.
+                 */
+                if (c->temp_start[i] == c->temp_end[i]) {
+                        temp_registers[i].magic = true;
+                        temp_registers[i].index = V3D_QPU_WADDR_NOP;
+                }
+        }
+
+        ralloc_free(g);
+
+        return temp_registers;
+}
--- /dev/null
+/*
+ * Copyright © 2016 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/v3d_compiler.h"
+#include "qpu/qpu_instr.h"
+#include "qpu/qpu_disasm.h"
+
+/* Builds a qpu_reg naming entry "index" of the physical register file. */
+static inline struct qpu_reg
+qpu_reg(int index)
+{
+        return (struct qpu_reg){
+                .magic = false,
+                .index = index,
+        };
+}
+
+/* Builds a qpu_reg naming a magic write address (accumulators, NOP, ...). */
+static inline struct qpu_reg
+qpu_magic(enum v3d_qpu_waddr waddr)
+{
+        return (struct qpu_reg){
+                .magic = true,
+                .index = waddr,
+        };
+}
+
+/* Returns the magic register corresponding to accumulator rN. */
+static inline struct qpu_reg
+qpu_acc(int acc)
+{
+        return qpu_magic(acc + V3D_QPU_WADDR_R0);
+}
+
+/* Returns an ALU instruction encoding a full NOP: both the add and mul
+ * channels execute NOP and write to the magic NOP address.
+ */
+struct v3d_qpu_instr
+v3d_qpu_nop(void)
+{
+        struct v3d_qpu_instr instr = {
+                .type = V3D_QPU_INSTR_TYPE_ALU,
+                .alu = {
+                        .add = {
+                                .op = V3D_QPU_A_NOP,
+                                .waddr = V3D_QPU_WADDR_NOP,
+                                .magic_write = true,
+                        },
+                        .mul = {
+                                .op = V3D_QPU_M_NOP,
+                                .waddr = V3D_QPU_WADDR_NOP,
+                                .magic_write = true,
+                        },
+                }
+        };
+
+        return instr;
+}
+
+/* Allocates a VIR instruction that encodes to a QPU NOP. */
+static struct qinst *
+vir_nop(void)
+{
+        struct qreg no_reg = { QFILE_NULL, 0 };
+
+        return vir_add_inst(V3D_QPU_A_NOP, no_reg, no_reg, no_reg);
+}
+
+/* Inserts a fresh NOP instruction immediately before inst and returns it. */
+static struct qinst *
+new_qpu_nop_before(struct qinst *inst)
+{
+        struct qinst *nop = vir_nop();
+
+        /* list_addtail() on inst's own link places nop just before inst. */
+        list_addtail(&nop->link, &inst->link);
+
+        return nop;
+}
+
+/* Emits an ldunif-signaled NOP before inst, loading the uniform referenced
+ * by inst's source i.
+ */
+static void
+new_ldunif_instr(struct qinst *inst, int i)
+{
+        assert(inst->src[i].file == QFILE_UNIF);
+
+        struct qinst *ldunif = new_qpu_nop_before(inst);
+        ldunif->qpu.sig.ldunif = true;
+        ldunif->uniform = inst->src[i].index;
+}
+
+/**
+ * Allocates the src register (accumulator or register file) into the RADDR
+ * fields of the instruction.
+ */
+static void
+set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+{
+        /* Accumulators map directly onto the R0..R5 mux values. */
+        if (src.magic) {
+                assert(src.index >= V3D_QPU_WADDR_R0 &&
+                       src.index <= V3D_QPU_WADDR_R5);
+                *mux = src.index - V3D_QPU_WADDR_R0 + V3D_QPU_MUX_R0;
+                return;
+        }
+
+        /* Use raddr_a if no operand of either channel has claimed it yet. */
+        if (instr->alu.add.a != V3D_QPU_MUX_A &&
+            instr->alu.add.b != V3D_QPU_MUX_A &&
+            instr->alu.mul.a != V3D_QPU_MUX_A &&
+            instr->alu.mul.b != V3D_QPU_MUX_A) {
+                instr->raddr_a = src.index;
+                *mux = V3D_QPU_MUX_A;
+        } else {
+                /* raddr_a is taken; reuse it if it already holds the right
+                 * register, otherwise fall back to raddr_b.
+                 */
+                if (instr->raddr_a == src.index) {
+                        *mux = V3D_QPU_MUX_A;
+                } else {
+                        /* raddr_b must either still be free or already hold
+                         * the register we need.
+                         */
+                        assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
+                                 instr->alu.add.b == V3D_QPU_MUX_B &&
+                                 instr->alu.mul.a == V3D_QPU_MUX_B &&
+                                 instr->alu.mul.b == V3D_QPU_MUX_B) ||
+                               src.index == instr->raddr_b);
+
+                        instr->raddr_b = src.index;
+                        *mux = V3D_QPU_MUX_B;
+                }
+        }
+}
+
+/* Translates one block's VIR instructions into QPU encodings in place:
+ * fills in raddr/waddr fields from the register allocation map and inserts
+ * sideband loads (ldunif/ldvary/ldvpm NOPs) where sources require them.
+ */
+static void
+v3d_generate_code_block(struct v3d_compile *c,
+                        struct qblock *block,
+                        struct qpu_reg *temp_registers)
+{
+        int last_vpm_read_index = -1;
+
+        vir_for_each_inst(qinst, block) {
+#if 0
+                fprintf(stderr, "translating qinst to qpu: ");
+                vir_dump_inst(c, qinst);
+                fprintf(stderr, "\n");
+#endif
+
+                struct qinst *temp;
+
+                if (vir_has_implicit_uniform(qinst)) {
+                        int src = vir_get_implicit_uniform_src(qinst);
+                        assert(qinst->src[src].file == QFILE_UNIF);
+                        qinst->uniform = qinst->src[src].index;
+                        c->num_uniforms++;
+                }
+
+                /* Map each VIR source onto a QPU register, emitting any
+                 * sideband instruction needed to produce the value.
+                 */
+                int nsrc = vir_get_non_sideband_nsrc(qinst);
+                struct qpu_reg src[ARRAY_SIZE(qinst->src)];
+                bool emitted_ldunif = false;
+                for (int i = 0; i < nsrc; i++) {
+                        int index = qinst->src[i].index;
+                        switch (qinst->src[i].file) {
+                        case QFILE_REG:
+                                src[i] = qpu_reg(qinst->src[i].index);
+                                break;
+                        case QFILE_MAGIC:
+                                src[i] = qpu_magic(qinst->src[i].index);
+                                break;
+                        case QFILE_NULL:
+                        case QFILE_LOAD_IMM:
+                                src[i] = qpu_acc(0);
+                                break;
+                        case QFILE_TEMP:
+                                src[i] = temp_registers[index];
+                                break;
+                        case QFILE_UNIF:
+                                /* Uniforms arrive in r5 via the ldunif
+                                 * sideband; only one load per instruction.
+                                 */
+                                if (!emitted_ldunif) {
+                                        new_ldunif_instr(qinst, i);
+                                        c->num_uniforms++;
+                                        emitted_ldunif = true;
+                                }
+
+                                src[i] = qpu_acc(5);
+                                break;
+                        case QFILE_VARY:
+                                /* Varyings arrive in r3 via ldvary. */
+                                temp = new_qpu_nop_before(qinst);
+                                temp->qpu.sig.ldvary = true;
+
+                                src[i] = qpu_acc(3);
+                                break;
+                        case QFILE_SMALL_IMM:
+                                abort(); /* XXX */
+#if 0
+                                src[i].mux = QPU_MUX_SMALL_IMM;
+                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
+                                /* This should only have returned a valid
+                                 * small immediate field, not ~0 for failure.
+                                 */
+                                assert(src[i].addr <= 47);
+#endif
+                                break;
+
+                        case QFILE_VPM:
+                                /* VPM reads must happen in order. */
+                                assert((int)qinst->src[i].index >=
+                                       last_vpm_read_index);
+                                (void)last_vpm_read_index;
+                                last_vpm_read_index = qinst->src[i].index;
+
+                                temp = new_qpu_nop_before(qinst);
+                                temp->qpu.sig.ldvpm = true;
+
+                                src[i] = qpu_acc(3);
+                                break;
+
+                        case QFILE_TLB:
+                        case QFILE_TLBU:
+                                unreachable("bad vir src file");
+                        }
+                }
+
+                /* Map the VIR destination onto a QPU write address. */
+                struct qpu_reg dst;
+                switch (qinst->dst.file) {
+                case QFILE_NULL:
+                        dst = qpu_magic(V3D_QPU_WADDR_NOP);
+                        break;
+
+                case QFILE_REG:
+                        dst = qpu_reg(qinst->dst.index);
+                        break;
+
+                case QFILE_MAGIC:
+                        dst = qpu_magic(qinst->dst.index);
+                        break;
+
+                case QFILE_TEMP:
+                        dst = temp_registers[qinst->dst.index];
+                        break;
+
+                case QFILE_VPM:
+                        dst = qpu_magic(V3D_QPU_WADDR_VPM);
+                        break;
+
+                case QFILE_TLB:
+                        dst = qpu_magic(V3D_QPU_WADDR_TLB);
+                        break;
+
+                case QFILE_TLBU:
+                        dst = qpu_magic(V3D_QPU_WADDR_TLBU);
+                        break;
+
+                case QFILE_VARY:
+                case QFILE_UNIF:
+                case QFILE_SMALL_IMM:
+                case QFILE_LOAD_IMM:
+                        assert(!"not reached");
+                        break;
+                }
+
+                /* Wire the operands into whichever ALU channel (add or mul)
+                 * the instruction uses; branches need no operand wiring.
+                 */
+                if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
+                        if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
+                                assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+                                if (nsrc >= 1) {
+                                        set_src(&qinst->qpu,
+                                                &qinst->qpu.alu.add.a, src[0]);
+                                }
+                                if (nsrc >= 2) {
+                                        set_src(&qinst->qpu,
+                                                &qinst->qpu.alu.add.b, src[1]);
+                                }
+
+                                qinst->qpu.alu.add.waddr = dst.index;
+                                qinst->qpu.alu.add.magic_write = dst.magic;
+                        } else {
+                                if (nsrc >= 1) {
+                                        set_src(&qinst->qpu,
+                                                &qinst->qpu.alu.mul.a, src[0]);
+                                }
+                                if (nsrc >= 2) {
+                                        set_src(&qinst->qpu,
+                                                &qinst->qpu.alu.mul.b, src[1]);
+                                }
+
+                                qinst->qpu.alu.mul.waddr = dst.index;
+                                qinst->qpu.alu.mul.magic_write = dst.magic;
+                        }
+                } else {
+                        assert(qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
+                }
+        }
+}
+
+
+/* Dumps the packed QPU instruction stream with disassembly to stderr. */
+static void
+v3d_dump_qpu(struct v3d_compile *c)
+{
+        fprintf(stderr, "%s prog %d/%d QPU:\n",
+                vir_get_stage_name(c),
+                c->program_id, c->variant_id);
+
+        for (int i = 0; i < c->qpu_inst_count; i++) {
+                /* NOTE(review): the disasm string looks allocated -- confirm
+                 * whether it is owned by a ralloc context or leaks here.
+                 */
+                const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]);
+                fprintf(stderr, "0x%016"PRIx64" %s\n", c->qpu_insts[i], str);
+        }
+        fprintf(stderr, "\n");
+}
+
+/* Lowers the VIR program to packed QPU instructions: register allocates,
+ * rewrites each block's instructions in place, appends the terminating
+ * THRSW, schedules, and packs into c->qpu_insts.
+ */
+void
+v3d_vir_to_qpu(struct v3d_compile *c)
+{
+        struct qpu_reg *temp_registers = v3d_register_allocate(c);
+        struct qblock *end_block = list_last_entry(&c->blocks,
+                                                   struct qblock, link);
+
+        /* Register allocation can fail (c->failed is then set and NULL
+         * returned); don't walk the blocks with a NULL temp mapping.
+         */
+        if (!temp_registers)
+                return;
+
+        /* Reset the uniform count to how many will be actually loaded by the
+         * generated QPU code.
+         */
+        c->num_uniforms = 0;
+
+        vir_for_each_block(block, c)
+                v3d_generate_code_block(c, block, temp_registers);
+
+        /* Terminate the program with a THRSW signal on a trailing NOP. */
+        struct qinst *thrsw = vir_nop();
+        list_addtail(&thrsw->link, &end_block->instructions);
+        thrsw->qpu.sig.thrsw = true;
+
+        uint32_t cycles = v3d_qpu_schedule_instructions(c);
+
+        c->qpu_insts = rzalloc_array(c, uint64_t, c->qpu_inst_count);
+        int i = 0;
+        vir_for_each_inst_inorder(inst, c) {
+                bool ok = v3d_qpu_instr_pack(c->devinfo, &inst->qpu,
+                                             &c->qpu_insts[i++]);
+                assert(ok); (void) ok;
+        }
+        assert(i == c->qpu_inst_count);
+
+        if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
+                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
+                        vir_get_stage_name(c),
+                        c->program_id, c->variant_id,
+                        cycles);
+        }
+
+        if (V3D_DEBUG & (V3D_DEBUG_QPU |
+                         v3d_debug_flag_for_shader_stage(c->s->stage))) {
+                v3d_dump_qpu(c);
+        }
+
+        qpu_validate(c);
+
+        free(temp_registers);
+}