*/
#include <inttypes.h>
-#include "util/u_format.h"
+#include "util/format/u_format.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/ralloc.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"
+/* We don't do any address packing. */
+#define __gen_user_data void
+#define __gen_address_type uint32_t
+#define __gen_address_offset(reloc) (*reloc)
+#define __gen_emit_reloc(cl, reloc)
+#include "cle/v3d_packet_v41_pack.h"
+
#define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7)
#define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7)
-#define GENERAL_TMU_READ_OP_PREFETCH (0 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_CLEAR (1 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_FLUSH (3 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_CLEAN (3 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_L1T_CLEAR (4 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_L1T_FLUSH_AGGREGATION (5 << 3)
-#define GENERAL_TMU_READ_OP_ATOMIC_INC (8 << 3)
-#define GENERAL_TMU_READ_OP_ATOMIC_DEC (9 << 3)
-#define GENERAL_TMU_READ_OP_ATOMIC_NOT (10 << 3)
-#define GENERAL_TMU_READ_OP_READ (15 << 3)
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP (0 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_SUB_WRAP (1 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_XCHG (2 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG (3 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_UMIN (4 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_UMAX (5 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_SMIN (6 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_SMAX (7 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_AND (8 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_OR (9 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_XOR (10 << 3)
-#define GENERAL_TMU_WRITE_OP_WRITE (15 << 3)
-
#define V3D_TSY_SET_QUORUM 0
#define V3D_TSY_INC_WAITERS 1
#define V3D_TSY_DEC_WAITERS 2
c->last_thrsw = vir_NOP(c);
c->last_thrsw->qpu.sig.thrsw = true;
c->last_thrsw_at_top_level = !c->in_control_flow;
+
+ /* We need to lock the scoreboard before any tlb access happens. If this
+ * thread switch comes after we have emitted a tlb load, then it means
+ * that we can't lock on the last thread switch any more.
+ */
+ if (c->emitted_tlb_load)
+ c->lock_scoreboard_on_first_thrsw = true;
+}
+
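+/* Picks the TMU op for an SSBO/shared atomic add: constant adds of 1/-1 map
+ * to the dedicated inc/dec ops (which need no data write), anything else
+ * falls back to the regular add op.
+ */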
+uint32_t
+v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
+{
+ if (nir_src_is_const(instr->src[src])) {
+ int64_t add_val = nir_src_as_int(instr->src[src]);
+ if (add_val == 1)
+ return V3D_TMU_OP_WRITE_AND_READ_INC;
+ else if (add_val == -1)
+ return V3D_TMU_OP_WRITE_OR_READ_DEC;
+ }
+
+ return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
}
static uint32_t
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_uniform:
case nir_intrinsic_load_shared:
- return GENERAL_TMU_READ_OP_READ;
+ case nir_intrinsic_load_scratch:
case nir_intrinsic_store_ssbo:
case nir_intrinsic_store_shared:
- return GENERAL_TMU_WRITE_OP_WRITE;
+ case nir_intrinsic_store_scratch:
+ return V3D_TMU_OP_REGULAR;
case nir_intrinsic_ssbo_atomic_add:
+ return v3d_get_op_for_atomic_add(instr, 2);
case nir_intrinsic_shared_atomic_add:
- return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
+ return v3d_get_op_for_atomic_add(instr, 1);
case nir_intrinsic_ssbo_atomic_imin:
case nir_intrinsic_shared_atomic_imin:
- return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
+ return V3D_TMU_OP_WRITE_SMIN;
case nir_intrinsic_ssbo_atomic_umin:
case nir_intrinsic_shared_atomic_umin:
- return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
+ return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
case nir_intrinsic_ssbo_atomic_imax:
case nir_intrinsic_shared_atomic_imax:
- return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
+ return V3D_TMU_OP_WRITE_SMAX;
case nir_intrinsic_ssbo_atomic_umax:
case nir_intrinsic_shared_atomic_umax:
- return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
+ return V3D_TMU_OP_WRITE_UMAX;
case nir_intrinsic_ssbo_atomic_and:
case nir_intrinsic_shared_atomic_and:
- return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
+ return V3D_TMU_OP_WRITE_AND_READ_INC;
case nir_intrinsic_ssbo_atomic_or:
case nir_intrinsic_shared_atomic_or:
- return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
+ return V3D_TMU_OP_WRITE_OR_READ_DEC;
case nir_intrinsic_ssbo_atomic_xor:
case nir_intrinsic_shared_atomic_xor:
- return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
+ return V3D_TMU_OP_WRITE_XOR_READ_NOT;
case nir_intrinsic_ssbo_atomic_exchange:
case nir_intrinsic_shared_atomic_exchange:
- return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
+ return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
case nir_intrinsic_ssbo_atomic_comp_swap:
case nir_intrinsic_shared_atomic_comp_swap:
- return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
+ return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
default:
unreachable("unknown intrinsic op");
}
*/
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
- bool is_shared)
+ bool is_shared_or_scratch)
{
- /* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR
- * wants to have support for inc/dec?
+ uint32_t tmu_op = v3d_general_tmu_op(instr);
+
+ /* If we were able to replace atomic_add with an inc/dec, then we
+ * need to do things slightly differently, like not loading the
+ * amount to add/sub, as that is implicit.
*/
+ bool atomic_add_replaced =
+ ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
+ instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
+ (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
+ tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
- uint32_t tmu_op = v3d_general_tmu_op(instr);
bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
+ instr->intrinsic == nir_intrinsic_store_scratch ||
instr->intrinsic == nir_intrinsic_store_shared);
- bool has_index = !is_shared;
+
+ bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
+ instr->intrinsic == nir_intrinsic_load_ubo ||
+ instr->intrinsic == nir_intrinsic_load_ssbo ||
+ instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_load_shared);
+
+ if (!is_load)
+ c->tmu_dirty_rcl = true;
+
+ bool has_index = !is_shared_or_scratch;
int offset_src;
- int tmu_writes = 1; /* address */
if (instr->intrinsic == nir_intrinsic_load_uniform) {
offset_src = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
instr->intrinsic == nir_intrinsic_load_ubo ||
- instr->intrinsic == nir_intrinsic_load_shared) {
+ instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_load_shared ||
+ atomic_add_replaced) {
offset_src = 0 + has_index;
} else if (is_store) {
offset_src = 1 + has_index;
- for (int i = 0; i < instr->num_components; i++) {
- vir_MOV_dest(c,
- vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- ntq_get_src(c, instr->src[0], i));
- tmu_writes++;
- }
} else {
offset_src = 0 + has_index;
- vir_MOV_dest(c,
- vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- ntq_get_src(c, instr->src[1 + has_index], 0));
- tmu_writes++;
- if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
- vir_MOV_dest(c,
- vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- ntq_get_src(c, instr->src[2 + has_index],
- 0));
- tmu_writes++;
- }
}
+ bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
uint32_t const_offset = 0;
- if (nir_src_is_const(instr->src[offset_src]))
+ if (!dynamic_src)
const_offset = nir_src_as_uint(instr->src[offset_src]);
- /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
- * storing at the same time.
- */
- while (tmu_writes > 16 / c->threads)
- c->threads /= 2;
-
- struct qreg offset;
+ struct qreg base_offset;
if (instr->intrinsic == nir_intrinsic_load_uniform) {
const_offset += nir_intrinsic_base(instr);
- offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
- v3d_unit_data_create(0, const_offset));
+ base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+ v3d_unit_data_create(0, const_offset));
const_offset = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ubo) {
uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
/* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
* 1 (0 is gallium's constant buffer 0).
*/
- offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
- v3d_unit_data_create(index, const_offset));
+ base_offset =
+ vir_uniform(c, QUNIFORM_UBO_ADDR,
+ v3d_unit_data_create(index, const_offset));
const_offset = 0;
- } else if (is_shared) {
- /* Shared variables have no buffer index, and all start from a
- * common base that we set up at the start of dispatch
+ } else if (is_shared_or_scratch) {
+ /* Shared and scratch variables have no buffer index, and all
+ * start from a common base that we set up at the start of
+ * dispatch.
*/
- offset = c->cs_shared_offset;
+ if (instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_store_scratch) {
+ base_offset = c->spill_base;
+ } else {
+ base_offset = c->cs_shared_offset;
+ const_offset += nir_intrinsic_base(instr);
+ }
} else {
- offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
- nir_src_as_uint(instr->src[is_store ?
- 1 : 0]));
+ base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
+ nir_src_as_uint(instr->src[is_store ?
+ 1 : 0]));
}
- uint32_t config = (0xffffff00 |
- tmu_op |
- GENERAL_TMU_LOOKUP_PER_PIXEL);
- if (instr->num_components == 1) {
- config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
- } else {
- config |= (GENERAL_TMU_LOOKUP_TYPE_VEC2 +
- instr->num_components - 2);
- }
+ struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
+ unsigned writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
+ uint32_t base_const_offset = const_offset;
+ int first_component = -1;
+ int last_component = -1;
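+
+ /* Stores may need several TMU operations when the writemask has gaps,
+ * since each pass can only write a run of consecutive components;
+ * loads and atomics go through this loop exactly once.
+ */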
+ do {
+ int tmu_writes = 1; /* address */
- if (vir_in_nonuniform_control_flow(c)) {
- vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
- V3D_QPU_PF_PUSHZ);
- }
+ if (is_store) {
+ /* Find the first set of consecutive components that
+ * are enabled in the writemask and emit the TMUD
+ * instructions for them.
+ */
+ first_component = ffs(writemask) - 1;
+ last_component = first_component;
+ while (writemask & BITFIELD_BIT(last_component + 1))
+ last_component++;
+
+ assert(first_component >= 0 &&
+ first_component <= last_component &&
+ last_component < instr->num_components);
+
+ for (int i = first_component; i <= last_component; i++) {
+ struct qreg data =
+ ntq_get_src(c, instr->src[0], i);
+ vir_MOV_dest(c, tmud, data);
+ tmu_writes++;
+ }
- struct qreg dest;
- if (config == ~0)
- dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
- else
- dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+ /* Update the offset for the TMU write based on the
+ * first component we are writing.
+ */
+ const_offset = base_const_offset + first_component * 4;
+
+ /* Clear these components from the writemask */
+ uint32_t written_mask =
+ BITFIELD_RANGE(first_component, tmu_writes - 1);
+ writemask &= ~written_mask;
+ } else if (!is_load && !atomic_add_replaced) {
+ struct qreg data =
+ ntq_get_src(c, instr->src[1 + has_index], 0);
+ vir_MOV_dest(c, tmud, data);
+ tmu_writes++;
+ if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
+ data = ntq_get_src(c, instr->src[2 + has_index],
+ 0);
+ vir_MOV_dest(c, tmud, data);
+ tmu_writes++;
+ }
+ }
- struct qinst *tmu;
- if (nir_src_is_const(instr->src[offset_src]) && const_offset == 0) {
- tmu = vir_MOV_dest(c, dest, offset);
- } else {
- tmu = vir_ADD_dest(c, dest,
- offset,
- ntq_get_src(c, instr->src[offset_src], 0));
- }
+ /* Make sure we won't exceed the 16-entry TMU fifo if each
+ * thread is storing at the same time.
+ */
+ while (tmu_writes > 16 / c->threads)
+ c->threads /= 2;
- if (config != ~0) {
- tmu->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
- config);
- }
+ /* The spec says that for atomics, the TYPE field is ignored,
+ * but that doesn't seem to be the case for CMPXCHG. Just use
+ * the number of tmud writes we did to decide the type (or
+ * choose "32bit" for atomic reads, which has been fine).
+ */
+ uint32_t num_components;
+ if (is_load || atomic_add_replaced) {
+ num_components = instr->num_components;
+ } else {
+ assert(tmu_writes > 1);
+ num_components = tmu_writes - 1;
+ }
- if (vir_in_nonuniform_control_flow(c))
- vir_set_cond(tmu, V3D_QPU_COND_IFA);
+ uint32_t config = (0xffffff00 |
+ tmu_op << 3 |
+ GENERAL_TMU_LOOKUP_PER_PIXEL);
+ if (num_components == 1) {
+ config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+ } else {
+ config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
+ num_components - 2;
+ }
- vir_emit_thrsw(c);
+ if (vir_in_nonuniform_control_flow(c)) {
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
- /* Read the result, or wait for the TMU op to complete. */
- for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
- ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
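+
+ /* An all-ones config selects the default general lookup, in which
+ * case the address can go straight to TMUA; otherwise we use TMUAU,
+ * which takes the config word as the instruction's uniform.
+ */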
+ struct qreg tmua;
+ if (config == ~0)
+ tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+ else
+ tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+
+ struct qinst *tmu;
+ if (dynamic_src) {
+ struct qreg offset = base_offset;
+ if (const_offset != 0) {
+ offset = vir_ADD(c, offset,
+ vir_uniform_ui(c, const_offset));
+ }
+ struct qreg data =
+ ntq_get_src(c, instr->src[offset_src], 0);
+ tmu = vir_ADD_dest(c, tmua, offset, data);
+ } else {
+ if (const_offset != 0) {
+ tmu = vir_ADD_dest(c, tmua, base_offset,
+ vir_uniform_ui(c, const_offset));
+ } else {
+ tmu = vir_MOV_dest(c, tmua, base_offset);
+ }
+ }
- if (nir_intrinsic_dest_components(instr) == 0)
- vir_TMUWT(c);
+ if (config != ~0) {
+ tmu->uniform =
+ vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+ config);
+ }
+
+ if (vir_in_nonuniform_control_flow(c))
+ vir_set_cond(tmu, V3D_QPU_COND_IFA);
+
+ vir_emit_thrsw(c);
+
+ /* Read the result, or wait for the TMU op to complete. */
+ for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
+ ntq_store_dest(c, &instr->dest, i,
+ vir_MOV(c, vir_LDTMU(c)));
+ }
+
+ if (nir_intrinsic_dest_components(instr) == 0)
+ vir_TMUWT(c);
+ } while (is_store && writemask != 0);
}
static struct qreg *
return qregs;
}
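+/* Instructions carrying one of these load signals can't be predicated for a
+ * store into a nir_register, so ntq_store_dest() emits an extra MOV in that
+ * case.
+ */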
+static bool
+is_ld_signal(const struct v3d_qpu_sig *sig)
+{
+ return (sig->ldunif ||
+ sig->ldunifa ||
+ sig->ldunifrf ||
+ sig->ldunifarf ||
+ sig->ldtmu ||
+ sig->ldvary ||
+ sig->ldvpm ||
+ sig->ldtlb ||
+ sig->ldtlbu);
+}
+
/**
* This function is responsible for getting VIR results into the associated
* storage for a NIR instruction.
struct qreg result)
{
struct qinst *last_inst = NULL;
- if (!list_empty(&c->cur_block->instructions))
+ if (!list_is_empty(&c->cur_block->instructions))
last_inst = (struct qinst *)c->cur_block->instructions.prev;
assert((result.file == QFILE_TEMP &&
_mesa_hash_table_search(c->def_ht, reg);
struct qreg *qregs = entry->data;
- /* Insert a MOV if the source wasn't an SSA def in the
- * previous instruction.
+ /* If the previous instruction can't be predicated for
+ * the store into the nir_register, then emit a MOV
+ * that can be.
*/
- if ((vir_in_nonuniform_control_flow(c) &&
- c->defs[last_inst->dst.index]->qpu.sig.ldunif)) {
+ if (vir_in_nonuniform_control_flow(c) &&
+ is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig)) {
result = vir_MOV(c, result);
last_inst = c->defs[result.index];
}
struct qreg result;
switch (instr->op) {
- case nir_op_fmov:
- case nir_op_imov:
+ case nir_op_mov:
result = vir_MOV(c, src[0]);
break;
case nir_op_sge:
case nir_op_slt: {
enum v3d_qpu_cond cond;
- MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+ ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
assert(ok);
result = vir_MOV(c, vir_SEL(c, cond,
vir_uniform_f(c, 1.0),
case nir_op_ilt32:
case nir_op_ult32: {
enum v3d_qpu_cond cond;
- MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+ ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
assert(ok);
result = vir_MOV(c, vir_SEL(c, cond,
vir_uniform_ui(c, ~0),
/* Stencil is a single 32-bit write. */
#define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
+static void
+vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
+{
+ if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
+ return;
+
+ struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
+ struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
+
+ nir_variable *var = c->output_color_var[rt];
+ int num_components = glsl_get_vector_elements(var->type);
+ uint32_t conf = 0xffffff00;
+ struct qinst *inst;
+
+ conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
+ TLB_SAMPLE_MODE_PER_PIXEL;
+ conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
+
+ if (c->fs_key->swap_color_rb & (1 << rt))
+ num_components = MAX2(num_components, 3);
+ assert(num_components != 0);
+
+ enum glsl_base_type type = glsl_get_base_type(var->type);
+ bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
+ bool is_32b_tlb_format = is_int_format ||
+ (c->fs_key->f32_color_rb & (1 << rt));
+
+ if (is_int_format) {
+ /* The F32 vs I32 distinction was dropped in 4.2. */
+ if (c->devinfo->ver < 42)
+ conf |= TLB_TYPE_I32_COLOR;
+ else
+ conf |= TLB_TYPE_F32_COLOR;
+ conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
+ } else {
+ if (c->fs_key->f32_color_rb & (1 << rt)) {
+ conf |= TLB_TYPE_F32_COLOR;
+ conf |= ((num_components - 1) <<
+ TLB_VEC_SIZE_MINUS_1_SHIFT);
+ } else {
+ conf |= TLB_TYPE_F16_COLOR;
+ conf |= TLB_F16_SWAP_HI_LO;
+ if (num_components >= 3)
+ conf |= TLB_VEC_SIZE_4_F16;
+ else
+ conf |= TLB_VEC_SIZE_2_F16;
+ }
+ }
+
+ int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
+ for (int i = 0; i < num_samples; i++) {
+ struct qreg *color = c->msaa_per_sample_output ?
+ &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
+ &c->outputs[var->data.driver_location * 4];
+
+ struct qreg r = color[0];
+ struct qreg g = color[1];
+ struct qreg b = color[2];
+ struct qreg a = color[3];
+
+ if (c->fs_key->swap_color_rb & (1 << rt)) {
+ r = color[2];
+ b = color[0];
+ }
+
+ if (c->fs_key->sample_alpha_to_one)
+ a = vir_uniform_f(c, 1.0);
+
+ if (is_32b_tlb_format) {
+ if (i == 0) {
+ inst = vir_MOV_dest(c, tlbu_reg, r);
+ inst->uniform =
+ vir_get_uniform_index(c,
+ QUNIFORM_CONSTANT,
+ conf);
+ } else {
+ inst = vir_MOV_dest(c, tlb_reg, r);
+ }
+
+ if (num_components >= 2)
+ vir_MOV_dest(c, tlb_reg, g);
+ if (num_components >= 3)
+ vir_MOV_dest(c, tlb_reg, b);
+ if (num_components >= 4)
+ vir_MOV_dest(c, tlb_reg, a);
+ } else {
+ inst = vir_VFPACK_dest(c, tlb_reg, r, g);
+ if (conf != ~0 && i == 0) {
+ inst->dst = tlbu_reg;
+ inst->uniform =
+ vir_get_uniform_index(c,
+ QUNIFORM_CONSTANT,
+ conf);
+ }
+
+ if (num_components >= 3)
+ inst = vir_VFPACK_dest(c, tlb_reg, b, a);
+ }
+ }
+}
+
static void
emit_frag_end(struct v3d_compile *c)
{
vir_FTOC(c, color[3])));
}
- struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
if (c->output_position_index != -1) {
struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
/* XXX: Performance improvement: Merge Z write and color writes TLB
* uniform setup
*/
+ for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
+ vir_emit_tlb_color_write(c, rt);
+}
- for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
- if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
- continue;
-
- nir_variable *var = c->output_color_var[rt];
- struct qreg *color = &c->outputs[var->data.driver_location * 4];
- int num_components = glsl_get_vector_elements(var->type);
- uint32_t conf = 0xffffff00;
- struct qinst *inst;
-
- conf |= TLB_SAMPLE_MODE_PER_PIXEL;
- conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
-
- if (c->fs_key->swap_color_rb & (1 << rt))
- num_components = MAX2(num_components, 3);
-
- assert(num_components != 0);
- switch (glsl_get_base_type(var->type)) {
- case GLSL_TYPE_UINT:
- case GLSL_TYPE_INT:
- /* The F32 vs I32 distinction was dropped in 4.2. */
- if (c->devinfo->ver < 42)
- conf |= TLB_TYPE_I32_COLOR;
- else
- conf |= TLB_TYPE_F32_COLOR;
- conf |= ((num_components - 1) <<
- TLB_VEC_SIZE_MINUS_1_SHIFT);
-
- inst = vir_MOV_dest(c, tlbu_reg, color[0]);
- inst->uniform = vir_get_uniform_index(c,
- QUNIFORM_CONSTANT,
- conf);
-
- for (int i = 1; i < num_components; i++) {
- inst = vir_MOV_dest(c, tlb_reg, color[i]);
- }
- break;
-
- default: {
- struct qreg r = color[0];
- struct qreg g = color[1];
- struct qreg b = color[2];
- struct qreg a = color[3];
-
- if (c->fs_key->f32_color_rb & (1 << rt)) {
- conf |= TLB_TYPE_F32_COLOR;
- conf |= ((num_components - 1) <<
- TLB_VEC_SIZE_MINUS_1_SHIFT);
- } else {
- conf |= TLB_TYPE_F16_COLOR;
- conf |= TLB_F16_SWAP_HI_LO;
- if (num_components >= 3)
- conf |= TLB_VEC_SIZE_4_F16;
- else
- conf |= TLB_VEC_SIZE_2_F16;
- }
-
- if (c->fs_key->swap_color_rb & (1 << rt)) {
- r = color[2];
- b = color[0];
- }
-
- if (c->fs_key->sample_alpha_to_one)
- a = vir_uniform_f(c, 1.0);
-
- if (c->fs_key->f32_color_rb & (1 << rt)) {
- inst = vir_MOV_dest(c, tlbu_reg, r);
- inst->uniform = vir_get_uniform_index(c,
- QUNIFORM_CONSTANT,
- conf);
-
- if (num_components >= 2)
- vir_MOV_dest(c, tlb_reg, g);
- if (num_components >= 3)
- vir_MOV_dest(c, tlb_reg, b);
- if (num_components >= 4)
- vir_MOV_dest(c, tlb_reg, a);
- } else {
- inst = vir_VFPACK_dest(c, tlb_reg, r, g);
- if (conf != ~0) {
- inst->dst = tlbu_reg;
- inst->uniform = vir_get_uniform_index(c,
- QUNIFORM_CONSTANT,
- conf);
- }
-
- if (num_components >= 3)
- inst = vir_VFPACK_dest(c, tlb_reg, b, a);
- }
- break;
- }
- }
- }
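+/* VPM write where the index comes from a register rather than an immediate,
+ * which the GS output path needs for its computed offsets.
+ */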
+static inline void
+vir_VPM_WRITE_indirect(struct v3d_compile *c,
+ struct qreg val,
+ struct qreg vpm_index)
+{
+ assert(c->devinfo->ver >= 40);
+ vir_STVPMV(c, vpm_index, val);
}
static void
vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
{
if (c->devinfo->ver >= 40) {
- vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val);
+ vir_VPM_WRITE_indirect(c, val, vir_uniform_ui(c, vpm_index));
} else {
/* XXX: v3d33_vir_vpm_write_setup(c); */
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
vir_VPMWT(c);
}
+static void
+emit_geom_end(struct v3d_compile *c)
+{
+ /* GFXH-1684: VPM writes need to be complete by the end of the shader.
+ */
+ if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
+ vir_VPMWT(c);
+}
+
void
v3d_optimize_nir(struct nir_shader *s)
{
bool progress;
+ unsigned lower_flrp =
+ (s->options->lower_flrp16 ? 16 : 0) |
+ (s->options->lower_flrp32 ? 32 : 0) |
+ (s->options->lower_flrp64 ? 64 : 0);
do {
progress = false;
NIR_PASS_V(s, nir_lower_vars_to_ssa);
- NIR_PASS(progress, s, nir_lower_alu_to_scalar);
+ NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
NIR_PASS(progress, s, nir_lower_phis_to_scalar);
NIR_PASS(progress, s, nir_copy_prop);
NIR_PASS(progress, s, nir_opt_remove_phis);
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
NIR_PASS(progress, s, nir_opt_algebraic);
NIR_PASS(progress, s, nir_opt_constant_folding);
+
+ if (lower_flrp != 0) {
+ bool lower_flrp_progress = false;
+
+ NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
+ lower_flrp,
+ false /* always_precise */,
+ s->options->lower_ffma);
+ if (lower_flrp_progress) {
+ NIR_PASS(progress, s, nir_opt_constant_folding);
+ progress = true;
+ }
+
+ /* Nothing should rematerialize any flrps, so we only
+ * need to do this lowering once.
+ */
+ lower_flrp = 0;
+ }
+
NIR_PASS(progress, s, nir_opt_undef);
} while (progress);
- NIR_PASS(progress, s, nir_opt_move_load_ubo);
+ NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo);
}
static int
}
static void
-ntq_setup_vpm_inputs(struct v3d_compile *c)
+ntq_setup_vs_inputs(struct v3d_compile *c)
{
/* Figure out how many components of each vertex attribute the shader
* uses. Each variable should have been split to individual
}
}
-static void
-ntq_setup_fs_inputs(struct v3d_compile *c)
+static bool
+var_needs_point_coord(struct v3d_compile *c, nir_variable *var)
+{
+ return (var->data.location == VARYING_SLOT_PNTC ||
+ (var->data.location >= VARYING_SLOT_VAR0 &&
+ (c->fs_key->point_sprite_mask &
+ (1 << (var->data.location - VARYING_SLOT_VAR0)))));
+}
+
+static bool
+program_reads_point_coord(struct v3d_compile *c)
{
- unsigned num_entries = 0;
- unsigned num_components = 0;
nir_foreach_variable(var, &c->s->inputs) {
- num_entries++;
- num_components += glsl_get_components(var->type);
+ if (var_needs_point_coord(c, var))
+ return true;
}
- nir_variable *vars[num_entries];
+ return false;
+}
+
+static void
+get_sorted_input_variables(struct v3d_compile *c,
+ unsigned *num_entries,
+ nir_variable ***vars)
+{
+ *num_entries = 0;
+ nir_foreach_variable(var, &c->s->inputs)
+ (*num_entries)++;
+
+ *vars = ralloc_array(c, nir_variable *, *num_entries);
unsigned i = 0;
nir_foreach_variable(var, &c->s->inputs)
- vars[i++] = var;
+ (*vars)[i++] = var;
/* Sort the variables so that we emit the input setup in
* driver_location order. This is required for VPM reads, whose data
* is fetched into the VPM in driver_location (TGSI register index)
* order.
*/
- qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
+ qsort(*vars, *num_entries, sizeof(**vars), driver_location_compare);
+}
+
+static void
+ntq_setup_gs_inputs(struct v3d_compile *c)
+{
+ nir_variable **vars;
+ unsigned num_entries;
+ get_sorted_input_variables(c, &num_entries, &vars);
+
+ for (unsigned i = 0; i < num_entries; i++) {
+ nir_variable *var = vars[i];
+
+ /* All GS inputs are arrays with as many entries as vertices
+ * in the input primitive, but here we only care about the
+ * per-vertex input type.
+ */
+ const struct glsl_type *type = glsl_without_array(var->type);
+ unsigned array_len = MAX2(glsl_get_length(type), 1);
+ unsigned loc = var->data.driver_location;
+
+ resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
+ (loc + array_len) * 4);
+
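+ /* Record a v3d_varying_slot for every component of every array
+ * element, so input_slots/num_inputs describe the full per-vertex
+ * input layout in driver_location order.
+ */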
+ for (unsigned j = 0; j < array_len; j++) {
+ unsigned num_elements = glsl_get_vector_elements(type);
+ for (unsigned k = 0; k < num_elements; k++) {
+ unsigned chan = var->data.location_frac + k;
+ unsigned input_idx = c->num_inputs++;
+ struct v3d_varying_slot slot =
+ v3d_slot_from_slot_and_component(var->data.location + j, chan);
+ c->input_slots[input_idx] = slot;
+ }
+ }
+ }
+}
+
+
+static void
+ntq_setup_fs_inputs(struct v3d_compile *c)
+{
+ nir_variable **vars;
+ unsigned num_entries;
+ get_sorted_input_variables(c, &num_entries, &vars);
for (unsigned i = 0; i < num_entries; i++) {
nir_variable *var = vars[i];
if (var->data.location == VARYING_SLOT_POS) {
emit_fragcoord_input(c, loc);
- } else if (var->data.location == VARYING_SLOT_PNTC ||
- (var->data.location >= VARYING_SLOT_VAR0 &&
- (c->fs_key->point_sprite_mask &
- (1 << (var->data.location -
- VARYING_SLOT_VAR0))))) {
+ } else if (var_needs_point_coord(c, var)) {
c->inputs[loc * 4 + 0] = c->point_x;
c->inputs[loc * 4 + 1] = c->point_y;
} else {
*/
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
for (int i = 0; i < instr->def.num_components; i++)
- qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
+ qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
_mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
}
static void
ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
- assert(instr->intrinsic == nir_intrinsic_image_deref_size);
- nir_variable *var = nir_intrinsic_get_var(instr, 0);
- unsigned image_index = var->data.driver_location;
- const struct glsl_type *sampler_type = glsl_without_array(var->type);
- bool is_array = glsl_sampler_type_is_array(sampler_type);
+ unsigned image_index = nir_src_as_uint(instr->src[0]);
+ bool is_array = nir_intrinsic_image_array(instr);
ntq_store_dest(c, &instr->dest, 0,
vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
}
static void
-ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
+vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
- unsigned offset;
+ assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
- switch (instr->intrinsic) {
- case nir_intrinsic_load_uniform:
- if (nir_src_is_const(instr->src[0])) {
- int offset = (nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[0]));
- assert(offset % 4 == 0);
- /* We need dwords */
- offset = offset / 4;
- for (int i = 0; i < instr->num_components; i++) {
- ntq_store_dest(c, &instr->dest, i,
- vir_uniform(c, QUNIFORM_UNIFORM,
- offset + i));
- }
+ int rt = nir_src_as_uint(instr->src[0]);
+ assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+ int sample_index = nir_intrinsic_base(instr);
+ assert(sample_index < V3D_MAX_SAMPLES);
+
+ int component = nir_intrinsic_component(instr);
+ assert(component < 4);
+
+ /* We need to emit our TLB reads after we have acquired the scoreboard
+ * lock, or the GPU will hang. Usually, we do our scoreboard locking on
+ * the last thread switch to improve parallelism; however, that is only
+ * guaranteed to happen before the tlb color writes.
+ *
+ * To fix that, we make sure we always emit a thread switch before the
+ * first tlb color read. If that happens to be the last thread switch
+ * we emit, then everything is fine, but otherwise, if any code after
+ * this point needs to emit additional thread switches, then we will
+ * switch the strategy to locking the scoreboard on the first thread
+ * switch instead -- see vir_emit_thrsw().
+ */
+ if (!c->emitted_tlb_load) {
+ if (!c->last_thrsw_at_top_level) {
+ assert(c->devinfo->ver >= 41);
+ vir_emit_thrsw(c);
+ }
+
+ c->emitted_tlb_load = true;
+ }
+
+ struct qreg *color_reads_for_sample =
+ &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
+
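+ /* The first time any component of this render target is read, emit
+ * TLB reads for all components of all samples and cache the results
+ * in c->color_reads; later reads reuse the cached registers.
+ */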
+ if (color_reads_for_sample[component].file == QFILE_NULL) {
+ enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
+ int num_components =
+ util_format_get_nr_components(rt_format);
+
+ const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
+ if (swap_rb)
+ num_components = MAX2(num_components, 3);
+
+ nir_variable *var = c->output_color_var[rt];
+ enum glsl_base_type type = glsl_get_base_type(var->type);
+
+ bool is_int_format = type == GLSL_TYPE_INT ||
+ type == GLSL_TYPE_UINT;
+
+ bool is_32b_tlb_format = is_int_format ||
+ (c->fs_key->f32_color_rb & (1 << rt));
+
+ int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
+
+ uint32_t conf = 0xffffff00;
+ conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
+ TLB_SAMPLE_MODE_PER_PIXEL;
+ conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
+
+ if (is_32b_tlb_format) {
+ /* The F32 vs I32 distinction was dropped in 4.2. */
+ conf |= (c->devinfo->ver < 42 && is_int_format) ?
+ TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
+
+ conf |= ((num_components - 1) <<
+ TLB_VEC_SIZE_MINUS_1_SHIFT);
} else {
- ntq_emit_tmu_general(c, instr, false);
+ conf |= TLB_TYPE_F16_COLOR;
+ conf |= TLB_F16_SWAP_HI_LO;
+
+ if (num_components >= 3)
+ conf |= TLB_VEC_SIZE_4_F16;
+ else
+ conf |= TLB_VEC_SIZE_2_F16;
+ }
+
+ for (int i = 0; i < num_samples; i++) {
+ struct qreg r, g, b, a;
+ if (is_32b_tlb_format) {
+ r = conf != 0xffffffff && i == 0 ?
+ vir_TLBU_COLOR_READ(c, conf) :
+ vir_TLB_COLOR_READ(c);
+ if (num_components >= 2)
+ g = vir_TLB_COLOR_READ(c);
+ if (num_components >= 3)
+ b = vir_TLB_COLOR_READ(c);
+ if (num_components >= 4)
+ a = vir_TLB_COLOR_READ(c);
+ } else {
+ struct qreg rg = conf != 0xffffffff && i == 0 ?
+ vir_TLBU_COLOR_READ(c, conf) :
+ vir_TLB_COLOR_READ(c);
+ r = vir_FMOV(c, rg);
+ vir_set_unpack(c->defs[r.index], 0,
+ V3D_QPU_UNPACK_L);
+ g = vir_FMOV(c, rg);
+ vir_set_unpack(c->defs[g.index], 0,
+ V3D_QPU_UNPACK_H);
+
+ if (num_components > 2) {
+ struct qreg ba = vir_TLB_COLOR_READ(c);
+ b = vir_FMOV(c, ba);
+ vir_set_unpack(c->defs[b.index], 0,
+ V3D_QPU_UNPACK_L);
+ a = vir_FMOV(c, ba);
+ vir_set_unpack(c->defs[a.index], 0,
+ V3D_QPU_UNPACK_H);
+ }
+ }
+
+ struct qreg *color_reads =
+ &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
+
+ color_reads[0] = swap_rb ? b : r;
+ if (num_components >= 2)
+ color_reads[1] = g;
+ if (num_components >= 3)
+ color_reads[2] = swap_rb ? r : b;
+ if (num_components >= 4)
+ color_reads[3] = a;
+ }
+ }
+
+ assert(color_reads_for_sample[component].file != QFILE_NULL);
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_MOV(c, color_reads_for_sample[component]));
+}
+
+static void
+ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ if (nir_src_is_const(instr->src[0])) {
+ int offset = (nir_intrinsic_base(instr) +
+ nir_src_as_uint(instr->src[0]));
+ assert(offset % 4 == 0);
+ /* We need dwords */
+ offset = offset / 4;
+ for (int i = 0; i < instr->num_components; i++) {
+ ntq_store_dest(c, &instr->dest, i,
+ vir_uniform(c, QUNIFORM_UNIFORM,
+ offset + i));
}
+ } else {
+ ntq_emit_tmu_general(c, instr, false);
+ }
+}
+
+static void
+ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset)
+ * and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR.
+ */
+ unsigned offset =
+ nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
+
+ if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {
+ /* Emit the LDVPM directly now, rather than at the top
+ * of the shader like we did for V3D 3.x (which needs
+ * vpmsetup when not just taking the next offset).
+ *
+ * Note that delaying like this may introduce stalls,
+ * as LDVPMV takes a minimum of 1 instruction but may
+ * be slower if the VPM unit is busy with another QPU.
+ */
+ int index = 0;
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_INSTANCE_ID)) {
+ index++;
+ }
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_VERTEX_ID)) {
+ index++;
+ }
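+ /* The VPM input segment starts with the (optional) instance and
+ * vertex IDs, followed by the attributes in driver_location order,
+ * so also skip over the attributes preceding this one.
+ */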
+ for (int i = 0; i < offset; i++)
+ index += c->vattr_sizes[i];
+ index += nir_intrinsic_component(instr);
+ for (int i = 0; i < instr->num_components; i++) {
+ struct qreg vpm_offset = vir_uniform_ui(c, index++);
+ ntq_store_dest(c, &instr->dest, i,
+ vir_LDVPMV_IN(c, vpm_offset));
+ }
+ } else {
+ for (int i = 0; i < instr->num_components; i++) {
+ int comp = nir_intrinsic_component(instr) + i;
+ ntq_store_dest(c, &instr->dest, i,
+ vir_MOV(c, c->inputs[offset * 4 + comp]));
+ }
+ }
+}
+
+static void
+ntq_emit_per_sample_color_write(struct v3d_compile *c,
+ nir_intrinsic_instr *instr)
+{
+ assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
+
+ unsigned rt = nir_src_as_uint(instr->src[1]);
+ assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+ unsigned sample_idx = nir_intrinsic_base(instr);
+ assert(sample_idx < V3D_MAX_SAMPLES);
+
+ unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
+ for (int i = 0; i < instr->num_components; i++) {
+ c->sample_colors[offset + i] =
+ vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+ }
+}
+
+static void
+ntq_emit_color_write(struct v3d_compile *c,
+ nir_intrinsic_instr *instr)
+{
+ unsigned offset = (nir_intrinsic_base(instr) +
+ nir_src_as_uint(instr->src[1])) * 4 +
+ nir_intrinsic_component(instr);
+ for (int i = 0; i < instr->num_components; i++) {
+ c->outputs[offset + i] =
+ vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+ }
+}
+
+static void
+emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ assert(instr->num_components == 1);
+
+ uint32_t base_offset = nir_intrinsic_base(instr);
+ struct qreg src_offset = ntq_get_src(c, instr->src[1], 0);
+ struct qreg offset =
+ vir_ADD(c, vir_uniform_ui(c, base_offset), src_offset);
+
+ /* Usually, for VS or FS, we only emit outputs once at program end so
+ * our VPM writes are never in non-uniform control flow, but this
+ * is not true for GS, where we are emitting multiple vertices.
+ */
+ if (vir_in_nonuniform_control_flow(c)) {
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
+
+ vir_VPM_WRITE_indirect(c, ntq_get_src(c, instr->src[0], 0), offset);
+
+ if (vir_in_nonuniform_control_flow(c)) {
+ struct qinst *last_inst =
+ (struct qinst *)c->cur_block->instructions.prev;
+ vir_set_cond(last_inst, V3D_QPU_COND_IFA);
+ }
+}
+
+static void
+ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ /* XXX perf: Use stvpmv with uniform non-constant offsets and
+ * stvpmd with non-uniform offsets and enable
+ * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
+ */
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+ ntq_emit_color_write(c, instr);
+ } else if (c->s->info.stage == MESA_SHADER_GEOMETRY) {
+ emit_store_output_gs(c, instr);
+ } else {
+ assert(c->s->info.stage == MESA_SHADER_VERTEX);
+ assert(instr->num_components == 1);
+
+ vir_VPM_WRITE(c,
+ ntq_get_src(c, instr->src[0], 0),
+ nir_intrinsic_base(instr));
+ }
+}
+
+static void
+ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_uniform:
+ ntq_emit_load_uniform(c, instr);
break;
case nir_intrinsic_load_ubo:
case nir_intrinsic_shared_atomic_comp_swap:
case nir_intrinsic_load_shared:
case nir_intrinsic_store_shared:
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_store_scratch:
ntq_emit_tmu_general(c, instr, true);
break;
- case nir_intrinsic_image_deref_load:
- case nir_intrinsic_image_deref_store:
- case nir_intrinsic_image_deref_atomic_add:
- case nir_intrinsic_image_deref_atomic_min:
- case nir_intrinsic_image_deref_atomic_max:
- case nir_intrinsic_image_deref_atomic_and:
- case nir_intrinsic_image_deref_atomic_or:
- case nir_intrinsic_image_deref_atomic_xor:
- case nir_intrinsic_image_deref_atomic_exchange:
- case nir_intrinsic_image_deref_atomic_comp_swap:
+ case nir_intrinsic_image_load:
+ case nir_intrinsic_image_store:
+ case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_atomic_imin:
+ case nir_intrinsic_image_atomic_umin:
+ case nir_intrinsic_image_atomic_imax:
+ case nir_intrinsic_image_atomic_umax:
+ case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_image_atomic_comp_swap:
v3d40_vir_emit_image_load_store(c, instr);
break;
break;
case nir_intrinsic_load_user_clip_plane:
- for (int i = 0; i < instr->num_components; i++) {
+ for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
ntq_store_dest(c, &instr->dest, i,
vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
nir_intrinsic_ucp_id(instr) *
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
break;
+ case nir_intrinsic_load_tlb_color_v3d:
+ vir_emit_tlb_color_read(c, instr);
+ break;
+
case nir_intrinsic_load_input:
- offset = (nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[0]));
- if (c->s->info.stage != MESA_SHADER_FRAGMENT &&
- c->devinfo->ver >= 40) {
- /* Emit the LDVPM directly now, rather than at the top
- * of the shader like we did for V3D 3.x (which needs
- * vpmsetup when not just taking the next offset).
- *
- * Note that delaying like this may introduce stalls,
- * as LDVPMV takes a minimum of 1 instruction but may
- * be slower if the VPM unit is busy with another QPU.
- */
- int index = 0;
- if (c->s->info.system_values_read &
- (1ull << SYSTEM_VALUE_INSTANCE_ID)) {
- index++;
- }
- if (c->s->info.system_values_read &
- (1ull << SYSTEM_VALUE_VERTEX_ID)) {
- index++;
- }
- for (int i = 0; i < offset; i++)
- index += c->vattr_sizes[i];
- index += nir_intrinsic_component(instr);
- for (int i = 0; i < instr->num_components; i++) {
- struct qreg vpm_offset =
- vir_uniform_ui(c, index++);
- ntq_store_dest(c, &instr->dest, i,
- vir_LDVPMV_IN(c, vpm_offset));
- }
- } else {
- for (int i = 0; i < instr->num_components; i++) {
- int comp = nir_intrinsic_component(instr) + i;
- ntq_store_dest(c, &instr->dest, i,
- vir_MOV(c, c->inputs[offset * 4 +
- comp]));
- }
- }
+ ntq_emit_load_input(c, instr);
break;
- case nir_intrinsic_store_output:
- if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
- offset = ((nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[1])) * 4 +
- nir_intrinsic_component(instr));
- for (int i = 0; i < instr->num_components; i++) {
- c->outputs[offset + i] =
- vir_MOV(c,
- ntq_get_src(c,
- instr->src[0], i));
- }
- } else {
- assert(instr->num_components == 1);
+ case nir_intrinsic_store_tlb_sample_color_v3d:
+ ntq_emit_per_sample_color_write(c, instr);
+ break;
- vir_VPM_WRITE(c,
- ntq_get_src(c, instr->src[0], 0),
- nir_intrinsic_base(instr));
- }
+ case nir_intrinsic_store_output:
+ ntq_emit_store_output(c, instr);
break;
- case nir_intrinsic_image_deref_size:
+ case nir_intrinsic_image_size:
ntq_emit_image_size(c, instr);
break;
}
case nir_intrinsic_memory_barrier:
- case nir_intrinsic_memory_barrier_atomic_counter:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier_shared:
+ case nir_intrinsic_memory_barrier_tcs_patch:
+ case nir_intrinsic_group_memory_barrier:
/* We don't do any instruction scheduling of these NIR
* instructions between each other, so we just need to make
* sure that the TMU operations before the barrier are flushed
*/
break;
- case nir_intrinsic_barrier:
+ case nir_intrinsic_control_barrier:
/* Emit a TSY op to get all invocations in the workgroup
* (actually supergroup) to block until the last invocation
* reaches the TSY op.
vir_uniform_ui(c, 0xffff)));
break;
+ case nir_intrinsic_load_subgroup_id:
+ ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
+ break;
+
+ case nir_intrinsic_load_per_vertex_input: {
+ /* col: vertex index, row: varying index */
+ struct qreg col = ntq_get_src(c, instr->src[0], 0);
+ uint32_t row_idx = nir_intrinsic_base(instr) * 4 +
+ nir_intrinsic_component(instr);
+ for (int i = 0; i < instr->num_components; i++) {
+ struct qreg row = vir_uniform_ui(c, row_idx++);
+ ntq_store_dest(c, &instr->dest, i,
+ vir_LDVPMG_IN(c, row, col));
+ }
+ break;
+ }
+
+ case nir_intrinsic_emit_vertex:
+ case nir_intrinsic_end_primitive:
+ unreachable("Should have been lowered in v3d_nir_lower_io");
+ break;
+
+ case nir_intrinsic_load_primitive_id: {
+ /* gl_PrimitiveIdIn is written by the GBG in the first word of
+ * the VPM output header. According to docs, we should read this
+ * using ldvpm(v,d)_in (See Table 71).
+ */
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
+ break;
+ }
+
+ case nir_intrinsic_load_invocation_id:
+ ntq_store_dest(c, &instr->dest, 0, vir_IID(c));
+ break;
+
+ case nir_intrinsic_load_fb_layers_v3d:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
+ break;
+
default:
fprintf(stderr, "Unknown intrinsic: ");
nir_print_instr(&instr->instr, stderr);
/* Emit the else block. */
vir_set_emit_block(c, else_block);
- ntq_activate_execute_for_block(c);
ntq_emit_cf_list(c, &if_stmt->else_list);
}
ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
{
switch (instr->type) {
- case nir_instr_type_deref:
- /* ignored, will be walked by the intrinsic using it. */
- break;
-
case nir_instr_type_alu:
ntq_emit_alu(c, nir_instr_as_alu(instr));
break;
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
- /* XXX perf: We could set the "disable implicit point/line
- * varyings" field in the shader record and not emit these, if
- * they're not going to be used.
+ /* V3D 4.x can disable implicit point coordinate varyings if
+ * they are not used.
*/
- if (c->fs_key->is_points) {
+ if (c->fs_key->is_points &&
+ (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
c->point_x = emit_fragment_varying(c, NULL, 0, 0);
c->point_y = emit_fragment_varying(c, NULL, 0, 0);
- } else if (c->fs_key->is_lines) {
+ c->uses_implicit_point_line_varyings = true;
+ } else if (c->fs_key->is_lines && c->devinfo->ver < 40) {
c->line_x = emit_fragment_varying(c, NULL, 0, 0);
+ c->uses_implicit_point_line_varyings = true;
}
break;
case MESA_SHADER_COMPUTE:
V3D_QPU_WADDR_SYNC));
}
- if (c->s->info.system_values_read &
- ((1ull << SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) |
- (1ull << SYSTEM_VALUE_WORK_GROUP_ID))) {
- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
- }
- if ((c->s->info.system_values_read &
- ((1ull << SYSTEM_VALUE_WORK_GROUP_ID))) ||
- c->s->info.cs.shared_size) {
- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
- }
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
/* Set up the division between gl_LocalInvocationIndex and
* wg_in_mem in the payload reg.
break;
}
- if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+ if (c->s->scratch_size) {
+ v3d_setup_spill_base(c);
+ c->spill_size += V3D_CHANNELS * c->s->scratch_size;
+ }
+
+ switch (c->s->info.stage) {
+ case MESA_SHADER_VERTEX:
+ ntq_setup_vs_inputs(c);
+ break;
+ case MESA_SHADER_GEOMETRY:
+ ntq_setup_gs_inputs(c);
+ break;
+ case MESA_SHADER_FRAGMENT:
ntq_setup_fs_inputs(c);
- else
- ntq_setup_vpm_inputs(c);
+ break;
+ case MESA_SHADER_COMPUTE:
+ break;
+ default:
+ unreachable("unsupported shader stage");
+ }
ntq_setup_outputs(c);
.lower_all_io_to_temps = true,
.lower_extract_byte = true,
.lower_extract_word = true,
- .lower_bfm = true,
.lower_bitfield_insert_to_shifts = true,
.lower_bitfield_extract_to_shifts = true,
.lower_bitfield_reverse = true,
.lower_bit_count = true,
.lower_cs_local_id_from_index = true,
.lower_ffract = true,
+ .lower_fmod = true,
.lower_pack_unorm_2x16 = true,
.lower_pack_snorm_2x16 = true,
.lower_pack_unorm_4x8 = true,
.lower_ldexp = true,
.lower_mul_high = true,
.lower_wpos_pntc = true,
- .native_integers = true,
+ .lower_rotate = true,
+ .lower_to_scalar = true,
};
/**
case MESA_SHADER_FRAGMENT:
emit_frag_end(c);
break;
+ case MESA_SHADER_GEOMETRY:
+ emit_geom_end(c);
+ break;
case MESA_SHADER_VERTEX:
emit_vert_end(c);
break;
+ case MESA_SHADER_COMPUTE:
+ break;
default:
unreachable("bad stage");
}
vir_remove_thrsw(c);
}
- if (c->spill_size &&
+ if (c->spills &&
(V3D_DEBUG & (V3D_DEBUG_VIR |
v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
fprintf(stderr, "%s prog %d/%d spilled VIR:\n",