#define GENERAL_TMU_WRITE_OP_ATOMIC_XOR (10 << 3)
#define GENERAL_TMU_WRITE_OP_WRITE (15 << 3)
+#define V3D_TSY_SET_QUORUM 0
+#define V3D_TSY_INC_WAITERS 1
+#define V3D_TSY_DEC_WAITERS 2
+#define V3D_TSY_INC_QUORUM 3
+#define V3D_TSY_DEC_QUORUM 4
+#define V3D_TSY_FREE_ALL 5
+#define V3D_TSY_RELEASE 6
+#define V3D_TSY_ACQUIRE 7
+#define V3D_TSY_WAIT 8
+#define V3D_TSY_WAIT_INC 9
+#define V3D_TSY_WAIT_CHECK 10
+#define V3D_TSY_WAIT_INC_CHECK 11
+#define V3D_TSY_WAIT_CV 12
+#define V3D_TSY_INC_SEMAPHORE 13
+#define V3D_TSY_DEC_SEMAPHORE 14
+#define V3D_TSY_SET_QUORUM_FREE_ALL 15
+
static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
*/
c->last_thrsw = vir_NOP(c);
c->last_thrsw->qpu.sig.thrsw = true;
- c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
+ c->last_thrsw_at_top_level = !c->in_control_flow;
}
static uint32_t
case nir_intrinsic_load_ssbo:
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_uniform:
+ case nir_intrinsic_load_shared:
return GENERAL_TMU_READ_OP_READ;
case nir_intrinsic_store_ssbo:
+ case nir_intrinsic_store_shared:
return GENERAL_TMU_WRITE_OP_WRITE;
case nir_intrinsic_ssbo_atomic_add:
+ case nir_intrinsic_shared_atomic_add:
return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
case nir_intrinsic_ssbo_atomic_imin:
+ case nir_intrinsic_shared_atomic_imin:
return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
case nir_intrinsic_ssbo_atomic_umin:
+ case nir_intrinsic_shared_atomic_umin:
return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
case nir_intrinsic_ssbo_atomic_imax:
+ case nir_intrinsic_shared_atomic_imax:
return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
case nir_intrinsic_ssbo_atomic_umax:
+ case nir_intrinsic_shared_atomic_umax:
return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
case nir_intrinsic_ssbo_atomic_and:
+ case nir_intrinsic_shared_atomic_and:
return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
case nir_intrinsic_ssbo_atomic_or:
+ case nir_intrinsic_shared_atomic_or:
return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
case nir_intrinsic_ssbo_atomic_xor:
+ case nir_intrinsic_shared_atomic_xor:
return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
case nir_intrinsic_ssbo_atomic_exchange:
+ case nir_intrinsic_shared_atomic_exchange:
return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
case nir_intrinsic_ssbo_atomic_comp_swap:
+ case nir_intrinsic_shared_atomic_comp_swap:
return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
default:
unreachable("unknown intrinsic op");
* memory access interface.
*/
static void
-ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
+ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
+ bool is_shared)
{
/* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR
* wants to have support for inc/dec?
*/
uint32_t tmu_op = v3d_general_tmu_op(instr);
- bool is_store = instr->intrinsic == nir_intrinsic_store_ssbo;
+ bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
+ instr->intrinsic == nir_intrinsic_store_shared);
+ bool has_index = !is_shared;
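+ /* Shared-memory intrinsics carry no buffer-index source, so every
+ * later source index shifts down by one relative to the SSBO forms.
+ */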
int offset_src;
int tmu_writes = 1; /* address */
if (instr->intrinsic == nir_intrinsic_load_uniform) {
offset_src = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
- instr->intrinsic == nir_intrinsic_load_ubo) {
- offset_src = 1;
+ instr->intrinsic == nir_intrinsic_load_ubo ||
+ instr->intrinsic == nir_intrinsic_load_shared) {
+ offset_src = 0 + has_index;
} else if (is_store) {
- offset_src = 2;
+ offset_src = 1 + has_index;
for (int i = 0; i < instr->num_components; i++) {
vir_MOV_dest(c,
vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
tmu_writes++;
}
} else {
- offset_src = 1;
+ offset_src = 0 + has_index;
vir_MOV_dest(c,
vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- ntq_get_src(c, instr->src[2], 0));
+ ntq_get_src(c, instr->src[1 + has_index], 0));
tmu_writes++;
if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
vir_MOV_dest(c,
vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- ntq_get_src(c, instr->src[3], 0));
+ ntq_get_src(c, instr->src[2 + has_index],
+ 0));
tmu_writes++;
}
}
*/
offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
nir_src_as_uint(instr->src[0]) + 1);
+ } else if (is_shared) {
+ /* Shared variables have no buffer index, and all start from a
+ * common base that we set up at the start of dispatch
+ */
+ offset = c->cs_shared_offset;
} else {
offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
nir_src_as_uint(instr->src[is_store ?
instr->num_components - 2);
}
- if (c->execute.file != QFILE_NULL)
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ if (vir_in_nonuniform_control_flow(c)) {
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
struct qreg dest;
if (config == ~0)
vir_uniform_ui(c, config);
}
- if (c->execute.file != QFILE_NULL)
+ if (vir_in_nonuniform_control_flow(c))
vir_set_cond(tmu, V3D_QPU_COND_IFA);
vir_emit_thrsw(c);
/* If we're in control flow, then make this update of the reg
* conditional on the execution mask.
*/
- if (c->execute.file != QFILE_NULL) {
+ if (vir_in_nonuniform_control_flow(c)) {
last_inst->dst.index = qregs[chan].index;
/* Set the flags to the current exec mask.
*/
c->cursor = vir_before_inst(last_inst);
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
c->cursor = vir_after_inst(last_inst);
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
- last_inst->cond_is_exec_mask = true;
}
}
}
struct qreg t = vir_get_temp(c);
vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
- vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
- vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN);
+ vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
return vir_MOV(c, t);
}
-static struct qreg
-ntq_isign(struct v3d_compile *c, struct qreg src)
-{
- struct qreg t = vir_get_temp(c);
-
- vir_MOV_dest(c, t, vir_uniform_ui(c, 0));
- vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ);
- vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1));
- vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN);
- vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1));
- return vir_MOV(c, t);
-}
-
static void
emit_fragcoord_input(struct v3d_compile *c, int attr)
{
if (nir_op_infos[compare_instr->op].num_inputs > 1)
src1 = ntq_get_alu_src(c, compare_instr, 1);
bool cond_invert = false;
- struct qreg nop = vir_reg(QFILE_NULL, 0);
+ struct qreg nop = vir_nop_reg();
switch (compare_instr->op) {
case nir_op_feq32:
vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
break;
+ case nir_op_i2b32:
+ vir_set_pf(vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
+ cond_invert = true;
+ break;
+
+ case nir_op_f2b32:
+ vir_set_pf(vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
+ cond_invert = true;
+ break;
+
default:
return false;
}
return instr;
}
-/**
- * Attempts to fold a comparison generating a boolean result into the
- * condition code for selecting between two values, instead of comparing the
- * boolean result against 0 to generate the condition code.
- */
-static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr,
- struct qreg *src)
+/* Turns a NIR bool into a condition code to predicate on. */
+static enum v3d_qpu_cond
+ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
{
- nir_alu_instr *compare = ntq_get_alu_parent(instr->src[0].src);
+ nir_alu_instr *compare = ntq_get_alu_parent(src);
if (!compare)
goto out;
enum v3d_qpu_cond cond;
if (ntq_emit_comparison(c, compare, &cond))
- return vir_MOV(c, vir_SEL(c, cond, src[1], src[2]));
+ return cond;
out:
- vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
- return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2]));
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
+ V3D_QPU_PF_PUSHZ);
+ return V3D_QPU_COND_IFNA;
}
-
static void
ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
{
case nir_op_b2i32:
result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
break;
- case nir_op_i2b32:
- case nir_op_f2b32:
- vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
- result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
- vir_uniform_ui(c, ~0),
- vir_uniform_ui(c, 0)));
- break;
case nir_op_iadd:
result = vir_ADD(c, src[0], src[1]);
break;
}
+ case nir_op_i2b32:
+ case nir_op_f2b32:
case nir_op_feq32:
case nir_op_fne32:
case nir_op_fge32:
}
case nir_op_b32csel:
- result = ntq_emit_bcsel(c, instr, src);
+ result = vir_MOV(c,
+ vir_SEL(c,
+ ntq_emit_bool_to_cond(c, instr->src[0].src),
+ src[1], src[2]));
break;
+
case nir_op_fcsel:
- vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), src[0]),
+ V3D_QPU_PF_PUSHZ);
result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
src[1], src[2]));
break;
case nir_op_ftrunc:
result = vir_FTRUNC(c, src[0]);
break;
- case nir_op_ffract:
- result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0]));
- break;
case nir_op_fsin:
result = ntq_fsincos(c, src[0], false);
case nir_op_fsign:
result = ntq_fsign(c, src[0]);
break;
- case nir_op_isign:
- result = ntq_isign(c, src[0]);
- break;
case nir_op_fabs: {
result = vir_FMOV(c, src[0]);
}
case nir_op_iabs:
- result = vir_MAX(c, src[0],
- vir_SUB(c, vir_uniform_ui(c, 0), src[0]));
+ result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
break;
case nir_op_fddx:
break;
case nir_op_uadd_carry:
- vir_PF(c, vir_ADD(c, src[0], src[1]), V3D_QPU_PF_PUSHC);
+ vir_set_pf(vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
+ V3D_QPU_PF_PUSHC);
result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
vir_uniform_ui(c, ~0),
vir_uniform_ui(c, 0)));
break;
case nir_op_unpack_half_2x16_split_x:
- /* XXX perf: It would be good to be able to merge this unpack
- * with whatever uses our result.
- */
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
break;
*/
bool has_any_tlb_color_write = false;
- for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
- if (c->output_color_var[rt])
+ for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
+ if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
has_any_tlb_color_write = true;
}
struct nir_variable *var = c->output_color_var[0];
struct qreg *color = &c->outputs[var->data.driver_location * 4];
- vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
+ vir_SETMSF_dest(c, vir_nop_reg(),
vir_AND(c,
vir_MSF(c),
vir_FTOC(c, color[3])));
inst->src[vir_get_implicit_uniform_src(inst)] =
vir_uniform_ui(c, tlb_specifier | 0xffffff00);
+ c->writes_z = true;
} else if (c->s->info.fs.uses_discard ||
+ !c->s->info.fs.early_fragment_tests ||
c->fs_key->sample_alpha_to_coverage ||
!has_any_tlb_color_write) {
/* Emit passthrough Z if it needed to be delayed until shader
struct qinst *inst = vir_MOV_dest(c,
vir_reg(QFILE_TLBU, 0),
- vir_reg(QFILE_NULL, 0));
+ vir_nop_reg());
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
if (c->devinfo->ver >= 42) {
inst->src[vir_get_implicit_uniform_src(inst)] =
vir_uniform_ui(c, tlb_specifier | 0xffffff00);
+ c->writes_z = true;
}
/* XXX: Performance improvement: Merge Z write and color writes TLB
* uniform setup
*/
- for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
- if (!c->output_color_var[rt])
+ for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
+ if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
continue;
nir_variable *var = c->output_color_var[rt];
if (*num_components_queued != 0) {
(*num_components_queued)--;
- c->num_inputs++;
return vir_MOV(c, vpm);
}
*num_components_queued = num_components - 1;
*remaining -= num_components;
- c->num_inputs++;
return vir_MOV(c, vpm);
}
&num_components, ~0);
}
+ /* The actual loads will happen directly in nir_intrinsic_load_input
+ * on V3D 4.x and newer.
+ */
+ if (c->devinfo->ver >= 40)
+ return;
+
for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
(loc + 1) * 4);
offset + i));
}
} else {
- ntq_emit_tmu_general(c, instr);
+ ntq_emit_tmu_general(c, instr, false);
}
break;
case nir_intrinsic_load_ubo:
- ntq_emit_tmu_general(c, instr);
+ ntq_emit_tmu_general(c, instr, false);
break;
case nir_intrinsic_ssbo_atomic_add:
case nir_intrinsic_ssbo_atomic_comp_swap:
case nir_intrinsic_load_ssbo:
case nir_intrinsic_store_ssbo:
- ntq_emit_tmu_general(c, instr);
+ ntq_emit_tmu_general(c, instr, false);
+ break;
+
+ case nir_intrinsic_shared_atomic_add:
+ case nir_intrinsic_shared_atomic_imin:
+ case nir_intrinsic_shared_atomic_umin:
+ case nir_intrinsic_shared_atomic_imax:
+ case nir_intrinsic_shared_atomic_umax:
+ case nir_intrinsic_shared_atomic_and:
+ case nir_intrinsic_shared_atomic_or:
+ case nir_intrinsic_shared_atomic_xor:
+ case nir_intrinsic_shared_atomic_exchange:
+ case nir_intrinsic_shared_atomic_comp_swap:
+ case nir_intrinsic_load_shared:
+ case nir_intrinsic_store_shared:
+ ntq_emit_tmu_general(c, instr, true);
break;
case nir_intrinsic_image_deref_load:
break;
case nir_intrinsic_load_helper_invocation:
- vir_PF(c, vir_MSF(c), V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
ntq_store_dest(c, &instr->dest, 0,
vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
vir_uniform_ui(c, ~0),
break;
case nir_intrinsic_load_input:
- for (int i = 0; i < instr->num_components; i++) {
- offset = (nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[0]));
- int comp = nir_intrinsic_component(instr) + i;
- ntq_store_dest(c, &instr->dest, i,
- vir_MOV(c, c->inputs[offset * 4 + comp]));
+ offset = (nir_intrinsic_base(instr) +
+ nir_src_as_uint(instr->src[0]));
+ if (c->s->info.stage != MESA_SHADER_FRAGMENT &&
+ c->devinfo->ver >= 40) {
+ /* Emit the LDVPM directly now, rather than at the top
+ * of the shader like we did for V3D 3.x (which needs
+ * vpmsetup when not just taking the next offset).
+ *
+ * Note that delaying like this may introduce stalls,
+ * as LDVPMV takes a minimum of 1 instruction but may
+ * be slower if the VPM unit is busy with another QPU.
+ */
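+ /* Compute the scalar VPM offset of this input: any read
+ * instance/vertex IDs come first in the segment, then each
+ * attribute's components in location order.
+ */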
+ int index = 0;
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_INSTANCE_ID)) {
+ index++;
+ }
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_VERTEX_ID)) {
+ index++;
+ }
+ for (int i = 0; i < offset; i++)
+ index += c->vattr_sizes[i];
+ index += nir_intrinsic_component(instr);
+ for (int i = 0; i < instr->num_components; i++) {
+ struct qreg vpm_offset =
+ vir_uniform_ui(c, index++);
+ ntq_store_dest(c, &instr->dest, i,
+ vir_LDVPMV_IN(c, vpm_offset));
+ }
+ } else {
+ for (int i = 0; i < instr->num_components; i++) {
+ int comp = nir_intrinsic_component(instr) + i;
+ ntq_store_dest(c, &instr->dest, i,
+ vir_MOV(c, c->inputs[offset * 4 +
+ comp]));
+ }
}
break;
break;
case nir_intrinsic_discard:
- if (c->execute.file != QFILE_NULL) {
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
- vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
+ if (vir_in_nonuniform_control_flow(c)) {
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
vir_uniform_ui(c, 0)),
V3D_QPU_COND_IFA);
} else {
- vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
+ vir_SETMSF_dest(c, vir_nop_reg(),
vir_uniform_ui(c, 0));
}
break;
case nir_intrinsic_discard_if: {
- /* true (~0) if we're discarding */
- struct qreg cond = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
- if (c->execute.file != QFILE_NULL) {
- /* execute == 0 means the channel is active. Invert
- * the condition so that we can use zero as "executing
- * and discarding."
- */
- vir_PF(c, vir_OR(c, c->execute, vir_NOT(c, cond)),
- V3D_QPU_PF_PUSHZ);
- vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
- vir_uniform_ui(c, 0)),
- V3D_QPU_COND_IFA);
- } else {
- vir_PF(c, cond, V3D_QPU_PF_PUSHZ);
- vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
- vir_uniform_ui(c, 0)),
- V3D_QPU_COND_IFNA);
+ if (vir_in_nonuniform_control_flow(c)) {
+ struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
+ c->execute);
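+ /* Fold the exec-mask test into the discard condition so
+ * only channels that are active (execute == 0) and pass
+ * the condition get their MSF cleared.
+ */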
+ if (cond == V3D_QPU_COND_IFA) {
+ vir_set_uf(exec_flag, V3D_QPU_UF_ANDZ);
+ } else {
+ vir_set_uf(exec_flag, V3D_QPU_UF_NORNZ);
+ cond = V3D_QPU_COND_IFA;
+ }
}
+ vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
+ vir_uniform_ui(c, 0)), cond);
+
break;
}
case nir_intrinsic_memory_barrier_atomic_counter:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
+ case nir_intrinsic_memory_barrier_shared:
/* We don't do any instruction scheduling of these NIR
* instructions between each other, so we just need to make
* sure that the TMU operations before the barrier are flushed
*/
break;
+ case nir_intrinsic_barrier:
+ /* Emit a TSY op to get all invocations in the workgroup
+ * (actually supergroup) to block until the last invocation
+ * reaches the TSY op.
+ */
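+ /* V3D 4.2+ can write the barrier ID straight to SYNCB;
+ * older parts write SYNCU and pass the TSY op in the
+ * implicit uniform instead.
+ */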
+ if (c->devinfo->ver >= 42) {
+ vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
+ V3D_QPU_WADDR_SYNCB));
+ } else {
+ struct qinst *sync =
+ vir_BARRIERID_dest(c,
+ vir_reg(QFILE_MAGIC,
+ V3D_QPU_WADDR_SYNCU));
+ sync->src[vir_get_implicit_uniform_src(sync)] =
+ vir_uniform_ui(c,
+ 0xffffff00 |
+ V3D_TSY_WAIT_INC_CHECK);
+ }
+
+ /* The blocking of a TSY op only happens at the next thread
+ * switch. No texturing may be outstanding at the time of a
+ * TSY blocking operation.
+ */
+ vir_emit_thrsw(c);
+ break;
+
+ case nir_intrinsic_load_num_work_groups:
+ for (int i = 0; i < 3; i++) {
+ ntq_store_dest(c, &instr->dest, i,
+ vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
+ i));
+ }
+ break;
+
+ case nir_intrinsic_load_local_invocation_index:
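+ /* The local invocation index lives in the top bits of
+ * cs_payload[1] (see the payload setup in nir_to_vir()).
+ */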
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_SHR(c, c->cs_payload[1],
+ vir_uniform_ui(c, 32 - c->local_invocation_index_bits)));
+ break;
+
+ case nir_intrinsic_load_work_group_id:
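+ /* The workgroup ID is unpacked as 16-bit X/Y from
+ * cs_payload[0] and 16-bit Z from the low half of
+ * cs_payload[1].
+ */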
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_AND(c, c->cs_payload[0],
+ vir_uniform_ui(c, 0xffff)));
+ ntq_store_dest(c, &instr->dest, 1,
+ vir_SHR(c, c->cs_payload[0],
+ vir_uniform_ui(c, 16)));
+ ntq_store_dest(c, &instr->dest, 2,
+ vir_AND(c, c->cs_payload[1],
+ vir_uniform_ui(c, 0xffff)));
+ break;
+
default:
fprintf(stderr, "Unknown intrinsic: ");
nir_print_instr(&instr->instr, stderr);
static void
ntq_activate_execute_for_block(struct v3d_compile *c)
{
- vir_set_pf(vir_XOR_dest(c, vir_reg(QFILE_NULL, 0),
+ vir_set_pf(vir_XOR_dest(c, vir_nop_reg(),
c->execute, vir_uniform_ui(c, c->cur_block->index)),
V3D_QPU_PF_PUSHZ);
else_block = vir_new_block(c);
/* Set up the flags for the IF condition (taking the THEN branch). */
- nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition);
- enum v3d_qpu_cond cond;
- if (!if_condition_alu ||
- !ntq_emit_comparison(c, if_condition_alu, &cond)) {
- vir_PF(c, ntq_get_src(c, if_stmt->condition, 0),
- V3D_QPU_PF_PUSHZ);
- cond = V3D_QPU_COND_IFNA;
- }
+ enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
/* Jump to ELSE. */
vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
else
else_block = vir_new_block(c);
- bool was_top_level = false;
- if (c->execute.file == QFILE_NULL) {
+ bool was_uniform_control_flow = false;
+ if (!vir_in_nonuniform_control_flow(c)) {
c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
- was_top_level = true;
+ was_uniform_control_flow = true;
}
/* Set up the flags for the IF condition (taking the THEN branch). */
- nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition);
- enum v3d_qpu_cond cond;
- if (!if_condition_alu ||
- !ntq_emit_comparison(c, if_condition_alu, &cond)) {
- vir_PF(c, ntq_get_src(c, if_stmt->condition, 0),
- V3D_QPU_PF_PUSHZ);
- cond = V3D_QPU_COND_IFNA;
- }
+ enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
/* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
* was previously active (execute Z) for updating the exec flags.
*/
- if (was_top_level) {
+ if (was_uniform_control_flow) {
cond = v3d_qpu_cond_invert(cond);
} else {
- struct qinst *inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0),
- c->execute);
+ struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
if (cond == V3D_QPU_COND_IFA) {
vir_set_uf(inst, V3D_QPU_UF_NORNZ);
} else {
/* Jump to ELSE if nothing is active for THEN, otherwise fall
* through.
*/
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
vir_link_blocks(c->cur_block, else_block);
vir_link_blocks(c->cur_block, then_block);
* active channels update their execute flags to point to
* ENDIF
*/
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, after_block->index));
/* If everything points at ENDIF, then jump there immediately. */
- vir_PF(c, vir_XOR(c, c->execute,
- vir_uniform_ui(c, after_block->index)),
- V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_XOR_dest(c, vir_nop_reg(),
+ c->execute,
+ vir_uniform_ui(c, after_block->index)),
+ V3D_QPU_PF_PUSHZ);
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
vir_link_blocks(c->cur_block, after_block);
vir_link_blocks(c->cur_block, else_block);
vir_link_blocks(c->cur_block, after_block);
vir_set_emit_block(c, after_block);
- if (was_top_level)
+ if (was_uniform_control_flow)
c->execute = c->undef;
else
ntq_activate_execute_for_block(c);
static void
ntq_emit_if(struct v3d_compile *c, nir_if *nif)
{
- if (c->execute.file == QFILE_NULL &&
+ bool was_in_control_flow = c->in_control_flow;
+ c->in_control_flow = true;
+ if (!vir_in_nonuniform_control_flow(c) &&
nir_src_is_dynamically_uniform(nif->condition)) {
ntq_emit_uniform_if(c, nif);
} else {
ntq_emit_nonuniform_if(c, nif);
}
+ c->in_control_flow = was_in_control_flow;
}
static void
{
switch (jump->type) {
case nir_jump_break:
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, c->loop_break_block->index));
break;
case nir_jump_continue:
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, c->loop_cont_block->index));
break;
static void
ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
{
- bool was_top_level = false;
- if (c->execute.file == QFILE_NULL) {
+ bool was_in_control_flow = c->in_control_flow;
+ c->in_control_flow = true;
+
+ bool was_uniform_control_flow = false;
+ if (!vir_in_nonuniform_control_flow(c)) {
c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
- was_top_level = true;
+ was_uniform_control_flow = true;
}
struct qblock *save_loop_cont_block = c->loop_cont_block;
*
* XXX: Use the .ORZ flags update, instead.
*/
- vir_PF(c, vir_XOR(c,
- c->execute,
- vir_uniform_ui(c, c->loop_cont_block->index)),
- V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_XOR_dest(c,
+ vir_nop_reg(),
+ c->execute,
+ vir_uniform_ui(c, c->loop_cont_block->index)),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
/* Pixels that were not dispatched or have been discarded should not
vir_link_blocks(c->cur_block, c->loop_break_block);
vir_set_emit_block(c, c->loop_break_block);
- if (was_top_level)
+ if (was_uniform_control_flow)
c->execute = c->undef;
else
ntq_activate_execute_for_block(c);
c->loop_cont_block = save_loop_cont_block;
c->loops++;
+
+ c->in_control_flow = was_in_control_flow;
}
static void
static void
nir_to_vir(struct v3d_compile *c)
{
- if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+ switch (c->s->info.stage) {
+ case MESA_SHADER_FRAGMENT:
c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
} else if (c->fs_key->is_lines) {
c->line_x = emit_fragment_varying(c, NULL, 0, 0);
}
+ break;
+ case MESA_SHADER_COMPUTE:
+ /* Set up the TSO for barriers, assuming we do some. */
+ if (c->devinfo->ver < 42) {
+ vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
+ V3D_QPU_WADDR_SYNC));
+ }
+
+ if (c->s->info.system_values_read &
+ ((1ull << SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) |
+ (1ull << SYSTEM_VALUE_WORK_GROUP_ID))) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ }
+ if ((c->s->info.system_values_read &
+ ((1ull << SYSTEM_VALUE_WORK_GROUP_ID))) ||
+ c->s->info.cs.shared_size) {
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ }
+
+ /* Set up the division between gl_LocalInvocationIndex and
+ * wg_in_mem in the payload reg.
+ */
+ int wg_size = (c->s->info.cs.local_size[0] *
+ c->s->info.cs.local_size[1] *
+ c->s->info.cs.local_size[2]);
+ c->local_invocation_index_bits =
+ ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
+ assert(c->local_invocation_index_bits <= 8);
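+ /* For example, a 64-invocation workgroup needs
+ * ffs(64) - 1 = 6 index bits, leaving 16 - 6 = 10 bits of
+ * the payload for wg_in_mem below.
+ */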
+
+ if (c->s->info.cs.shared_size) {
+ struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
+ vir_uniform_ui(c, 16));
+ if (c->s->info.cs.local_size[0] != 1 ||
+ c->s->info.cs.local_size[1] != 1 ||
+ c->s->info.cs.local_size[2] != 1) {
+ int wg_bits = (16 -
+ c->local_invocation_index_bits);
+ int wg_mask = (1 << wg_bits) - 1;
+ wg_in_mem = vir_AND(c, wg_in_mem,
+ vir_uniform_ui(c, wg_mask));
+ }
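+ /* Each workgroup then gets its own slice of the shared
+ * allocation: shared base + wg_in_mem * shared_size.
+ */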
+ struct qreg shared_per_wg =
+ vir_uniform_ui(c, c->s->info.cs.shared_size);
+
+ c->cs_shared_offset =
+ vir_ADD(c,
+ vir_uniform(c, QUNIFORM_SHARED_OFFSET, 0),
+ vir_UMUL(c, wg_in_mem, shared_per_wg));
+ }
+ break;
+ default:
+ break;
}
if (c->s->info.stage == MESA_SHADER_FRAGMENT)
.lower_bitfield_extract_to_shifts = true,
.lower_bitfield_reverse = true,
.lower_bit_count = true,
+ .lower_cs_local_id_from_index = true,
+ .lower_ffract = true,
.lower_pack_unorm_2x16 = true,
.lower_pack_snorm_2x16 = true,
.lower_pack_unorm_4x8 = true,
.lower_fsat = true,
.lower_fsqrt = true,
.lower_ifind_msb = true,
+ .lower_isign = true,
.lower_ldexp = true,
.lower_mul_high = true,
.lower_wpos_pntc = true,
vir_remove_thrsw(c);
}
+ if (c->spill_size &&
+ (V3D_DEBUG & (V3D_DEBUG_VIR |
+ v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
+ fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id);
+ vir_dump(c);
+ fprintf(stderr, "\n");
+ }
+
v3d_vir_to_qpu(c, temp_registers);
}