<field name="Vertex Shader input VPM segment size" size="8" start="7b" type="uint"/>
<field name="Address of default attribute values" size="32" start="8b" type="address"/>
<field name="Fragment Shader Code Address" size="29" start="99" type="address"/>
- <field name="2-way threadable" size="1" start="96" type="bool"/>
- <field name="4-way threadable" size="1" start="97" type="bool"/>
+ <field name="Fragment Shader 2-way threadable" size="1" start="96" type="bool"/>
+ <field name="Fragment Shader 4-way threadable" size="1" start="97" type="bool"/>
<field name="Propagate NaNs" size="1" start="98" type="bool"/>
<field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>
<field name="Vertex Shader Code Address" size="32" start="20b" type="address"/>
+ <field name="Vertex Shader 2-way threadable" size="1" start="160" type="bool"/>
+ <field name="Vertex Shader 4-way threadable" size="1" start="161" type="bool"/>
<field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>
<field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
+ <field name="Coordinate Shader 2-way threadable" size="1" start="224" type="bool"/>
+ <field name="Coordinate Shader 4-way threadable" size="1" start="225" type="bool"/>
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
</struct>
(*regs)[i] = c->undef;
}
+static void
+vir_emit_thrsw(struct v3d_compile *c)
+{
+ if (c->threads == 1)
+ return;
+
+ /* Emit a thread switch: the instruction returned by vir_NOP() gets
+ * its thrsw signal set and is remembered in c->last_thrsw.
+ * Single-threaded compiles (c->threads == 1) have already returned
+ * above without emitting anything.
+ *
+ * c->last_thrsw_at_top_level records whether c->execute.file was
+ * QFILE_NULL at the time this THRSW was emitted.
+ *
+ * Always thread switch after each texture operation for now.
+ *
+ * We could do better by batching a bunch of texture fetches up and
+ * then doing one thread switch and collecting all their results
+ * afterward.
+ */
+ c->last_thrsw = vir_NOP(c);
+ c->last_thrsw->qpu.sig.thrsw = true;
+ c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
+}
+
static struct qreg
vir_SFU(struct v3d_compile *c, int waddr, struct qreg src)
{
vir_uniform(c, QUNIFORM_UBO_ADDR, 0),
indirect_offset);
+ vir_emit_thrsw(c);
return vir_LDTMU(c);
}
}
}
+ vir_emit_thrsw(c);
+
struct qreg return_values[4];
for (int i = 0; i < 4; i++) {
/* Swizzling .zw of an RG texture should give undefined
ntq_get_src(c, instr->src[1], 0),
vir_uniform_ui(c, i * 4)));
+ vir_emit_thrsw(c);
+
ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
}
break;
}
#endif
+/**
+ * When demoting a shader down to single-threaded, removes the THRSW
+ * instructions (one will still be inserted at v3d_vir_to_qpu() for the
+ * program end).
+ *
+ * Walks every instruction of every block in c and removes each one
+ * whose qpu.sig.thrsw is set. c->last_thrsw is cleared afterwards so it
+ * does not keep referring to a removed instruction.
+ */
+static void
+vir_remove_thrsw(struct v3d_compile *c)
+{
+ vir_for_each_block(block, c) {
+ vir_for_each_inst_safe(inst, block) {
+ if (inst->qpu.sig.thrsw)
+ vir_remove_instruction(c, inst);
+ }
+ }
+
+ c->last_thrsw = NULL;
+}
+
+static void
+vir_emit_last_thrsw(struct v3d_compile *c)
+{
+ /* Finalizes thread switching for the shader: either demotes it to a
+ * single thread, or emits an extra THRSW where one is needed and
+ * flags the final one (if any) with is_last_thrsw.
+ *
+ * On V3D before 4.1, we need a TMU op to be outstanding when thread
+ * switching, so disable threads if we didn't do any TMU ops (each of
+ * which would have emitted a THRSW).
+ *
+ * Note that the test is on last_thrsw_at_top_level, so on these
+ * devices this path is also taken when the most recent THRSW was not
+ * at top level; any THRSWs already emitted are removed in that case.
+ */
+ if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
+ c->threads = 1;
+ if (c->last_thrsw)
+ vir_remove_thrsw(c);
+ return;
+ }
+
+ /* If we're threaded and the last THRSW was in conditional code, then
+ * we need to emit another one so that we can flag it as the last
+ * thrsw.
+ *
+ * Devices before 4.1 in this state already returned above, hence
+ * the assert.
+ */
+ if (c->last_thrsw && !c->last_thrsw_at_top_level) {
+ assert(c->devinfo->ver >= 41);
+ vir_emit_thrsw(c);
+ }
+
+ /* If we're threaded, then we need to mark the last THRSW instruction
+ * so we can emit a pair of them at QPU emit time.
+ *
+ * For V3D 4.x, we can spawn the non-fragment shaders already in the
+ * post-last-THRSW state, so we can skip this.
+ *
+ * A fragment shader that has no THRSW yet gets one requested here;
+ * vir_emit_thrsw() emits nothing when c->threads == 1, which is why
+ * the final marking below still checks c->last_thrsw.
+ */
+ if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
+ assert(c->devinfo->ver >= 41);
+ vir_emit_thrsw(c);
+ }
+
+ if (c->last_thrsw)
+ c->last_thrsw->is_last_thrsw = true;
+}
+
void
v3d_nir_to_vir(struct v3d_compile *c)
{
nir_to_vir(c);
+ /* Emit the last THRSW before STVPM and TLB writes. */
+ vir_emit_last_thrsw(c);
+
switch (c->s->info.stage) {
case MESA_SHADER_FRAGMENT:
emit_frag_end(c);
fprintf(stderr, "\n");
}
- v3d_vir_to_qpu(c);
+ /* Compute the live ranges so we can figure out interference. */
+ vir_calculate_live_intervals(c);
+
+ /* Attempt to allocate registers for the temporaries. If we fail,
+ * reduce thread count and try again.
+ */
+ int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
+ struct qpu_reg *temp_registers;
+ while (true) {
+ temp_registers = v3d_register_allocate(c);
+
+ if (temp_registers)
+ break;
+
+ if (c->threads == min_threads) {
+ fprintf(stderr, "Failed to register allocate at %d threads:\n",
+ c->threads);
+ vir_dump(c);
+ c->failed = true;
+ return;
+ }
+
+ c->threads /= 2;
+
+ if (c->threads == 1)
+ vir_remove_thrsw(c);
+ }
+
+ v3d_vir_to_qpu(c, temp_registers);
}
}
static bool
-valid_thrend_sequence(struct v3d_compile *c,
- struct qinst *qinst, int instructions_in_sequence)
+valid_thrsw_sequence(struct v3d_compile *c,
+ struct qinst *qinst, int instructions_in_sequence,
+ bool is_thrend)
{
for (int slot = 0; slot < instructions_in_sequence; slot++) {
- if (!qpu_instruction_valid_in_thrend_slot(c, qinst, slot))
+ /* No scheduling SFU when the result would land in the other
+ * thread. The simulator complains for safety, though it
+ * would only occur for dead code in our case.
+ */
+ if (slot > 0 &&
+ qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+ (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
+ v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
+ return false;
+ }
+
+ if (slot > 0 && qinst->qpu.sig.ldvary)
return false;
+ if (is_thrend &&
+ !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) {
+ return false;
+ }
+
/* Note that the list is circular, so we can only do this up
* to instructions_in_sequence.
*/
emit_thrsw(struct v3d_compile *c,
struct qblock *block,
struct choose_scoreboard *scoreboard,
- struct qinst *inst)
+ struct qinst *inst,
+ bool is_thrend)
{
int time = 0;
if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
break;
- if (!valid_thrend_sequence(c, prev_inst, slots_filled + 1))
+ if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1,
+ is_thrend)) {
break;
+ }
merge_inst = prev_inst;
if (++slots_filled == 3)
break;
}
+ bool needs_free = false;
if (merge_inst) {
merge_inst->qpu.sig.thrsw = true;
+ needs_free = true;
} else {
insert_scheduled_instruction(c, block, scoreboard, inst);
time++;
slots_filled++;
+ merge_inst = inst;
}
/* Insert any extra delay slot NOPs we need. */
time++;
}
+ /* If we're emitting the last THRSW (other than program end), then
+ * signal that to the HW by emitting two THRSWs in a row.
+ */
+ if (inst->is_last_thrsw) {
+ struct qinst *second_inst =
+ (struct qinst *)merge_inst->link.next;
+ second_inst->qpu.sig.thrsw = true;
+ }
+
/* If we put our THRSW into another instruction, free up the
* instruction that didn't end up scheduled into the list.
*/
- if (merge_inst)
+ if (needs_free)
free(inst);
return time;
free(merge->inst);
}
- if (0 && inst->sig.thrsw) {
- /* XXX emit_thrsw(c, scoreboard, qinst); */
+ if (inst->sig.thrsw) {
+ time += emit_thrsw(c, block, scoreboard, qinst, false);
} else {
- c->qpu_inst_count++;
- list_addtail(&qinst->link, &block->instructions);
- update_scoreboard_for_chosen(scoreboard, inst);
- }
-
- scoreboard->tick++;
- time++;
-
- if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ||
- inst->sig.thrsw /* XXX */) {
- block->branch_qpu_ip = c->qpu_inst_count - 1;
- /* Fill the delay slots.
- *
- * We should fill these with actual instructions,
- * instead, but that will probably need to be done
- * after this, once we know what the leading
- * instructions of the successors are (so we can
- * handle A/B register file write latency)
- */
- /* XXX: scoreboard */
- int slots = (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ?
- 3 : 2);
- for (int i = 0; i < slots; i++) {
- struct qinst *nop = vir_nop();
- list_addtail(&nop->link, &block->instructions);
-
- update_scoreboard_for_chosen(scoreboard,
- &nop->qpu);
- c->qpu_inst_count++;
- scoreboard->tick++;
- time++;
+ insert_scheduled_instruction(c, block,
+ scoreboard, qinst);
+
+ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
+ block->branch_qpu_ip = c->qpu_inst_count - 1;
+ /* Fill the delay slots.
+ *
+ * We should fill these with actual instructions,
+ * instead, but that will probably need to be done
+ * after this, once we know what the leading
+ * instructions of the successors are (so we can
+ * handle A/B register file write latency)
+ */
+ for (int i = 0; i < 3; i++)
+ emit_nop(c, block, scoreboard);
}
}
}
/* Emit the program-end THRSW instruction. */;
struct qinst *thrsw = vir_nop();
thrsw->qpu.sig.thrsw = true;
- emit_thrsw(c, end_block, &scoreboard, thrsw);
+ emit_thrsw(c, end_block, &scoreboard, thrsw, true);
qpu_set_branch_targets(c);
const struct v3d_qpu_instr *last;
int ip;
int last_sfu_write;
+ int last_branch_ip;
+ int last_thrsw_ip;
+ bool last_thrsw_found;
+ int thrsw_count;
};
static void
abort();
}
+static bool
+in_branch_delay_slots(struct v3d_qpu_validate_state *state)
+{ /* True when state->ip is fewer than 3 instructions past
+ * state->last_branch_ip.
+ *
+ * NOTE(review): the scheduler pads a branch with 3 NOPs, which would
+ * presumably sit at offsets 1..3 from the branch, while this test
+ * only covers offsets below 3 -- confirm the bound is intended.
+ */
+ return (state->ip - state->last_branch_ip) < 3;
+}
+
+static bool
+in_thrsw_delay_slots(struct v3d_qpu_validate_state *state)
+{ /* True when state->ip is fewer than 3 instructions past
+ * state->last_thrsw_ip, the ip of the most recent THRSW that was
+ * counted as a real thread switch (not a last-THRSW marker).
+ */
+ return (state->ip - state->last_thrsw_ip) < 3;
+}
+
static bool
qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
bool (*predicate)(enum v3d_qpu_waddr waddr))
}
}
+ if (in_thrsw_delay_slots(state)) {
+ /* There's no way you want to start SFU during the THRSW delay
+ * slots, since the result would land in the other thread.
+ */
+ if (sfu_writes) {
+ fail_instr(state,
+ "SFU write started during THRSW delay slots ");
+ }
+
+ if (inst->sig.ldvary)
+ fail_instr(state, "LDVARY during THRSW delay slots");
+ }
+
(void)qpu_magic_waddr_matches; /* XXX */
/* SFU r4 results come back two instructions later. No doing
if (sfu_writes)
state->last_sfu_write = state->ip;
+
+ if (inst->sig.thrsw) {
+ if (in_branch_delay_slots(state))
+ fail_instr(state, "THRSW in a branch delay slot.");
+
+ if (state->last_thrsw_ip == state->ip - 1) {
+ /* If it's the second THRSW in a row, then it's just a
+ * last-thrsw signal.
+ */
+ if (state->last_thrsw_found)
+ fail_instr(state, "Two last-THRSW signals");
+ state->last_thrsw_found = true;
+ } else {
+ if (in_thrsw_delay_slots(state)) {
+ fail_instr(state,
+ "THRSW too close to another THRSW.");
+ }
+ state->thrsw_count++;
+ state->last_thrsw_ip = state->ip;
+ }
+ }
+
+ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
+ if (in_branch_delay_slots(state))
+ fail_instr(state, "branch in a branch delay slot.");
+ if (in_thrsw_delay_slots(state))
+ fail_instr(state, "branch in a THRSW delay slot.");
+ state->last_branch_ip = state->ip;
+ }
}
static void
struct v3d_qpu_validate_state state = {
.c = c,
.last_sfu_write = -10,
+ .last_thrsw_ip = -10,
+ .last_branch_ip = -10,
.ip = 0,
};
vir_for_each_block(block, c) {
qpu_validate_block(&state, block);
}
+
+ if (state.thrsw_count > 1 && !state.last_thrsw_found) {
+ fail_instr(&state,
+ "thread switch found without last-THRSW in program");
+ }
+
+ if (state.thrsw_count == 0 ||
+ (state.last_thrsw_found && state.thrsw_count == 1)) {
+ fail_instr(&state, "No program-end THRSW found");
+ }
}
struct qreg src[3];
bool cond_is_exec_mask;
bool has_implicit_uniform;
+ bool is_last_thrsw;
/* After vir_to_qpu.c: If instr reads a uniform, which uniform from
* the uncompiled stream it is.
uint32_t program_id;
uint32_t variant_id;
- /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH
- * is used to hide texturing latency at the cost of limiting ourselves
- * to the bottom half of physical reg space.
+ /* Set to compile program in 1x, 2x, or 4x threaded mode, where
+ * SIG_THREAD_SWITCH is used to hide texturing latency at the cost of
+ * limiting ourselves to a part of the physical reg space.
+ *
+ * On V3D 3.x, 2x or 4x divide the physical reg space by 2x or 4x. On
+ * V3D 4.x, all shaders are 2x threaded, and 4x only divides the
+ * physical reg space in half.
*/
- bool fs_threaded;
-
+ uint8_t threads;
+ struct qinst *last_thrsw;
bool last_thrsw_at_top_level;
bool failed;
uint32_t ubo_size;
uint8_t num_inputs;
+ uint8_t threads;
+ /* For threads > 1, whether the program should be dispatched in the
+ * after-final-THRSW state.
+ */
+ bool single_seg;
};
struct v3d_vs_prog_data {
void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
void vir_lower_uniforms(struct v3d_compile *c);
-void v3d_vir_to_qpu(struct v3d_compile *c);
+void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
void qpu_validate(struct v3d_compile *c);
struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
}
}
- if (inst->qpu.sig.ldtmu)
+ if (inst->qpu.sig.ldtmu || inst->qpu.sig.thrsw)
return true;
return false;
c->key = key;
c->program_id = program_id;
c->variant_id = variant_id;
+ c->threads = 4;
s = nir_shader_clone(c, s);
c->s = s;
v3d_set_prog_data(struct v3d_compile *c,
struct v3d_prog_data *prog_data)
{
+ prog_data->threads = c->threads;
+ prog_data->single_seg = !c->last_thrsw;
+
v3d_set_prog_data_uniforms(c, prog_data);
v3d_set_prog_data_ubo(c, prog_data);
}
#include "util/ralloc.h"
#include "util/register_allocate.h"
+#include "common/v3d_device_info.h"
#include "v3d_compiler.h"
#define QPU_R(i) { .magic = false, .index = i }
bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
+ /* Allocate up to 3 regfile classes, for the ways the physical
+ * register file can be divided up for fragment shader threading.
+ */
+ int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
+
compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
true);
if (!compiler->regs)
return false;
- /* Allocate 3 regfile classes, for the ways the physical register file
- * can be divided up for fragment shader threading.
- */
- for (int threads = 0; threads < 3; threads++) {
+ for (int threads = 0; threads < max_thread_index; threads++) {
compiler->reg_class_phys_or_acc[threads] =
ra_alloc_reg_class(compiler->regs);
compiler->reg_class_phys[threads] =
struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
c->num_temps +
ARRAY_SIZE(acc_nodes));
+ /* Convert 1, 2, 4 threads to 0, 1, 2 index.
+ *
+ * V3D 4.x has double the physical register space, so 64 physical regs
+ * are available at both 1x and 2x threading, and 4x has 32.
+ */
+ int thread_index = ffs(c->threads) - 1;
+ if (c->devinfo->ver >= 40) {
+ if (thread_index >= 1)
+ thread_index--;
+ }
/* Make some fixed nodes for the accumulators, which we will need to
* interfere with when ops have implied r3/r4 writes or for the thread
ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
}
- /* Compute the live ranges so we can figure out interference. */
- vir_calculate_live_intervals(c);
-
for (uint32_t i = 0; i < c->num_temps; i++) {
map[i].temp = i;
map[i].priority = c->temp_end[i] - c->temp_start[i];
}
}
-#if 0
- switch (inst->op) {
- case QOP_THRSW:
+ if (inst->qpu.sig.thrsw) {
/* All accumulators are invalidated across a thread
* switch.
*/
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip)
- class_bits[i] &= ~(CLASS_BIT_R0_R3 |
- CLASS_BIT_R4);
+ class_bits[i] &= CLASS_BIT_PHYS;
}
- break;
-
- default:
- break;
}
-#endif
ip++;
}
for (uint32_t i = 0; i < c->num_temps; i++) {
if (class_bits[i] == CLASS_BIT_PHYS) {
ra_set_node_class(g, temp_to_node[i],
- c->compiler->reg_class_phys[c->fs_threaded]);
+ c->compiler->reg_class_phys[thread_index]);
} else {
assert(class_bits[i] == (CLASS_BIT_PHYS |
CLASS_BIT_R0_R2 |
CLASS_BIT_R3 |
CLASS_BIT_R4));
ra_set_node_class(g, temp_to_node[i],
- c->compiler->reg_class_phys_or_acc[c->fs_threaded]);
+ c->compiler->reg_class_phys_or_acc[thread_index]);
}
}
bool ok = ra_allocate(g);
if (!ok) {
- if (!c->fs_threaded) {
- fprintf(stderr, "Failed to register allocate:\n");
- vir_dump(c);
- }
-
- c->failed = true;
free(temp_registers);
return NULL;
}
}
void
-v3d_vir_to_qpu(struct v3d_compile *c)
+v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
{
- struct qpu_reg *temp_registers = v3d_register_allocate(c);
-
/* Reset the uniform count to how many will be actually loaded by the
* generated QPU code.
*/
shader.fragment_shader_uniforms_address = fs_uniforms;
#if V3D_VERSION >= 41
- shader.coordinate_shader_start_in_final_thread_section = true;
- shader.vertex_shader_start_in_final_thread_section = true;
- shader.fragment_shader_start_in_final_thread_section = true;
+ shader.coordinate_shader_4_way_threadable =
+ vc5->prog.cs->prog_data.vs->base.threads == 4;
+ shader.vertex_shader_4_way_threadable =
+ vc5->prog.vs->prog_data.vs->base.threads == 4;
+ shader.fragment_shader_4_way_threadable =
+ vc5->prog.fs->prog_data.fs->base.threads == 4;
+
+ shader.coordinate_shader_start_in_final_thread_section =
+ vc5->prog.cs->prog_data.vs->base.single_seg;
+ shader.vertex_shader_start_in_final_thread_section =
+ vc5->prog.vs->prog_data.vs->base.single_seg;
+ shader.fragment_shader_start_in_final_thread_section =
+ vc5->prog.fs->prog_data.fs->base.single_seg;
+#else
+ shader.coordinate_shader_4_way_threadable =
+ vc5->prog.cs->prog_data.vs->base.threads == 4;
+ shader.coordinate_shader_2_way_threadable =
+ vc5->prog.cs->prog_data.vs->base.threads == 2;
+ shader.vertex_shader_4_way_threadable =
+ vc5->prog.vs->prog_data.vs->base.threads == 4;
+ shader.vertex_shader_2_way_threadable =
+ vc5->prog.vs->prog_data.vs->base.threads == 2;
+ shader.fragment_shader_4_way_threadable =
+ vc5->prog.fs->prog_data.fs->base.threads == 4;
+ shader.fragment_shader_2_way_threadable =
+ vc5->prog.fs->prog_data.fs->base.threads == 2;
#endif
shader.vertex_id_read_by_coordinate_shader =