break;
case nir_op_unpack_half_2x16_split_x:
+ /* XXX perf: It would be good to be able to merge this unpack
+ * with whatever uses our result.
+ */
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
break;
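Aside (illustrative sketch, not part of the patch): what the V3D_QPU_UNPACK_L
modifier produces here is the low 16 bits of the source reinterpreted as an
IEEE binary16 value and widened to binary32, i.e. the .x half of GLSL
unpackHalf2x16. A self-contained reference in plain C, assuming nothing about
the driver:

#include <stdint.h>
#include <string.h>

static float
unpack_half_low_ref(uint32_t src)
{
        uint32_t h = src & 0xffff;
        uint32_t sign = (h & 0x8000) << 16;
        uint32_t exp = (h >> 10) & 0x1f;
        uint32_t mant = h & 0x3ff;
        uint32_t bits;

        if (exp == 0 && mant == 0) {
                bits = sign;                             /* +/- 0.0 */
        } else if (exp == 0) {
                /* Denormal: renormalize the mantissa. */
                int e = -1;
                do {
                        e++;
                        mant <<= 1;
                } while (!(mant & 0x400));
                bits = sign | ((112 - e) << 23) | ((mant & 0x3ff) << 13);
        } else if (exp == 0x1f) {
                bits = sign | (0xff << 23) | (mant << 13); /* Inf/NaN */
        } else {
                bits = sign | ((exp + 112) << 23) | (mant << 13);
        }

        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
}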
static void
ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
{
+ /* XXX perf: Experiment with using immediate loads to avoid having
+ * these end up in the uniform stream. Watch out for breaking the
+ * small immediates optimization in the process!
+ */
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
for (int i = 0; i < instr->def.num_components; i++)
qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
for (int i = 0; i < instr->num_components; i++) {
int ubo = nir_src_as_uint(instr->src[0]);
+ /* XXX perf: On V3D 4.x with uniform offsets, we
+ * should probably try setting UBOs up in the A
+ * register file and doing a sequence of loads that
+ * way.
+ */
/* Adjust for where we stored the TGSI register base. */
vir_ADD_dest(c,
vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
/* Clears (activates) the execute flags for any channels whose jump target
* matches this block.
+ *
+ * XXX perf: Could we be using flpush/flpop somehow for our execution channel
+ * enabling?
+ *
+ * XXX perf: For uniform control flow, we should be able to skip c->execute
+ * handling entirely.
*/
static void
ntq_activate_execute_for_block(struct v3d_compile *c)
/* Set A for executing (execute == 0) and jumping (if->condition ==
* 0) channels, and then update execute flags for those to point to
* the ELSE block.
+ *
+ * XXX perf: We could reuse ntq_emit_comparison() to generate our if
+ * condition, and the .uf field to ignore non-executing channels, to
+ * reduce the overhead of if statements.
*/
vir_PF(c, vir_OR(c,
c->execute,
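Aside (illustrative sketch, not part of the patch): a small CPU-side model of
the per-channel execute value described above may make the scheme easier to
follow. execute == 0 marks a channel as running the current block, and a
non-zero value records the index of the block that channel is waiting to
reach. The types and helpers below are hypothetical, not driver code:

#include <stdbool.h>
#include <stdint.h>

#define NUM_CHANNELS 16

struct exec_model {
        /* 0 = channel is executing the current block, otherwise the
         * index of the block this channel is waiting to reach. */
        uint32_t execute[NUM_CHANNELS];
};

/* Rough equivalent of the if-statement handling above: active channels
 * that fail the condition get redirected to the ELSE block. */
static void
model_if(struct exec_model *m, const bool *cond, uint32_t else_block)
{
        for (int ch = 0; ch < NUM_CHANNELS; ch++) {
                if (m->execute[ch] == 0 && !cond[ch])
                        m->execute[ch] = else_block;
        }
}

/* Rough equivalent of ntq_activate_execute_for_block(): re-activate any
 * channel whose recorded jump target is the block about to be emitted. */
static void
model_activate_for_block(struct exec_model *m, uint32_t block)
{
        for (int ch = 0; ch < NUM_CHANNELS; ch++) {
                if (m->execute[ch] == block)
                        m->execute[ch] = 0;
        }
}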
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ /* XXX perf: We could set the "disable implicit point/line
+ * varyings" field in the shader record and not emit these, if
+ * they're not going to be used.
+ */
if (c->fs_key->is_points) {
c->point_x = emit_fragment_varying(c, NULL, 0);
c->point_y = emit_fragment_varying(c, NULL, 0);
vir_check_payload_w(c);
- /* XXX: vir_schedule_instructions(c); */
+ /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
+ * We used that on that platform to pipeline TMU writes and reduce the
+ * number of thread switches, as well as try (mostly successfully) to
+ * reduce maximum register pressure to allow more threads. We should
+ * do something of that sort for V3D -- either instruction scheduling
+ * here, or delay the THRSW and LDTMUs from our texture
+ * instructions until the results are needed.
+ */
if (V3D_DEBUG & (V3D_DEBUG_VIR |
v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
if (!magic) {
add_write_dep(state, &state->last_rf[waddr], n);
} else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
+ /* XXX perf: For V3D 4.x, we could reorder TMU writes other
+ * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
+ */
add_write_dep(state, &state->last_tmu_write, n);
switch (waddr) {
case V3D_QPU_WADDR_TMUS:
return next_score;
next_score++;
+ /* XXX perf: We should schedule SFU ALU ops so that the reader is 2
+ * instructions after the producer if possible, not just 1.
+ */
+
/* Default score for things that aren't otherwise special. */
baseline_score = next_score;
next_score++;
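Aside (illustrative sketch, not part of the patch): the kind of scoring tweak
the SFU note has in mind could look like the helper below. The tick bookkeeping
and the reads_sfu_result flag are hypothetical, not existing scheduler state:

#include <stdbool.h>

/* De-prioritize a candidate that reads the result of an SFU ALU op issued
 * fewer than 2 instructions ago, so the reader lands at least 2
 * instructions after the producer when possible. */
static int
sfu_latency_penalty(int tick, int last_sfu_write_tick, bool reads_sfu_result)
{
        if (reads_sfu_result && tick - last_sfu_write_tick < 2)
                return -100;
        return 0;
}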
* sooner. If the ldvary's r5 wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
* r5 in the same tick.
+ *
+ * XXX perf: To get good pipelining of a sequence of varying
+ * loads, we need to figure out how to pair the ldvary signal
+ * up to the instruction before the last r5 user in the
+ * previous ldvary sequence. Currently, it usually pairs with
+ * the last r5 user.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val,
int *tmu_writes)
{
+ /* XXX perf: We should figure out how to merge ALU operations
+ * producing the val with this MOV, when possible.
+ */
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
(*tmu_writes)++;
/* Limit the number of channels returned to both how many the NIR
* instruction writes and how many the instruction could produce.
+ *
+ * XXX perf: Can we also limit to the number of channels that are
+ * actually read by the users of this NIR dest, so that we don't need
+ * to emit unused LDTMUs?
*/
uint32_t instr_return_channels = nir_tex_instr_dest_size(instr);
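Aside (illustrative sketch, not part of the patch): the clamping that question
suggests would only need a bitmask of the destination components that actually
have readers (e.g. from something like NIR's nir_ssa_def_components_read();
treated as an assumption here):

#include <stdint.h>

/* Return the number of leading TMU return channels actually needed,
 * given a bitmask of which destination components are read. */
static uint32_t
used_return_channels(uint32_t dest_size, uint32_t components_read)
{
        uint32_t used = 0;
        for (uint32_t i = 0; i < dest_size; i++) {
                if (components_read & (1u << i))
                        used = i + 1;
        }
        return used;    /* 0 means no LDTMU results are needed at all */
}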
if (!p1_unpacked.output_type_32_bit)
p1_packed |= unit << 24;
vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);
+ /* XXX perf: Can we skip p1 setup for txf ops? */
vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P1, p1_packed);
if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)) != 0)
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
STATIC_ASSERT(PIPE_SWIZZLE_X == 0);
chan = return_values[i / 2];
+ /* XXX perf: We should move this unpacking into NIR.
+ * That would give us exposure of these types to NIR
+ * optimization, so that (for example) a repacking of
+ * half-float samples to the half-float render target
+ * could be eliminated.
+ */
if (nir_alu_type_get_base_type(instr->dest_type) ==
nir_type_float) {
enum v3d_qpu_input_unpack unpack;
{
struct v3d_context *v3d = v3d_context(pctx);
+ /* XXX perf: If we're reading from the output of TF in this job, we
+ * should instead be using the wait for transform feedback
+ * functionality.
+ */
+
/* Flush writes to textures we're sampling. */
for (int i = 0; i < v3d->tex[s].num_textures; i++) {
struct pipe_sampler_view *pview = v3d->tex[s].textures[i];
cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
32);
+ /* XXX perf: We should move most of the SHADER_STATE_RECORD setup to
+ * compile time, so that we mostly just have to OR the VS and FS
+ * records together at draw time.
+ */
cl_emit(&job->indirect, GL_SHADER_STATE_RECORD, shader) {
shader.enable_clipping = true;
/* VC5_DIRTY_PRIM_MODE | VC5_DIRTY_RASTERIZER */
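Aside (illustrative sketch, not part of the patch): the idea in the
SHADER_STATE_RECORD note, with a hypothetical record layout and size rather
than the real packet packing:

#include <stdint.h>

#define SHADER_RECORD_DWORDS 9          /* placeholder, not the real size */

struct precompiled_shader_record {
        uint32_t dwords[SHADER_RECORD_DWORDS];
};

/* At draw time, OR together the halves that were packed when each shader
 * variant was compiled; only genuinely draw-time fields would still need
 * to be patched in afterwards. */
static void
emit_shader_record(uint32_t *out,
                   const struct precompiled_shader_record *vs,
                   const struct precompiled_shader_record *fs)
{
        for (int i = 0; i < SHADER_RECORD_DWORDS; i++)
                out[i] = vs->dwords[i] | fs->dwords[i];
}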
v3d_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1);
- /* XXX: Use Morton order */
+ /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11
+ * performance, but we should use Morton order otherwise to improve
+ * cache locality.
+ */
uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
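Aside (illustrative sketch, not part of the patch): the Morton (Z-order)
supertile index mentioned above can be computed with standard bit interleaving;
this assumes coordinates fit in 16 bits and is not the render-list emission
code:

#include <stdint.h>

/* Spread the low 16 bits of v out to the even bit positions. */
static uint32_t
part1by1(uint32_t v)
{
        v &= 0x0000ffff;
        v = (v | (v << 8)) & 0x00ff00ff;
        v = (v | (v << 4)) & 0x0f0f0f0f;
        v = (v | (v << 2)) & 0x33333333;
        v = (v | (v << 1)) & 0x55555555;
        return v;
}

/* Z-order index of a supertile: x in the even bits, y in the odd bits,
 * so nearby indices map to nearby tiles in both dimensions. */
static uint32_t
supertile_morton_index(uint32_t x, uint32_t y)
{
        return part1by1(x) | (part1by1(y) << 1);
}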