c->last_thrsw = vir_NOP(c);
c->last_thrsw->qpu.sig.thrsw = true;
c->last_thrsw_at_top_level = !c->in_control_flow;
+
+ /* We need to lock the scoreboard before any tlb acess happens. If this
+ * thread switch comes after we have emitted a tlb load, then it means
+ * that we can't lock on the last thread switch any more.
+ */
+ if (c->emitted_tlb_load)
+ c->lock_scoreboard_on_first_thrsw = true;
}
static uint32_t
struct qreg result;
switch (instr->op) {
- case nir_op_fmov:
- case nir_op_imov:
+ case nir_op_mov:
result = vir_MOV(c, src[0]);
break;
progress = false;
NIR_PASS_V(s, nir_lower_vars_to_ssa);
- NIR_PASS(progress, s, nir_lower_alu_to_scalar);
+ NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL);
NIR_PASS(progress, s, nir_lower_phis_to_scalar);
NIR_PASS(progress, s, nir_copy_prop);
NIR_PASS(progress, s, nir_opt_remove_phis);
}
}
+static bool
+var_needs_point_coord(struct v3d_compile *c, nir_variable *var)
+{
+ return (var->data.location == VARYING_SLOT_PNTC ||
+ (var->data.location >= VARYING_SLOT_VAR0 &&
+ (c->fs_key->point_sprite_mask &
+ (1 << (var->data.location - VARYING_SLOT_VAR0)))));
+}
+
+static bool
+program_reads_point_coord(struct v3d_compile *c)
+{
+ nir_foreach_variable(var, &c->s->inputs) {
+ if (var_needs_point_coord(c, var))
+ return true;
+ }
+
+ return false;
+}
+
static void
ntq_setup_fs_inputs(struct v3d_compile *c)
{
if (var->data.location == VARYING_SLOT_POS) {
emit_fragcoord_input(c, loc);
- } else if (var->data.location == VARYING_SLOT_PNTC ||
- (var->data.location >= VARYING_SLOT_VAR0 &&
- (c->fs_key->point_sprite_mask &
- (1 << (var->data.location -
- VARYING_SLOT_VAR0))))) {
+ } else if (var_needs_point_coord(c, var)) {
c->inputs[loc * 4 + 0] = c->point_x;
c->inputs[loc * 4 + 1] = c->point_y;
} else {
}
}
+static void
+vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+
+ int rt = nir_src_as_uint(instr->src[0]);
+ assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+ int sample_index = nir_intrinsic_base(instr) ;
+ assert(sample_index == 0); /* XXX: multisample */
+
+ int component = nir_intrinsic_component(instr);
+ assert(component < 4);
+
+ /* We need to emit our TLB reads after we have acquired the scoreboard
+ * lock, or the GPU will hang. Usually, we do our scoreboard locking on
+ * the last thread switch to improve parallelism, however, that is only
+ * guaranteed to happen before the tlb color writes.
+ *
+ * To fix that, we make sure we always emit a thread switch before the
+ * first tlb color read. If that happens to be the last thread switch
+ * we emit, then everything is fine, but otherwsie, if any code after
+ * this point needs to emit additional thread switches, then we will
+ * switch the strategy to locking the scoreboard on the first thread
+ * switch instead -- see vir_emit_thrsw().
+ */
+ if (!c->emitted_tlb_load) {
+ if (!c->last_thrsw_at_top_level) {
+ assert(c->devinfo->ver >= 41);
+ vir_emit_thrsw(c);
+ }
+
+ c->emitted_tlb_load = true;
+ }
+
+ struct qreg *color_reads =
+ &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
+
+ if (color_reads[component].file == QFILE_NULL) {
+ enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
+ int num_components =
+ util_format_get_nr_components(rt_format);
+
+ const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
+ if (swap_rb)
+ num_components = MAX2(num_components, 3);
+
+ bool is_int_format = (c->fs_key->int_color_rb & (1 << rt)) ||
+ (c->fs_key->uint_color_rb & (1 << rt));
+
+ bool is_32b_tlb_format = is_int_format ||
+ (c->fs_key->f32_color_rb & (1 << rt));
+
+ uint32_t conf = 0xffffff00;
+ conf |= TLB_SAMPLE_MODE_PER_PIXEL; /* XXX: multisample */
+ conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
+
+ if (is_32b_tlb_format) {
+ /* The F32 vs I32 distinction was dropped in 4.2. */
+ conf |= (c->devinfo->ver < 42 && is_int_format) ?
+ TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
+
+ conf |= ((num_components - 1) <<
+ TLB_VEC_SIZE_MINUS_1_SHIFT);
+ } else {
+ conf |= TLB_TYPE_F16_COLOR;
+ conf |= TLB_F16_SWAP_HI_LO;
+
+ if (num_components >= 3)
+ conf |= TLB_VEC_SIZE_4_F16;
+ else
+ conf |= TLB_VEC_SIZE_2_F16;
+ }
+
+ struct qreg r, g, b, a;
+ if (is_32b_tlb_format) {
+ r = conf != 0xffffffff ? vir_TLBU_COLOR_READ(c, conf) :
+ vir_TLB_COLOR_READ(c);
+ if (num_components >= 2)
+ g = vir_TLB_COLOR_READ(c);
+ if (num_components >= 3)
+ b = vir_TLB_COLOR_READ(c);
+ if (num_components >= 4)
+ a = vir_TLB_COLOR_READ(c);
+ } else {
+ struct qreg rg = conf != 0xffffffff ?
+ vir_TLBU_COLOR_READ(c, conf) :
+ vir_TLB_COLOR_READ(c);
+ r = vir_FMOV(c, rg);
+ vir_set_unpack(c->defs[r.index], 0, V3D_QPU_UNPACK_L);
+ g = vir_FMOV(c, rg);
+ vir_set_unpack(c->defs[g.index], 0, V3D_QPU_UNPACK_H);
+
+ if (num_components > 2) {
+ struct qreg ba = vir_TLB_COLOR_READ(c);
+ b = vir_FMOV(c, ba);
+ vir_set_unpack(c->defs[b.index], 0,
+ V3D_QPU_UNPACK_L);
+ a = vir_FMOV(c, ba);
+ vir_set_unpack(c->defs[a.index], 0,
+ V3D_QPU_UNPACK_H);
+ }
+ }
+
+ color_reads[0] = swap_rb ? b : r;
+ if (num_components >= 2)
+ color_reads[1] = g;
+ if (num_components >= 3)
+ color_reads[2] = swap_rb ? r : b;
+ if (num_components >= 4)
+ color_reads[3] = a;
+ }
+
+ assert(color_reads[component].file != QFILE_NULL);
+ ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, color_reads[component]));
+}
+
static void
ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
break;
+ case nir_intrinsic_load_tlb_color_v3d:
+ vir_emit_tlb_color_read(c, instr);
+ break;
+
case nir_intrinsic_load_input:
/* Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset)
* and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR.
/* Emit the else block. */
vir_set_emit_block(c, else_block);
- ntq_activate_execute_for_block(c);
ntq_emit_cf_list(c, &if_stmt->else_list);
}
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
- /* XXX perf: We could set the "disable implicit point/line
- * varyings" field in the shader record and not emit these, if
- * they're not going to be used.
+ /* V3D 4.x can disable implicit point coordinate varyings if
+ * they are not used.
*/
- if (c->fs_key->is_points) {
+ if (c->fs_key->is_points &&
+ (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
c->point_x = emit_fragment_varying(c, NULL, 0, 0);
c->point_y = emit_fragment_varying(c, NULL, 0, 0);
- } else if (c->fs_key->is_lines) {
+ c->uses_implicit_point_line_varyings = true;
+ } else if (c->fs_key->is_lines && c->devinfo->ver < 40) {
c->line_x = emit_fragment_varying(c, NULL, 0, 0);
+ c->uses_implicit_point_line_varyings = true;
}
break;
case MESA_SHADER_COMPUTE:
.lower_all_io_to_temps = true,
.lower_extract_byte = true,
.lower_extract_word = true,
- .lower_bfm = true,
.lower_bitfield_insert_to_shifts = true,
.lower_bitfield_extract_to_shifts = true,
.lower_bitfield_reverse = true,
.lower_bit_count = true,
.lower_cs_local_id_from_index = true,
.lower_ffract = true,
+ .lower_fmod = true,
.lower_pack_unorm_2x16 = true,
.lower_pack_snorm_2x16 = true,
.lower_pack_unorm_4x8 = true,
.lower_ldexp = true,
.lower_mul_high = true,
.lower_wpos_pntc = true,
+ .lower_rotate = true,
};
/**