case nir_intrinsic_load_ubo:
case nir_intrinsic_load_uniform:
case nir_intrinsic_load_shared:
+ case nir_intrinsic_load_scratch:
return GENERAL_TMU_READ_OP_READ;
case nir_intrinsic_store_ssbo:
case nir_intrinsic_store_shared:
+ case nir_intrinsic_store_scratch:
return GENERAL_TMU_WRITE_OP_WRITE;
case nir_intrinsic_ssbo_atomic_add:
case nir_intrinsic_shared_atomic_add:
*/
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
- bool is_shared)
+ bool is_shared_or_scratch)
{
/* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR
* wants to have support for inc/dec?
*/
uint32_t tmu_op = v3d_general_tmu_op(instr);
bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
+ instr->intrinsic == nir_intrinsic_store_scratch ||
instr->intrinsic == nir_intrinsic_store_shared);
- bool has_index = !is_shared;
+ bool has_index = !is_shared_or_scratch;
int offset_src;
int tmu_writes = 1; /* address */
offset_src = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
instr->intrinsic == nir_intrinsic_load_ubo ||
+ instr->intrinsic == nir_intrinsic_load_scratch ||
instr->intrinsic == nir_intrinsic_load_shared) {
offset_src = 0 + has_index;
} else if (is_store) {
}
}
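+ /* Constant offsets are folded into the base address below where
+ * possible, so we only emit a runtime ADD for dynamic offsets.
+ */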
+ bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
+ uint32_t const_offset = 0;
+ if (!dynamic_src)
+ const_offset = nir_src_as_uint(instr->src[offset_src]);
+
/* Make sure we won't exceed the 16-entry TMU fifo if each thread is
* storing at the same time.
*/
struct qreg offset;
if (instr->intrinsic == nir_intrinsic_load_uniform) {
- offset = vir_uniform(c, QUNIFORM_UBO_ADDR, 0);
-
- /* Find what variable in the default uniform block this
- * uniform load is coming from.
- */
- uint32_t base = nir_intrinsic_base(instr);
- int i;
- struct v3d_ubo_range *range = NULL;
- for (i = 0; i < c->num_ubo_ranges; i++) {
- range = &c->ubo_ranges[i];
- if (base >= range->src_offset &&
- base < range->src_offset + range->size) {
- break;
- }
- }
- /* The driver-location-based offset always has to be within a
- * declared uniform range.
- */
- assert(i != c->num_ubo_ranges);
- if (!c->ubo_range_used[i]) {
- c->ubo_range_used[i] = true;
- range->dst_offset = c->next_ubo_dst_offset;
- c->next_ubo_dst_offset += range->size;
- }
-
- base = base - range->src_offset + range->dst_offset;
-
- if (base != 0)
- offset = vir_ADD(c, offset, vir_uniform_ui(c, base));
+ const_offset += nir_intrinsic_base(instr);
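+ /* v3d_unit_data_create() packs the unit index and the constant
+ * offset into a single 32-bit uniform contents word (assumed
+ * layout: unit in the high bits, offset in the low bits), so the
+ * offset gets applied when the uniform stream is filled in.
+ */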
+ offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+ v3d_unit_data_create(0, const_offset));
+ const_offset = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ubo) {
+ uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
/* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
* 1 (0 is gallium's constant buffer 0).
*/
offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
- nir_src_as_uint(instr->src[0]) + 1);
- } else if (is_shared) {
- /* Shared variables have no buffer index, and all start from a
- * common base that we set up at the start of dispatch
+ v3d_unit_data_create(index, const_offset));
+ const_offset = 0;
+ } else if (is_shared_or_scratch) {
+ /* Shared and scratch variables have no buffer index, and all
+ * start from a common base that we set up at the start of
+ * dispatch.
*/
- offset = c->cs_shared_offset;
+ if (instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_store_scratch) {
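+ /* Scratch is addressed relative to spill_base, the same
+ * per-channel base used for register spilling.
+ */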
+ offset = c->spill_base;
+ } else {
+ offset = c->cs_shared_offset;
+ const_offset += nir_intrinsic_base(instr);
+ }
} else {
offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
nir_src_as_uint(instr->src[is_store ?
1 : 0]));
}
+ /* The spec says that for atomics, the TYPE field is ignored, but that
+ * doesn't seem to be the case for CMPXCHG. Just use the number of
+ * tmud writes we did to decide the type (or choose "32bit" for atomic
+ * reads, which has been fine).
+ */
+ int num_components;
+ if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG)
+ num_components = 2;
+ else
+ num_components = instr->num_components;
+
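+ /* The lookup type encodes the access width: 32BIT_UI for a single
+ * component, VEC2..VEC4 for wider accesses.
+ */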
uint32_t config = (0xffffff00 |
tmu_op |
GENERAL_TMU_LOOKUP_PER_PIXEL);
- if (instr->num_components == 1) {
+ if (num_components == 1) {
config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
} else {
- config |= (GENERAL_TMU_LOOKUP_TYPE_VEC2 +
- instr->num_components - 2);
+ config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2;
}
if (vir_in_nonuniform_control_flow(c)) {
V3D_QPU_PF_PUSHZ);
}
- struct qreg dest;
+ struct qreg tmua;
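+ /* Writing the address to TMUAU instead of TMUA makes the TMU
+ * consume an extra per-access config word from the uniform stream,
+ * which we only need when the default config (~0) won't do.
+ */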
if (config == ~0)
- dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+ tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
else
- dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+ tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
struct qinst *tmu;
- if (nir_src_is_const(instr->src[offset_src]) &&
- nir_src_as_uint(instr->src[offset_src]) == 0) {
- tmu = vir_MOV_dest(c, dest, offset);
- } else {
- tmu = vir_ADD_dest(c, dest,
- offset,
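+ /* Emit the address write: for a dynamic offset, fold any constant
+ * part into the base first and ADD the dynamic source into the TMU
+ * address; a purely constant offset becomes an ADD of an immediate
+ * uniform, or just a MOV of the base when it is zero.
+ */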
+ if (dynamic_src) {
+ if (const_offset != 0) {
+ offset = vir_ADD(c, offset,
+ vir_uniform_ui(c, const_offset));
+ }
+ tmu = vir_ADD_dest(c, tmua, offset,
ntq_get_src(c, instr->src[offset_src], 0));
+ } else {
+ if (const_offset != 0) {
+ tmu = vir_ADD_dest(c, tmua, offset,
+ vir_uniform_ui(c, const_offset));
+ } else {
+ tmu = vir_MOV_dest(c, tmua, offset);
+ }
}
if (config != ~0) {
v3d_slot_from_slot_and_component(slot, swizzle);
}
-static void
-declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size)
-{
- unsigned array_id = c->num_ubo_ranges++;
- if (array_id >= c->ubo_ranges_array_size) {
- c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
- array_id + 1);
- c->ubo_ranges = reralloc(c, c->ubo_ranges,
- struct v3d_ubo_range,
- c->ubo_ranges_array_size);
- c->ubo_range_used = reralloc(c, c->ubo_range_used,
- bool,
- c->ubo_ranges_array_size);
- }
-
- c->ubo_ranges[array_id].dst_offset = 0;
- c->ubo_ranges[array_id].src_offset = start;
- c->ubo_ranges[array_id].size = size;
- c->ubo_range_used[array_id] = false;
-}
-
/**
* If compare_instr is a valid comparison instruction, emits the
* compare_instr's comparison and returns the sel_instr's return value based
struct qreg result;
switch (instr->op) {
- case nir_op_fmov:
- case nir_op_imov:
+ case nir_op_mov:
result = vir_MOV(c, src[0]);
break;
vir_FTOC(c, color[3])));
}
+ struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
+ struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
if (c->output_position_index != -1) {
- struct qinst *inst = vir_MOV_dest(c,
- vir_reg(QFILE_TLBU, 0),
+ struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
c->outputs[c->output_position_index]);
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
*/
c->s->info.fs.uses_discard = true;
- struct qinst *inst = vir_MOV_dest(c,
- vir_reg(QFILE_TLBU, 0),
+ struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
vir_nop_reg());
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
conf |= ((num_components - 1) <<
TLB_VEC_SIZE_MINUS_1_SHIFT);
- inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
+ inst = vir_MOV_dest(c, tlbu_reg, color[0]);
inst->uniform = vir_get_uniform_index(c,
QUNIFORM_CONSTANT,
conf);
for (int i = 1; i < num_components; i++) {
- inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
- color[i]);
+ inst = vir_MOV_dest(c, tlb_reg, color[i]);
}
break;
a = vir_uniform_f(c, 1.0);
if (c->fs_key->f32_color_rb & (1 << rt)) {
- inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), r);
+ inst = vir_MOV_dest(c, tlbu_reg, r);
inst->uniform = vir_get_uniform_index(c,
QUNIFORM_CONSTANT,
conf);
if (num_components >= 2)
- vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), g);
+ vir_MOV_dest(c, tlb_reg, g);
if (num_components >= 3)
- vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), b);
+ vir_MOV_dest(c, tlb_reg, b);
if (num_components >= 4)
- vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), a);
+ vir_MOV_dest(c, tlb_reg, a);
} else {
- inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g);
+ inst = vir_VFPACK_dest(c, tlb_reg, r, g);
if (conf != ~0) {
- inst->dst.file = QFILE_TLBU;
+ inst->dst = tlbu_reg;
inst->uniform = vir_get_uniform_index(c,
QUNIFORM_CONSTANT,
conf);
}
if (num_components >= 3)
- inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a);
+ inst = vir_VFPACK_dest(c, tlb_reg, b, a);
}
break;
}
v3d_optimize_nir(struct nir_shader *s)
{
bool progress;
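+ /* Collect the bit sizes for which flrp needs lowering;
+ * nir_lower_flrp takes them as a mask.
+ */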
+ unsigned lower_flrp =
+ (s->options->lower_flrp16 ? 16 : 0) |
+ (s->options->lower_flrp32 ? 32 : 0) |
+ (s->options->lower_flrp64 ? 64 : 0);
do {
progress = false;
NIR_PASS_V(s, nir_lower_vars_to_ssa);
- NIR_PASS(progress, s, nir_lower_alu_to_scalar);
+ NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL);
NIR_PASS(progress, s, nir_lower_phis_to_scalar);
NIR_PASS(progress, s, nir_copy_prop);
NIR_PASS(progress, s, nir_opt_remove_phis);
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
NIR_PASS(progress, s, nir_opt_algebraic);
NIR_PASS(progress, s, nir_opt_constant_folding);
+
+ if (lower_flrp != 0) {
+ bool lower_flrp_progress = false;
+
+ NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
+ lower_flrp,
+ false /* always_precise */,
+ s->options->lower_ffma);
+ if (lower_flrp_progress) {
+ NIR_PASS(progress, s, nir_opt_constant_folding);
+ progress = true;
+ }
+
+ /* Nothing should rematerialize any flrps, so we only
+ * need to do this lowering once.
+ */
+ lower_flrp = 0;
+ }
+
NIR_PASS(progress, s, nir_opt_undef);
} while (progress);
}
}
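+/* Returns whether the input varying reads the implicit point
+ * coordinate: either gl_PointCoord itself, or a generic varying that
+ * the point_sprite_mask redirects to it.
+ */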
+static bool
+var_needs_point_coord(struct v3d_compile *c, nir_variable *var)
+{
+ return (var->data.location == VARYING_SLOT_PNTC ||
+ (var->data.location >= VARYING_SLOT_VAR0 &&
+ (c->fs_key->point_sprite_mask &
+ (1 << (var->data.location - VARYING_SLOT_VAR0)))));
+}
+
+static bool
+program_reads_point_coord(struct v3d_compile *c)
+{
+ nir_foreach_variable(var, &c->s->inputs) {
+ if (var_needs_point_coord(c, var))
+ return true;
+ }
+
+ return false;
+}
+
static void
ntq_setup_fs_inputs(struct v3d_compile *c)
{
if (var->data.location == VARYING_SLOT_POS) {
emit_fragcoord_input(c, loc);
- } else if (var->data.location == VARYING_SLOT_PNTC ||
- (var->data.location >= VARYING_SLOT_VAR0 &&
- (c->fs_key->point_sprite_mask &
- (1 << (var->data.location -
- VARYING_SLOT_VAR0))))) {
+ } else if (var_needs_point_coord(c, var)) {
c->inputs[loc * 4 + 0] = c->point_x;
c->inputs[loc * 4 + 1] = c->point_y;
} else {
}
}
-static void
-ntq_setup_uniforms(struct v3d_compile *c)
-{
- nir_foreach_variable(var, &c->s->uniforms) {
- uint32_t vec4_count = glsl_count_attribute_slots(var->type,
- false);
- unsigned vec4_size = 4 * sizeof(float);
-
- if (var->data.mode != nir_var_uniform)
- continue;
-
- declare_uniform_range(c, var->data.driver_location * vec4_size,
- vec4_count * vec4_size);
-
- }
-}
-
/**
* Sets up the mapping from nir_register to struct qreg *.
*
*/
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
for (int i = 0; i < instr->def.num_components; i++)
- qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
+ qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
_mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
}
case nir_intrinsic_shared_atomic_comp_swap:
case nir_intrinsic_load_shared:
case nir_intrinsic_store_shared:
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_store_scratch:
ntq_emit_tmu_general(c, instr, true);
break;
break;
case nir_intrinsic_load_input:
+ /* XXX perf: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset)
+ * and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR.
+ */
offset = (nir_intrinsic_base(instr) +
nir_src_as_uint(instr->src[0]));
if (c->s->info.stage != MESA_SHADER_FRAGMENT &&
break;
case nir_intrinsic_store_output:
+ /* XXX perf: Use stvpmv with uniform non-constant offsets and
+ * stvpmd with non-uniform offsets and enable
+ * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
+ */
if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
offset = ((nir_intrinsic_base(instr) +
nir_src_as_uint(instr->src[1])) * 4 +
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier_shared:
+ case nir_intrinsic_group_memory_barrier:
/* We don't do any instruction scheduling of these NIR
* instructions between each other, so we just need to make
* sure that the TMU operations before the barrier are flushed
vir_uniform_ui(c, 0xffff)));
break;
+ case nir_intrinsic_load_subgroup_id:
+ ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
+ break;
+
default:
fprintf(stderr, "Unknown intrinsic: ");
nir_print_instr(&instr->instr, stderr);
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
- /* XXX perf: We could set the "disable implicit point/line
- * varyings" field in the shader record and not emit these, if
- * they're not going to be used.
+ /* V3D 4.x can leave the implicit point coordinate varyings out if
+ * they are not read, so only emit them when needed and flag that we
+ * use them.
+ */
- if (c->fs_key->is_points) {
+ if (c->fs_key->is_points &&
+ (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
c->point_x = emit_fragment_varying(c, NULL, 0, 0);
c->point_y = emit_fragment_varying(c, NULL, 0, 0);
- } else if (c->fs_key->is_lines) {
+ c->uses_implicit_point_line_varyings = true;
+ } else if (c->fs_key->is_lines && c->devinfo->ver < 40) {
c->line_x = emit_fragment_varying(c, NULL, 0, 0);
+ c->uses_implicit_point_line_varyings = true;
}
break;
case MESA_SHADER_COMPUTE:
V3D_QPU_WADDR_SYNC));
}
- if (c->s->info.system_values_read &
- ((1ull << SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) |
- (1ull << SYSTEM_VALUE_WORK_GROUP_ID))) {
- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
- }
- if ((c->s->info.system_values_read &
- ((1ull << SYSTEM_VALUE_WORK_GROUP_ID))) ||
- c->s->info.cs.shared_size) {
- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
- }
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
/* Set up the division between gl_LocalInvocationIndex and
* wg_in_mem in the payload reg.
break;
}
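+ /* NIR's scratch_size is per invocation, so reserve room for all
+ * V3D_CHANNELS channels in the spill space.
+ */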
+ if (c->s->scratch_size) {
+ v3d_setup_spill_base(c);
+ c->spill_size += V3D_CHANNELS * c->s->scratch_size;
+ }
+
if (c->s->info.stage == MESA_SHADER_FRAGMENT)
ntq_setup_fs_inputs(c);
else
ntq_setup_vpm_inputs(c);
ntq_setup_outputs(c);
- ntq_setup_uniforms(c);
- ntq_setup_registers(c, &c->s->registers);
/* Find the main function and emit the body. */
nir_foreach_function(function, c->s) {
.lower_bit_count = true,
.lower_cs_local_id_from_index = true,
.lower_ffract = true,
+ .lower_fmod = true,
.lower_pack_unorm_2x16 = true,
.lower_pack_snorm_2x16 = true,
.lower_pack_unorm_4x8 = true,
.lower_ldexp = true,
.lower_mul_high = true,
.lower_wpos_pntc = true,
- .native_integers = true,
};
/**
case MESA_SHADER_VERTEX:
emit_vert_end(c);
break;
+ case MESA_SHADER_COMPUTE:
+ break;
default:
unreachable("bad stage");
}
vir_remove_thrsw(c);
}
- if (c->spill_size &&
+ if (c->spills &&
(V3D_DEBUG & (V3D_DEBUG_VIR |
v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
fprintf(stderr, "%s prog %d/%d spilled VIR:\n",