X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fbroadcom%2Fcompiler%2Fnir_to_vir.c;h=17359c72ff160f6e869220b1f1def0d5ef16c037;hb=2956d53400fdabe7a52d7ca6154827fea160abf2;hp=86b87837f8ed4423b297409052aeb09f0631a410;hpb=7f106a2b5d0b27c1ce47a4b335c4cc8ae9cd460b;p=mesa.git diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 86b87837f8e..17359c72ff1 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -22,7 +22,8 @@ */ #include -#include "util/u_format.h" +#include "util/format/u_format.h" +#include "util/u_helpers.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/ralloc.h" @@ -409,6 +410,20 @@ ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) return qregs; } +static bool +is_ld_signal(const struct v3d_qpu_sig *sig) +{ + return (sig->ldunif || + sig->ldunifa || + sig->ldunifrf || + sig->ldunifarf || + sig->ldtmu || + sig->ldvary || + sig->ldvpm || + sig->ldtlb || + sig->ldtlbu); +} + /** * This function is responsible for getting VIR results into the associated * storage for a NIR instruction. @@ -456,11 +471,12 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; - /* Insert a MOV if the source wasn't an SSA def in the - * previous instruction. + /* If the previous instruction can't be predicated for + * the store into the nir_register, then emit a MOV + * that can be. */ - if ((vir_in_nonuniform_control_flow(c) && - c->defs[last_inst->dst.index]->qpu.sig.ldunif)) { + if (vir_in_nonuniform_control_flow(c) && + is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig)) { result = vir_MOV(c, result); last_inst = c->defs[result.index]; } @@ -721,6 +737,19 @@ emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var, } } +static void +emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var, + int array_index) +{ + /* Compact variables are scalar arrays where each set of 4 elements + * consumes a single location. + */ + int loc_offset = array_index / 4; + int chan = var->data.location_frac + array_index % 4; + c->inputs[(attr + loc_offset) * 4 + chan] = + emit_fragment_varying(c, var, chan, loc_offset); +} + static void add_output(struct v3d_compile *c, uint32_t decl_offset, @@ -1352,11 +1381,20 @@ emit_frag_end(struct v3d_compile *c) vir_emit_tlb_color_write(c, rt); } +static inline void +vir_VPM_WRITE_indirect(struct v3d_compile *c, + struct qreg val, + struct qreg vpm_index) +{ + assert(c->devinfo->ver >= 40); + vir_STVPMV(c, vpm_index, val); +} + static void vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { if (c->devinfo->ver >= 40) { - vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val); + vir_VPM_WRITE_indirect(c, val, vir_uniform_ui(c, vpm_index)); } else { /* XXX: v3d33_vir_vpm_write_setup(c); */ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); @@ -1372,6 +1410,15 @@ emit_vert_end(struct v3d_compile *c) vir_VPMWT(c); } +static void +emit_geom_end(struct v3d_compile *c) +{ + /* GFXH-1684: VPM writes need to be complete by the end of the shader. 
+ */ + if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + vir_VPMWT(c); +} + void v3d_optimize_nir(struct nir_shader *s) { @@ -1426,6 +1473,9 @@ driver_location_compare(const void *in_a, const void *in_b) const nir_variable *const *a = in_a; const nir_variable *const *b = in_b; + if ((*a)->data.driver_location == (*b)->data.driver_location) + return (*a)->data.location_frac - (*b)->data.location_frac; + return (*a)->data.driver_location - (*b)->data.driver_location; } @@ -1459,7 +1509,7 @@ ntq_emit_vpm_read(struct v3d_compile *c, } static void -ntq_setup_vpm_inputs(struct v3d_compile *c) +ntq_setup_vs_inputs(struct v3d_compile *c) { /* Figure out how many components of each vertex attribute the shader * uses. Each variable should have been split to individual @@ -1467,7 +1517,7 @@ ntq_setup_vpm_inputs(struct v3d_compile *c) * from the start of the attribute to the number of components we * declare we need in c->vattr_sizes[]. */ - nir_foreach_variable(var, &c->s->inputs) { + nir_foreach_shader_in_variable(var, c->s) { /* No VS attribute array support. */ assert(MAX2(glsl_get_length(var->type), 1) == 1); @@ -1529,48 +1579,83 @@ ntq_setup_vpm_inputs(struct v3d_compile *c) } } -static bool -var_needs_point_coord(struct v3d_compile *c, nir_variable *var) -{ - return (var->data.location == VARYING_SLOT_PNTC || - (var->data.location >= VARYING_SLOT_VAR0 && - (c->fs_key->point_sprite_mask & - (1 << (var->data.location - VARYING_SLOT_VAR0))))); -} - static bool program_reads_point_coord(struct v3d_compile *c) { - nir_foreach_variable(var, &c->s->inputs) { - if (var_needs_point_coord(c, var)) + nir_foreach_shader_in_variable(var, c->s) { + if (util_varying_is_point_coord(var->data.location, + c->fs_key->point_sprite_mask)) { return true; + } } return false; } static void -ntq_setup_fs_inputs(struct v3d_compile *c) +get_sorted_input_variables(struct v3d_compile *c, + unsigned *num_entries, + nir_variable ***vars) { - unsigned num_entries = 0; - unsigned num_components = 0; - nir_foreach_variable(var, &c->s->inputs) { - num_entries++; - num_components += glsl_get_components(var->type); - } + *num_entries = 0; + nir_foreach_shader_in_variable(var, c->s) + (*num_entries)++; - nir_variable *vars[num_entries]; + *vars = ralloc_array(c, nir_variable *, *num_entries); unsigned i = 0; - nir_foreach_variable(var, &c->s->inputs) - vars[i++] = var; + nir_foreach_shader_in_variable(var, c->s) + (*vars)[i++] = var; /* Sort the variables so that we emit the input setup in * driver_location order. This is required for VPM reads, whose data * is fetched into the VPM in driver_location (TGSI register index) * order. */ - qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); + qsort(*vars, *num_entries, sizeof(**vars), driver_location_compare); +} + +static void +ntq_setup_gs_inputs(struct v3d_compile *c) +{ + nir_variable **vars; + unsigned num_entries; + get_sorted_input_variables(c, &num_entries, &vars); + + for (unsigned i = 0; i < num_entries; i++) { + nir_variable *var = vars[i]; + + /* All GS inputs are arrays with as many entries as vertices + * in the input primitive, but here we only care about the + * per-vertex input type. 
+ */ + const struct glsl_type *type = glsl_without_array(var->type); + unsigned array_len = MAX2(glsl_get_length(type), 1); + unsigned loc = var->data.driver_location; + + resize_qreg_array(c, &c->inputs, &c->inputs_array_size, + (loc + array_len) * 4); + + for (unsigned j = 0; j < array_len; j++) { + unsigned num_elements = glsl_get_vector_elements(type); + for (unsigned k = 0; k < num_elements; k++) { + unsigned chan = var->data.location_frac + k; + unsigned input_idx = c->num_inputs++; + struct v3d_varying_slot slot = + v3d_slot_from_slot_and_component(var->data.location + j, chan); + c->input_slots[input_idx] = slot; + } + } + } +} + + +static void +ntq_setup_fs_inputs(struct v3d_compile *c) +{ + nir_variable **vars; + unsigned num_entries; + get_sorted_input_variables(c, &num_entries, &vars); for (unsigned i = 0; i < num_entries; i++) { nir_variable *var = vars[i]; @@ -1582,9 +1667,13 @@ ntq_setup_fs_inputs(struct v3d_compile *c) if (var->data.location == VARYING_SLOT_POS) { emit_fragcoord_input(c, loc); - } else if (var_needs_point_coord(c, var)) { + } else if (util_varying_is_point_coord(var->data.location, + c->fs_key->point_sprite_mask)) { c->inputs[loc * 4 + 0] = c->point_x; c->inputs[loc * 4 + 1] = c->point_y; + } else if (var->data.compact) { + for (int j = 0; j < array_len; j++) + emit_compact_fragment_input(c, loc, var, j); } else { for (int j = 0; j < array_len; j++) emit_fragment_input(c, loc + j, var, j); @@ -1598,7 +1687,7 @@ ntq_setup_outputs(struct v3d_compile *c) if (c->s->info.stage != MESA_SHADER_FRAGMENT) return; - nir_foreach_variable(var, &c->s->outputs) { + nir_foreach_shader_out_variable(var, c->s) { unsigned array_len = MAX2(glsl_get_length(var->type), 1); unsigned loc = var->data.driver_location * 4; @@ -1685,17 +1774,17 @@ ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr) static void ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) { - assert(instr->intrinsic == nir_intrinsic_image_deref_size); - nir_variable *var = nir_intrinsic_get_var(instr, 0); - unsigned image_index = var->data.driver_location; - const struct glsl_type *sampler_type = glsl_without_array(var->type); - bool is_array = glsl_sampler_type_is_array(sampler_type); + unsigned image_index = nir_src_as_uint(instr->src[0]); + bool is_array = nir_intrinsic_image_array(instr); ntq_store_dest(c, &instr->dest, 0, vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index)); if (instr->num_components > 1) { ntq_store_dest(c, &instr->dest, 1, - vir_uniform(c, QUNIFORM_IMAGE_HEIGHT, + vir_uniform(c, + instr->num_components == 2 && is_array ? + QUNIFORM_IMAGE_ARRAY_SIZE : + QUNIFORM_IMAGE_HEIGHT, image_index)); } if (instr->num_components > 2) { @@ -1936,6 +2025,68 @@ ntq_emit_color_write(struct v3d_compile *c, } } +static void +emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + assert(instr->num_components == 1); + + struct qreg offset = ntq_get_src(c, instr->src[1], 0); + + uint32_t base_offset = nir_intrinsic_base(instr); + + if (base_offset) + offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset); + + /* Usually, for VS or FS, we only emit outputs once at program end so + * our VPM writes are never in non-uniform control flow, but this + * is not true for GS, where we are emitting multiple vertices. 
+ */ + if (vir_in_nonuniform_control_flow(c)) { + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + } + + struct qreg val = ntq_get_src(c, instr->src[0], 0); + + /* The offset isn’t necessarily dynamically uniform for a geometry + * shader. This can happen if the shader sometimes doesn’t emit one of + * the vertices. In that case subsequent vertices will be written to + * different offsets in the VPM and we need to use the scatter write + * instruction to have a different offset for each lane. + */ + if (nir_src_is_dynamically_uniform(instr->src[1])) + vir_VPM_WRITE_indirect(c, val, offset); + else + vir_STVPMD(c, offset, val); + + if (vir_in_nonuniform_control_flow(c)) { + struct qinst *last_inst = + (struct qinst *)c->cur_block->instructions.prev; + vir_set_cond(last_inst, V3D_QPU_COND_IFA); + } +} + +static void +ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + /* XXX perf: Use stvpmv with uniform non-constant offsets and + * stvpmd with non-uniform offsets and enable + * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR. + */ + if (c->s->info.stage == MESA_SHADER_FRAGMENT) { + ntq_emit_color_write(c, instr); + } else if (c->s->info.stage == MESA_SHADER_GEOMETRY) { + emit_store_output_gs(c, instr); + } else { + assert(c->s->info.stage == MESA_SHADER_VERTEX); + assert(instr->num_components == 1); + + vir_VPM_WRITE(c, + ntq_get_src(c, instr->src[0], 0), + nir_intrinsic_base(instr)); + } +} + static void ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) { @@ -1980,18 +2131,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_tmu_general(c, instr, true); break; - case nir_intrinsic_image_deref_load: - case nir_intrinsic_image_deref_store: - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_imin: - case nir_intrinsic_image_deref_atomic_umin: - case nir_intrinsic_image_deref_atomic_imax: - case nir_intrinsic_image_deref_atomic_umax: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_comp_swap: v3d40_vir_emit_image_load_store(c, instr); break; @@ -2002,7 +2153,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_user_clip_plane: - for (int i = 0; i < instr->num_components; i++) { + for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { ntq_store_dest(c, &instr->dest, i, vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, nir_intrinsic_ucp_id(instr) * @@ -2035,6 +2186,20 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_uniform(c, QUNIFORM_ALPHA_REF, 0)); break; + case nir_intrinsic_load_line_coord: + ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x)); + break; + + case nir_intrinsic_load_line_width: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); + break; + + case nir_intrinsic_load_aa_line_width: + ntq_store_dest(c, 
&instr->dest, 0, + vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); + break; + case nir_intrinsic_load_sample_mask_in: ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); break; @@ -2078,22 +2243,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_store_output: - /* XXX perf: Use stvpmv with uniform non-constant offsets and - * stvpmd with non-uniform offsets and enable - * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR. - */ - if (c->s->info.stage == MESA_SHADER_FRAGMENT) { - ntq_emit_color_write(c, instr); - } else { - assert(instr->num_components == 1); - - vir_VPM_WRITE(c, - ntq_get_src(c, instr->src[0], 0), - nir_intrinsic_base(instr)); - } + ntq_emit_store_output(c, instr); break; - case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_size: ntq_emit_image_size(c, instr); break; @@ -2131,10 +2284,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) } case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: case nir_intrinsic_memory_barrier_shared: + case nir_intrinsic_memory_barrier_tcs_patch: case nir_intrinsic_group_memory_barrier: /* We don't do any instruction scheduling of these NIR * instructions between each other, so we just need to make @@ -2145,7 +2298,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) */ break; - case nir_intrinsic_barrier: + case nir_intrinsic_control_barrier: /* Emit a TSY op to get all invocations in the workgroup * (actually supergroup) to block until the last invocation * reaches the TSY op. @@ -2202,6 +2355,43 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); break; + case nir_intrinsic_load_per_vertex_input: { + /* col: vertex index, row = varying index */ + struct qreg col = ntq_get_src(c, instr->src[0], 0); + uint32_t row_idx = nir_intrinsic_base(instr) * 4 + + nir_intrinsic_component(instr); + for (int i = 0; i < instr->num_components; i++) { + struct qreg row = vir_uniform_ui(c, row_idx++); + ntq_store_dest(c, &instr->dest, i, + vir_LDVPMG_IN(c, row, col)); + } + break; + } + + case nir_intrinsic_emit_vertex: + case nir_intrinsic_end_primitive: + unreachable("Should have been lowered in v3d_nir_lower_io"); + break; + + case nir_intrinsic_load_primitive_id: { + /* gl_PrimitiveIdIn is written by the GBG in the first word of + * VPM output header. According to docs, we should read this + * using ldvpm(v,d)_in (See Table 71). + */ + ntq_store_dest(c, &instr->dest, 0, + vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); + break; + } + + case nir_intrinsic_load_invocation_id: + ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); + break; + + case nir_intrinsic_load_fb_layers_v3d: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); + break; + default: fprintf(stderr, "Unknown intrinsic: "); nir_print_instr(&instr->instr, stderr); @@ -2404,10 +2594,6 @@ static void ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) { switch (instr->type) { - case nir_instr_type_deref: - /* ignored, will be walked by the intrinsic using it. 
*/ - break; - case nir_instr_type_alu: ntq_emit_alu(c, nir_instr_as_alu(instr)); break; @@ -2569,7 +2755,10 @@ nir_to_vir(struct v3d_compile *c) c->point_x = emit_fragment_varying(c, NULL, 0, 0); c->point_y = emit_fragment_varying(c, NULL, 0, 0); c->uses_implicit_point_line_varyings = true; - } else if (c->fs_key->is_lines && c->devinfo->ver < 40) { + } else if (c->fs_key->is_lines && + (c->devinfo->ver < 40 || + (c->s->info.system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_LINE_COORD)))) { c->line_x = emit_fragment_varying(c, NULL, 0, 0); c->uses_implicit_point_line_varyings = true; } @@ -2624,10 +2813,21 @@ nir_to_vir(struct v3d_compile *c) c->spill_size += V3D_CHANNELS * c->s->scratch_size; } - if (c->s->info.stage == MESA_SHADER_FRAGMENT) + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: + ntq_setup_vs_inputs(c); + break; + case MESA_SHADER_GEOMETRY: + ntq_setup_gs_inputs(c); + break; + case MESA_SHADER_FRAGMENT: ntq_setup_fs_inputs(c); - else - ntq_setup_vpm_inputs(c); + break; + case MESA_SHADER_COMPUTE: + break; + default: + unreachable("unsupported shader stage"); + } ntq_setup_outputs(c); @@ -2773,6 +2973,9 @@ v3d_nir_to_vir(struct v3d_compile *c) case MESA_SHADER_FRAGMENT: emit_frag_end(c); break; + case MESA_SHADER_GEOMETRY: + emit_geom_end(c); + break; case MESA_SHADER_VERTEX: emit_vert_end(c); break; @@ -2828,10 +3031,15 @@ v3d_nir_to_vir(struct v3d_compile *c) break; if (c->threads == min_threads) { - fprintf(stderr, "Failed to register allocate at %d threads:\n", - c->threads); - vir_dump(c); - c->failed = true; + if (c->fallback_scheduler) { + fprintf(stderr, + "Failed to register allocate at %d " + "threads:\n", + c->threads); + vir_dump(c); + } + c->compilation_result = + V3D_COMPILATION_FAILED_REGISTER_ALLOCATION; return; }
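
Illustrative sketch (not part of the patch above; the struct, the function name and the sample values are invented for this note): the new emit_compact_fragment_input() handles compact scalar-array varyings such as gl_ClipDistance, where each run of 4 array elements shares a single location. The standalone program below mirrors the index arithmetic from the diff so the packing can be checked by hand.

#include <stdio.h>

/* Mirrors the mapping used by emit_compact_fragment_input() in the diff:
 * a compact scalar array advances one location every 4 elements, and the
 * element's channel is location_frac plus the remainder.
 */
struct compact_slot {
        int loc_offset;   /* locations past the variable's base location */
        int chan;         /* component within that location (0..3) */
        int input_index;  /* index into the per-channel inputs array,
                           * i.e. (attr + loc_offset) * 4 + chan */
};

static struct compact_slot
map_compact_element(int attr, int location_frac, int array_index)
{
        struct compact_slot s;
        s.loc_offset = array_index / 4;
        s.chan = location_frac + array_index % 4;
        s.input_index = (attr + s.loc_offset) * 4 + s.chan;
        return s;
}

int main(void)
{
        /* e.g. a float gl_ClipDistance[6] at driver location 2, frac 0 */
        for (int i = 0; i < 6; i++) {
                struct compact_slot s = map_compact_element(2, 0, i);
                printf("element %d -> location +%d, channel %d, inputs[%d]\n",
                       i, s.loc_offset, s.chan, s.input_index);
        }
        return 0;
}

With a variable at driver location 2 and location_frac 0, elements 0-3 land in channels 0-3 of that location and elements 4-5 spill into channels 0-1 of the next one, which matches the (attr + loc_offset) * 4 + chan layout used for c->inputs[] in the diff.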
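
A second, equally hypothetical sketch: the patch extends driver_location_compare() so that variables sharing a driver_location are further ordered by location_frac, keeping scalarized inputs in component order for the VPM reads. The real comparator sorts an array of nir_variable pointers; the stand-in below sorts a plain struct so the two-level ordering can be exercised with qsort() outside the compiler.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the two nir_variable fields the comparator looks at. */
struct fake_var {
        int driver_location;
        int location_frac;
};

static int
fake_driver_location_compare(const void *in_a, const void *in_b)
{
        const struct fake_var *a = in_a;
        const struct fake_var *b = in_b;

        /* Same ordering as the patched comparator: driver_location first,
         * then component (location_frac) within that location.
         */
        if (a->driver_location == b->driver_location)
                return a->location_frac - b->location_frac;
        return a->driver_location - b->driver_location;
}

int main(void)
{
        struct fake_var vars[] = {
                { 1, 2 }, { 0, 0 }, { 1, 0 }, { 0, 3 },
        };

        qsort(vars, 4, sizeof(vars[0]), fake_driver_location_compare);

        for (int i = 0; i < 4; i++)
                printf("loc %d frac %d\n", vars[i].driver_location,
                       vars[i].location_frac);
        return 0;
}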