X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fbroadcom%2Fcompiler%2Fvir.c;h=d06f789161fab09e4589e14889d24c16702b86c5;hb=c3258f927c462cc2e24106e7841138452f7fec10;hp=d95a150c720a02e196eed5be1a55f43bbba62471;hpb=1171f1749ddf7fdff0725804868bb375f2407526;p=mesa.git diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index d95a150c720..d06f789161f 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -21,10 +21,13 @@ * IN THE SOFTWARE. */ +#include "broadcom/common/v3d_device_info.h" #include "v3d_compiler.h" +#include "util/u_prim.h" +#include "compiler/nir/nir_schedule.h" int -vir_get_non_sideband_nsrc(struct qinst *inst) +vir_get_nsrc(struct qinst *inst) { switch (inst->qpu.type) { case V3D_QPU_INSTR_TYPE_BRANCH: @@ -39,43 +42,6 @@ vir_get_non_sideband_nsrc(struct qinst *inst) return 0; } -int -vir_get_nsrc(struct qinst *inst) -{ - int nsrc = vir_get_non_sideband_nsrc(inst); - - if (vir_has_implicit_uniform(inst)) - nsrc++; - - return nsrc; -} - -bool -vir_has_implicit_uniform(struct qinst *inst) -{ - switch (inst->qpu.type) { - case V3D_QPU_INSTR_TYPE_BRANCH: - return true; - case V3D_QPU_INSTR_TYPE_ALU: - switch (inst->dst.file) { - case QFILE_TLBU: - return true; - default: - return inst->has_implicit_uniform; - } - } - return false; -} - -/* The sideband uniform for textures gets stored after the normal ALU - * arguments. - */ -int -vir_get_implicit_uniform_src(struct qinst *inst) -{ - return vir_get_nsrc(inst) - 1; -} - /** * Returns whether the instruction has any side effects that must be * preserved. @@ -91,6 +57,11 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) case V3D_QPU_A_SETREVF: case V3D_QPU_A_SETMSF: case V3D_QPU_A_VPMSETUP: + case V3D_QPU_A_STVPMV: + case V3D_QPU_A_STVPMD: + case V3D_QPU_A_STVPMP: + case V3D_QPU_A_VPMWT: + case V3D_QPU_A_TMUWT: return true; default: break; @@ -104,39 +75,13 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) } } - if (inst->qpu.sig.ldtmu) + if (inst->qpu.sig.ldtmu || + inst->qpu.sig.ldvary || + inst->qpu.sig.ldtlbu || + inst->qpu.sig.ldtlb || + inst->qpu.sig.wrtmuc || + inst->qpu.sig.thrsw) { return true; - - return false; -} - -bool -vir_is_float_input(struct qinst *inst) -{ - /* XXX: More instrs */ - switch (inst->qpu.type) { - case V3D_QPU_INSTR_TYPE_BRANCH: - return false; - case V3D_QPU_INSTR_TYPE_ALU: - switch (inst->qpu.alu.add.op) { - case V3D_QPU_A_FADD: - case V3D_QPU_A_FSUB: - case V3D_QPU_A_FMIN: - case V3D_QPU_A_FMAX: - case V3D_QPU_A_FTOIN: - return true; - default: - break; - } - - switch (inst->qpu.alu.mul.op) { - case V3D_QPU_M_FMOV: - case V3D_QPU_M_VFMUL: - case V3D_QPU_M_FMUL: - return true; - default: - break; - } } return false; @@ -156,6 +101,13 @@ vir_is_raw_mov(struct qinst *inst) return false; } + if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { + return false; + } + if (inst->qpu.flags.ac != V3D_QPU_COND_NONE || inst->qpu.flags.mc != V3D_QPU_COND_NONE) return false; @@ -183,26 +135,19 @@ vir_is_tex(struct qinst *inst) if (inst->dst.file == QFILE_MAGIC) return v3d_qpu_magic_waddr_is_tmu(inst->dst.index); - return false; -} - -bool -vir_depends_on_flags(struct qinst *inst) -{ - if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) { - return (inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS); - } else { - return (inst->qpu.flags.ac != V3D_QPU_COND_NONE && - 
inst->qpu.flags.mc != V3D_QPU_COND_NONE); + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { + return true; } + + return false; } bool -vir_writes_r3(struct qinst *inst) +vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) { for (int i = 0; i < vir_get_nsrc(inst); i++) { switch (inst->src[i].file) { - case QFILE_VARY: case QFILE_VPM: return true; default: @@ -210,11 +155,18 @@ vir_writes_r3(struct qinst *inst) } } + if (devinfo->ver < 41 && (inst->qpu.sig.ldvary || + inst->qpu.sig.ldtlb || + inst->qpu.sig.ldtlbu || + inst->qpu.sig.ldvpm)) { + return true; + } + return false; } bool -vir_writes_r4(struct qinst *inst) +vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) { switch (inst->dst.file) { case QFILE_MAGIC: @@ -231,7 +183,7 @@ vir_writes_r4(struct qinst *inst) break; } - if (inst->qpu.sig.ldtmu) + if (devinfo->ver < 41 && inst->qpu.sig.ldtmu) return true; return false; @@ -279,6 +231,17 @@ vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf) } } +void +vir_set_uf(struct qinst *inst, enum v3d_qpu_uf uf) +{ + if (vir_is_add(inst)) { + inst->qpu.flags.auf = uf; + } else { + assert(vir_is_mul(inst)); + inst->qpu.flags.muf = uf; + } +} + #if 0 uint8_t vir_channels_written(struct qinst *inst) @@ -339,10 +302,17 @@ vir_get_temp(struct v3d_compile *c) if (c->num_temps > c->defs_array_size) { uint32_t old_size = c->defs_array_size; c->defs_array_size = MAX2(old_size * 2, 16); + c->defs = reralloc(c, c->defs, struct qinst *, c->defs_array_size); memset(&c->defs[old_size], 0, sizeof(c->defs[0]) * (c->defs_array_size - old_size)); + + c->spillable = reralloc(c, c->spillable, + BITSET_WORD, + BITSET_WORDS(c->defs_array_size)); + for (int i = old_size; i < c->defs_array_size; i++) + BITSET_SET(c->spillable, i); } return reg; @@ -381,7 +351,7 @@ vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct q } struct qinst * -vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src) +vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) { struct qinst *inst = calloc(1, sizeof(*inst)); @@ -393,9 +363,8 @@ vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src) inst->qpu.branch.ub = true; inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL; - inst->dst = vir_reg(QFILE_NULL, 0); - inst->src[0] = src; - inst->uniform = ~0; + inst->dst = vir_nop_reg(); + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0); return inst; } @@ -403,11 +372,17 @@ vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src) static void vir_emit(struct v3d_compile *c, struct qinst *inst) { - list_addtail(&inst->link, &c->cur_block->instructions); + switch (c->cursor.mode) { + case vir_cursor_add: + list_add(&inst->link, c->cursor.link); + break; + case vir_cursor_addtail: + list_addtail(&inst->link, c->cursor.link); + break; + } - if (inst->dst.file == QFILE_MAGIC && - inst->dst.index == V3D_QPU_WADDR_VPM) - c->num_vpm_writes++; + c->cursor = vir_after_inst(inst); + c->live_intervals_valid = false; } /* Updates inst to write to a new temporary, emits it, and notes the def. */ @@ -416,6 +391,16 @@ vir_emit_def(struct v3d_compile *c, struct qinst *inst) { assert(inst->dst.file == QFILE_NULL); + /* If we're emitting an instruction that's a def, it had better be + * writing a register. 
+ */ + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { + assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP || + v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op)); + assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP || + v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op)); + } + inst->dst = vir_get_temp(c); if (inst->dst.file == QFILE_TEMP) @@ -457,6 +442,7 @@ void vir_set_emit_block(struct v3d_compile *c, struct qblock *block) { c->cur_block = block; + c->cursor = vir_after_block(block); list_addtail(&block->link, &c->blocks); } @@ -511,7 +497,11 @@ static struct v3d_compile * vir_compile_init(const struct v3d_compiler *compiler, struct v3d_key *key, nir_shader *s, - int program_id, int variant_id) + void (*debug_output)(const char *msg, + void *debug_output_data), + void *debug_output_data, + int program_id, int variant_id, + bool fallback_scheduler) { struct v3d_compile *c = rzalloc(NULL, struct v3d_compile); @@ -520,6 +510,11 @@ vir_compile_init(const struct v3d_compiler *compiler, c->key = key; c->program_id = program_id; c->variant_id = variant_id; + c->threads = 4; + c->debug_output = debug_output; + c->debug_output_data = debug_output_data; + c->compilation_result = V3D_COMPILATION_SUCCEEDED; + c->fallback_scheduler = fallback_scheduler; s = nir_shader_clone(c, s); c->s = s; @@ -528,7 +523,6 @@ vir_compile_init(const struct v3d_compiler *compiler, vir_set_emit_block(c, vir_new_block(c)); c->output_position_index = -1; - c->output_point_size_index = -1; c->output_sample_mask_index = -1; c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer, @@ -537,12 +531,20 @@ vir_compile_init(const struct v3d_compiler *compiler, return c; } +static int +type_size_vec4(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + static void v3d_lower_nir(struct v3d_compile *c) { struct nir_lower_tex_options tex_options = { .lower_txd = true, - .lower_rect = false, /* XXX */ + .lower_tg4_broadcom_swizzle = true, + + .lower_rect = false, /* XXX: Use this on V3D 3.x */ .lower_txp = ~0, /* Apply swizzles to all samplers. */ .swizzle_result = ~0, @@ -561,17 +563,35 @@ v3d_lower_nir(struct v3d_compile *c) tex_options.saturate_t |= 1 << i; if (c->key->tex[i].clamp_r) tex_options.saturate_r |= 1 << i; + if (c->key->tex[i].return_size == 16) { + tex_options.lower_tex_packing[i] = + nir_lower_tex_packing_16; + } + } + + /* CS textures may not have return_size reflecting the shadow state. */ + nir_foreach_uniform_variable(var, c->s) { + const struct glsl_type *type = glsl_without_array(var->type); + unsigned array_len = MAX2(glsl_get_length(var->type), 1); + + if (!glsl_type_is_sampler(type) || + !glsl_sampler_type_is_shadow(type)) + continue; + + for (int i = 0; i < array_len; i++) { + tex_options.lower_tex_packing[var->data.binding + i] = + nir_lower_tex_packing_16; + } } NIR_PASS_V(c->s, nir_lower_tex, &tex_options); -} + NIR_PASS_V(c->s, nir_lower_system_values); -static void -v3d_lower_nir_late(struct v3d_compile *c) -{ - NIR_PASS_V(c->s, v3d_nir_lower_io, c); - NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c); - NIR_PASS_V(c->s, nir_lower_idiv); + NIR_PASS_V(c->s, nir_lower_vars_to_scratch, + nir_var_function_temp, + 0, + glsl_get_natural_size_align_bytes); + NIR_PASS_V(c->s, v3d_nir_lower_scratch); } static void @@ -590,47 +610,172 @@ v3d_set_prog_data_uniforms(struct v3d_compile *c, count * sizeof(*ulist->contents)); } -/* Copy the compiler UBO range state to the compiled shader, dropping out - * arrays that were never referenced by an indirect load. 
- * - * (Note that QIR dead code elimination of an array access still leaves that - * array alive, though) - */ static void -v3d_set_prog_data_ubo(struct v3d_compile *c, - struct v3d_prog_data *prog_data) +v3d_vs_set_prog_data(struct v3d_compile *c, + struct v3d_vs_prog_data *prog_data) { - if (!c->num_ubo_ranges) - return; + /* The vertex data gets format converted by the VPM so that + * each attribute channel takes up a VPM column. Precompute + * the sizes for the shader record. + */ + for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) { + prog_data->vattr_sizes[i] = c->vattr_sizes[i]; + prog_data->vpm_input_size += c->vattr_sizes[i]; + } - prog_data->num_ubo_ranges = 0; - prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range, - c->num_ubo_ranges); - for (int i = 0; i < c->num_ubo_ranges; i++) { - if (!c->ubo_range_used[i]) - continue; + prog_data->uses_vid = (c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_VERTEX_ID)); + prog_data->uses_iid = (c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_INSTANCE_ID)); + + if (prog_data->uses_vid) + prog_data->vpm_input_size++; + if (prog_data->uses_iid) + prog_data->vpm_input_size++; + + /* Input/output segment size are in sectors (8 rows of 32 bits per + * channel). + */ + prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8; + prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8; + + /* Set us up for shared input/output segments. This is apparently + * necessary for our VCM setup to avoid varying corruption. + */ + prog_data->separate_segments = false; + prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, + prog_data->vpm_input_size); + prog_data->vpm_input_size = 0; + + /* Compute VCM cache size. We set up our program to take up less than + * half of the VPM, so that any set of bin and render programs won't + * run out of space. We need space for at least one input segment, + * and then allocate the rest to output segments (one for the current + * program, the rest to VCM). The valid range of the VCM cache size + * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4 + * batches. + */ + assert(c->devinfo->vpm_size); + int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8; + int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size; + int half_vpm = vpm_size_in_sectors / 2; + int vpm_output_sectors = half_vpm - prog_data->vpm_input_size; + int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size; + assert(vpm_output_batches >= 2); + prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4); +} + +static void +v3d_gs_set_prog_data(struct v3d_compile *c, + struct v3d_gs_prog_data *prog_data) +{ + prog_data->num_inputs = c->num_inputs; + memcpy(prog_data->input_slots, c->input_slots, + c->num_inputs * sizeof(*c->input_slots)); - struct v3d_ubo_range *range = &c->ubo_ranges[i]; - prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range; - prog_data->ubo_size += range->size; + /* gl_PrimitiveIdIn is written by the GBG into the first word of the + * VPM output header automatically and the shader will overwrite + * it after reading it if necessary, so it doesn't add to the VPM + * size requirements. 
+ */ + prog_data->uses_pid = (c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_PRIMITIVE_ID)); + + /* Output segment size is in sectors (8 rows of 32 bits per channel) */ + prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8; + + /* Compute SIMD dispatch width and update VPM output size accordingly + * to ensure we can fit our program in memory. Available widths are + * 16, 8, 4, 1. + * + * Notice that at draw time we will have to consider VPM memory + * requirements from other stages and choose a smaller dispatch + * width if needed to fit the program in VPM memory. + */ + prog_data->simd_width = 16; + while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) || + prog_data->simd_width == 2) { + prog_data->simd_width >>= 1; + prog_data->vpm_output_size = + align(prog_data->vpm_output_size, 2) / 2; } + assert(prog_data->vpm_output_size <= 16); + assert(prog_data->simd_width != 2); - if (prog_data->ubo_size) { - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", - vir_get_stage_name(c), - c->program_id, c->variant_id, - prog_data->ubo_size / 4); - } + prog_data->out_prim_type = c->s->info.gs.output_primitive; + prog_data->num_invocations = c->s->info.gs.invocations; +} + +static void +v3d_set_fs_prog_data_inputs(struct v3d_compile *c, + struct v3d_fs_prog_data *prog_data) +{ + prog_data->num_inputs = c->num_inputs; + memcpy(prog_data->input_slots, c->input_slots, + c->num_inputs * sizeof(*c->input_slots)); + + STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) > + (V3D_MAX_FS_INPUTS - 1) / 24); + for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) { + if (BITSET_TEST(c->flat_shade_flags, i)) + prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24); + + if (BITSET_TEST(c->noperspective_flags, i)) + prog_data->noperspective_flags[i / 24] |= 1 << (i % 24); + + if (BITSET_TEST(c->centroid_flags, i)) + prog_data->centroid_flags[i / 24] |= 1 << (i % 24); } } +static void +v3d_fs_set_prog_data(struct v3d_compile *c, + struct v3d_fs_prog_data *prog_data) +{ + v3d_set_fs_prog_data_inputs(c, prog_data); + prog_data->writes_z = c->writes_z; + prog_data->disable_ez = !c->s->info.fs.early_fragment_tests; + prog_data->uses_center_w = c->uses_center_w; + prog_data->uses_implicit_point_line_varyings = + c->uses_implicit_point_line_varyings; + prog_data->lock_scoreboard_on_first_thrsw = + c->lock_scoreboard_on_first_thrsw; +} + +static void +v3d_cs_set_prog_data(struct v3d_compile *c, + struct v3d_compute_prog_data *prog_data) +{ + prog_data->shared_size = c->s->info.cs.shared_size; +} + static void v3d_set_prog_data(struct v3d_compile *c, struct v3d_prog_data *prog_data) { + prog_data->threads = c->threads; + prog_data->single_seg = !c->last_thrsw; + prog_data->spill_size = c->spill_size; + prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl; + v3d_set_prog_data_uniforms(c, prog_data); - v3d_set_prog_data_ubo(c, prog_data); + + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: + v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data); + break; + case MESA_SHADER_GEOMETRY: + v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data); + break; + case MESA_SHADER_FRAGMENT: + v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data); + break; + case MESA_SHADER_COMPUTE: + v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data); + break; + default: + unreachable("unsupported shader stage"); + } } static uint64_t * @@ -649,123 +794,451 @@ v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size) return 
qpu_insts; } -uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler, - struct v3d_vs_key *key, - struct v3d_vs_prog_data *prog_data, - nir_shader *s, - int program_id, int variant_id, - uint32_t *final_assembly_size) +static void +v3d_nir_lower_vs_early(struct v3d_compile *c) { - struct v3d_compile *c = vir_compile_init(compiler, &key->base, s, - program_id, variant_id); + /* Split our I/O vars and dead code eliminate the unused + * components. + */ + NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, + nir_var_shader_in | nir_var_shader_out); + uint64_t used_outputs[4] = {0}; + for (int i = 0; i < c->vs_key->num_used_outputs; i++) { + int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]); + int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]); + used_outputs[comp] |= 1ull << slot; + } + NIR_PASS_V(c->s, nir_remove_unused_io_vars, + nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ + NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + v3d_optimize_nir(c->s); + NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); - c->vs_key = key; + /* This must go before nir_lower_io */ + if (c->vs_key->per_vertex_point_size) + NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); - v3d_lower_nir(c); + NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, + (nir_lower_io_options)0); + /* clean up nir_lower_io's deref_var remains and do a constant folding pass + * on the code it generated. + */ + NIR_PASS_V(c->s, nir_opt_dce); + NIR_PASS_V(c->s, nir_opt_constant_folding); +} - if (key->clamp_color) +static void +v3d_nir_lower_gs_early(struct v3d_compile *c) +{ + /* Split our I/O vars and dead code eliminate the unused + * components. + */ + NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, + nir_var_shader_in | nir_var_shader_out); + uint64_t used_outputs[4] = {0}; + for (int i = 0; i < c->gs_key->num_used_outputs; i++) { + int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]); + int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]); + used_outputs[comp] |= 1ull << slot; + } + NIR_PASS_V(c->s, nir_remove_unused_io_vars, + nir_var_shader_out, used_outputs, NULL); /* demotes to globals */ + NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + v3d_optimize_nir(c->s); + NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL); + + /* This must go before nir_lower_io */ + if (c->gs_key->per_vertex_point_size) + NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); + + NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, + (nir_lower_io_options)0); + /* clean up nir_lower_io's deref_var remains */ + NIR_PASS_V(c->s, nir_opt_dce); +} + +static void +v3d_fixup_fs_output_types(struct v3d_compile *c) +{ + nir_foreach_shader_out_variable(var, c->s) { + uint32_t mask = 0; + + switch (var->data.location) { + case FRAG_RESULT_COLOR: + mask = ~0; + break; + case FRAG_RESULT_DATA0: + case FRAG_RESULT_DATA1: + case FRAG_RESULT_DATA2: + case FRAG_RESULT_DATA3: + mask = 1 << (var->data.location - FRAG_RESULT_DATA0); + break; + } + + if (c->fs_key->int_color_rb & mask) { + var->type = + glsl_vector_type(GLSL_TYPE_INT, + glsl_get_components(var->type)); + } else if (c->fs_key->uint_color_rb & mask) { + var->type = + glsl_vector_type(GLSL_TYPE_UINT, + glsl_get_components(var->type)); + } + } +} + +static void +v3d_nir_lower_fs_early(struct v3d_compile *c) +{ + if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb) + v3d_fixup_fs_output_types(c); + + NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c); 
+ + if (c->fs_key->line_smoothing) { + v3d_nir_lower_line_smooth(c->s); + NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + /* The lowering pass can introduce new sysval reads */ + nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s)); + } + + /* If the shader has no non-TLB side effects, we can promote it to + * enabling early_fragment_tests even if the user didn't. + */ + if (!(c->s->info.num_images || + c->s->info.num_ssbos)) { + c->s->info.fs.early_fragment_tests = true; + } +} + +static void +v3d_nir_lower_gs_late(struct v3d_compile *c) +{ + if (c->key->ucp_enables) { + NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables, + false, NULL); + } + + /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */ + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); +} + +static void +v3d_nir_lower_vs_late(struct v3d_compile *c) +{ + if (c->vs_key->clamp_color) NIR_PASS_V(c->s, nir_lower_clamp_color_outputs); - if (key->base.ucp_enables) { - NIR_PASS_V(c->s, nir_lower_clip_vs, key->base.ucp_enables); + if (c->key->ucp_enables) { + NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables, + false, false, NULL); NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); } /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */ NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); +} - v3d_lower_nir_late(c); - v3d_optimize_nir(c->s); - NIR_PASS_V(c->s, nir_convert_from_ssa, true); +static void +v3d_nir_lower_fs_late(struct v3d_compile *c) +{ + if (c->fs_key->light_twoside) + NIR_PASS_V(c->s, nir_lower_two_sided_color, true); - v3d_nir_to_vir(c); + if (c->fs_key->clamp_color) + NIR_PASS_V(c->s, nir_lower_clamp_color_outputs); - v3d_set_prog_data(c, &prog_data->base); + if (c->fs_key->alpha_test) { + NIR_PASS_V(c->s, nir_lower_alpha_test, + c->fs_key->alpha_test_func, + false, NULL); + } - prog_data->base.num_inputs = c->num_inputs; + /* In OpenGL the fragment shader can't read gl_ClipDistance[], but + * Vulkan allows it, in which case the SPIR-V compiler will declare + * VARING_SLOT_CLIP_DIST0 as compact array variable. Pass true as + * the last parameter to always operate with a compact array in both + * OpenGL and Vulkan so we do't have to care about the API we + * are using. + */ + if (c->key->ucp_enables) + NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true); - /* The vertex data gets format converted by the VPM so that - * each attribute channel takes up a VPM column. Precompute - * the sizes for the shader record. + /* Note: FS input scalarizing must happen after + * nir_lower_two_sided_color, which only handles a vec4 at a time. */ - for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) { - prog_data->vattr_sizes[i] = c->vattr_sizes[i]; - prog_data->vpm_input_size += c->vattr_sizes[i]; + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); +} + +static uint32_t +vir_get_max_temps(struct v3d_compile *c) +{ + int max_ip = 0; + vir_for_each_inst_inorder(inst, c) + max_ip++; + + uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip); + + for (int t = 0; t < c->num_temps; t++) { + for (int i = c->temp_start[t]; (i < c->temp_end[t] && + i < max_ip); i++) { + if (i > max_ip) + break; + pressure[i]++; + } } - /* Input/output segment size are in 8x32-bit multiples. 
*/ - prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8; - prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8; + uint32_t max_temps = 0; + for (int i = 0; i < max_ip; i++) + max_temps = MAX2(max_temps, pressure[i]); - prog_data->uses_vid = (s->info.system_values_read & - (1ull << SYSTEM_VALUE_VERTEX_ID)); - prog_data->uses_iid = (s->info.system_values_read & - (1ull << SYSTEM_VALUE_INSTANCE_ID)); + ralloc_free(pressure); - return v3d_return_qpu_insts(c, final_assembly_size); + return max_temps; } -static void -v3d_set_fs_prog_data_inputs(struct v3d_compile *c, - struct v3d_fs_prog_data *prog_data) +enum v3d_dependency_class { + V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0 +}; + +static bool +v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr, + nir_schedule_dependency *dep, + void *user_data) { - prog_data->base.num_inputs = c->num_inputs; - memcpy(prog_data->input_slots, c->input_slots, - c->num_inputs * sizeof(*c->input_slots)); + struct v3d_compile *c = user_data; + + switch (intr->intrinsic) { + case nir_intrinsic_store_output: + /* Writing to location 0 overwrites the value passed in for + * gl_PrimitiveID on geometry shaders + */ + if (c->s->info.stage != MESA_SHADER_GEOMETRY || + nir_intrinsic_base(intr) != 0) + break; + + nir_const_value *const_value = + nir_src_as_const_value(intr->src[1]); + + if (const_value == NULL) + break; + + uint64_t offset = + nir_const_value_as_uint(*const_value, + nir_src_bit_size(intr->src[1])); + if (offset != 0) + break; + + dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0; + dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY; + return true; + + case nir_intrinsic_load_primitive_id: + if (c->s->info.stage != MESA_SHADER_GEOMETRY) + break; + + dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0; + dep->type = NIR_SCHEDULE_READ_DEPENDENCY; + return true; - memcpy(prog_data->flat_shade_flags, c->flat_shade_flags, - sizeof(c->flat_shade_flags)); - memcpy(prog_data->shade_model_flags, c->shade_model_flags, - sizeof(c->shade_model_flags)); + default: + break; + } + + return false; } -uint64_t *v3d_compile_fs(const struct v3d_compiler *compiler, - struct v3d_fs_key *key, - struct v3d_fs_prog_data *prog_data, - nir_shader *s, - int program_id, int variant_id, - uint32_t *final_assembly_size) +static void +v3d_attempt_compile(struct v3d_compile *c) { - struct v3d_compile *c = vir_compile_init(compiler, &key->base, s, - program_id, variant_id); + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: + c->vs_key = (struct v3d_vs_key *) c->key; + break; + case MESA_SHADER_GEOMETRY: + c->gs_key = (struct v3d_gs_key *) c->key; + break; + case MESA_SHADER_FRAGMENT: + c->fs_key = (struct v3d_fs_key *) c->key; + break; + case MESA_SHADER_COMPUTE: + break; + default: + unreachable("unsupported shader stage"); + } - c->fs_key = key; + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: + v3d_nir_lower_vs_early(c); + break; + case MESA_SHADER_GEOMETRY: + v3d_nir_lower_gs_early(c); + break; + case MESA_SHADER_FRAGMENT: + v3d_nir_lower_fs_early(c); + break; + default: + break; + } v3d_lower_nir(c); - if (key->light_twoside) - NIR_PASS_V(c->s, nir_lower_two_sided_color); - - if (key->clamp_color) - NIR_PASS_V(c->s, nir_lower_clamp_color_outputs); - - if (key->alpha_test) { - NIR_PASS_V(c->s, nir_lower_alpha_test, key->alpha_test_func, - false); + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: + v3d_nir_lower_vs_late(c); + break; + case MESA_SHADER_GEOMETRY: + v3d_nir_lower_gs_late(c); + break; + case MESA_SHADER_FRAGMENT: + v3d_nir_lower_fs_late(c); 
+ break; + default: + break; } - if (key->base.ucp_enables) - NIR_PASS_V(c->s, nir_lower_clip_fs, key->base.ucp_enables); + NIR_PASS_V(c->s, v3d_nir_lower_io, c); + NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c); + NIR_PASS_V(c->s, v3d_nir_lower_image_load_store); + NIR_PASS_V(c->s, nir_lower_idiv, nir_lower_idiv_fast); - /* Note: FS input scalarizing must happen after - * nir_lower_two_sided_color, which only handles a vec4 at a time. + v3d_optimize_nir(c->s); + + /* Do late algebraic optimization to turn add(a, neg(b)) back into + * subs, then the mandatory cleanup after algebraic. Note that it may + * produce fnegs, and if so then we need to keep running to squash + * fneg(fneg(a)). */ - NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); + bool more_late_algebraic = true; + while (more_late_algebraic) { + more_late_algebraic = false; + NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late); + NIR_PASS_V(c->s, nir_opt_constant_folding); + NIR_PASS_V(c->s, nir_copy_prop); + NIR_PASS_V(c->s, nir_opt_dce); + NIR_PASS_V(c->s, nir_opt_cse); + } - v3d_lower_nir_late(c); - v3d_optimize_nir(c->s); + NIR_PASS_V(c->s, nir_lower_bool_to_int32); NIR_PASS_V(c->s, nir_convert_from_ssa, true); + struct nir_schedule_options schedule_options = { + /* Schedule for about half our register space, to enable more + * shaders to hit 4 threads. + */ + .threshold = 24, + + /* Vertex shaders share the same memory for inputs and outputs, + * fragement and geometry shaders do not. + */ + .stages_with_shared_io_memory = + (((1 << MESA_ALL_SHADER_STAGES) - 1) & + ~((1 << MESA_SHADER_FRAGMENT) | + (1 << MESA_SHADER_GEOMETRY))), + + .fallback = c->fallback_scheduler, + + .intrinsic_cb = v3d_intrinsic_dependency_cb, + .intrinsic_cb_data = c, + }; + NIR_PASS_V(c->s, nir_schedule, &schedule_options); + v3d_nir_to_vir(c); +} - v3d_set_prog_data(c, &prog_data->base); - v3d_set_fs_prog_data_inputs(c, prog_data); - prog_data->writes_z = (c->s->info.outputs_written & - (1 << FRAG_RESULT_DEPTH)); - prog_data->discard = c->s->info.fs.uses_discard; +uint32_t +v3d_prog_data_size(gl_shader_stage stage) +{ + static const int prog_data_size[] = { + [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data), + [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data), + [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data), + [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data), + }; + + assert(stage >= 0 && + stage < ARRAY_SIZE(prog_data_size) && + prog_data_size[stage]); + + return prog_data_size[stage]; +} + +uint64_t *v3d_compile(const struct v3d_compiler *compiler, + struct v3d_key *key, + struct v3d_prog_data **out_prog_data, + nir_shader *s, + void (*debug_output)(const char *msg, + void *debug_output_data), + void *debug_output_data, + int program_id, int variant_id, + uint32_t *final_assembly_size) +{ + struct v3d_compile *c; + + for (int i = 0; true; i++) { + c = vir_compile_init(compiler, key, s, + debug_output, debug_output_data, + program_id, variant_id, + i > 0 /* fallback_scheduler */); + + v3d_attempt_compile(c); + + if (i > 0 || + c->compilation_result != + V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) + break; + + char *debug_msg; + int ret = asprintf(&debug_msg, + "Using fallback scheduler for %s", + vir_get_stage_name(c)); + + if (ret >= 0) { + if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) + fprintf(stderr, "%s\n", debug_msg); + + c->debug_output(debug_msg, c->debug_output_data); + free(debug_msg); + } + + vir_compile_destroy(c); + } + + struct v3d_prog_data *prog_data; + + prog_data = rzalloc_size(NULL, 
v3d_prog_data_size(c->s->info.stage)); + + v3d_set_prog_data(c, prog_data); + + *out_prog_data = prog_data; + + char *shaderdb; + int ret = asprintf(&shaderdb, + "%s shader: %d inst, %d threads, %d loops, " + "%d uniforms, %d max-temps, %d:%d spills:fills, " + "%d sfu-stalls, %d inst-and-stalls", + vir_get_stage_name(c), + c->qpu_inst_count, + c->threads, + c->loops, + c->num_uniforms, + vir_get_max_temps(c), + c->spills, + c->fills, + c->qpu_inst_stalled_count, + c->qpu_inst_count + c->qpu_inst_stalled_count); + if (ret >= 0) { + if (V3D_DEBUG & V3D_DEBUG_SHADERDB) + fprintf(stderr, "SHADER-DB: %s\n", shaderdb); + + c->debug_output(shaderdb, c->debug_output_data); + free(shaderdb); + } - return v3d_return_qpu_insts(c, final_assembly_size); + return v3d_return_qpu_insts(c, final_assembly_size); } void @@ -774,8 +1247,12 @@ vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst) if (qinst->dst.file == QFILE_TEMP) c->defs[qinst->dst.index] = NULL; + assert(&qinst->link != c->cursor.link); + list_del(&qinst->link); free(qinst); + + c->live_intervals_valid = false; } struct qreg @@ -801,8 +1278,12 @@ vir_follow_movs(struct v3d_compile *c, struct qreg reg) void vir_compile_destroy(struct v3d_compile *c) { + /* Defuse the assert that we aren't removing the cursor's instruction. + */ + c->cursor.link = NULL; + vir_for_each_block(block, c) { - while (!list_empty(&block->instructions)) { + while (!list_is_empty(&block->instructions)) { struct qinst *qinst = list_first_entry(&block->instructions, struct qinst, link); @@ -813,15 +1294,15 @@ vir_compile_destroy(struct v3d_compile *c) ralloc_free(c); } -struct qreg -vir_uniform(struct v3d_compile *c, - enum quniform_contents contents, - uint32_t data) +uint32_t +vir_get_uniform_index(struct v3d_compile *c, + enum quniform_contents contents, + uint32_t data) { for (int i = 0; i < c->num_uniforms; i++) { if (c->uniform_contents[i] == contents && c->uniform_data[i] == data) { - return vir_reg(QFILE_UNIF, i); + return i; } } @@ -842,26 +1323,20 @@ vir_uniform(struct v3d_compile *c, c->uniform_contents[uniform] = contents; c->uniform_data[uniform] = data; - return vir_reg(QFILE_UNIF, uniform); + return uniform; } -void -vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf) +struct qreg +vir_uniform(struct v3d_compile *c, + enum quniform_contents contents, + uint32_t data) { - struct qinst *last_inst = NULL; - - if (!list_empty(&c->cur_block->instructions)) - last_inst = (struct qinst *)c->cur_block->instructions.prev; - - if (src.file != QFILE_TEMP || - !c->defs[src.index] || - last_inst != c->defs[src.index]) { - /* XXX: Make the MOV be the appropriate type */ - last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src); - last_inst = (struct qinst *)c->cur_block->instructions.prev; - } - - vir_set_pf(last_inst, pf); + struct qinst *inst = vir_NOP(c); + inst->qpu.sig.ldunif = true; + inst->uniform = vir_get_uniform_index(c, contents, data); + inst->dst = vir_get_temp(c); + c->defs[inst->dst.index] = inst; + return inst->dst; } #define OPTPASS(func) \ @@ -888,7 +1363,9 @@ vir_optimize(struct v3d_compile *c) bool progress = false; OPTPASS(vir_opt_copy_propagate); + OPTPASS(vir_opt_redundant_flags); OPTPASS(vir_opt_dead_code); + OPTPASS(vir_opt_small_immediates); if (!progress) break; @@ -901,7 +1378,9 @@ const char * vir_get_stage_name(struct v3d_compile *c) { if (c->vs_key && c->vs_key->is_coord) - return "MESA_SHADER_COORD"; + return "MESA_SHADER_VERTEX_BIN"; + else if (c->gs_key && c->gs_key->is_coord) + return 
"MESA_SHADER_GEOMETRY_BIN"; else return gl_shader_stage_name(c->s->info.stage); }