From: Jason Ekstrand Date: Wed, 25 Jul 2018 02:32:27 +0000 (-0700) Subject: nir: Add a array-of-vector variable shrinking pass X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=be8d009908ed32e3d15a9192b67a2020b99496a8;p=mesa.git nir: Add a array-of-vector variable shrinking pass This pass looks for variables with vector or array-of-vector types and narrows the type to only the components used. Reviewed-by: Caio Marcelo de Oliveira Filho --- diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 84d197028e5..442a1c9c79f 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2659,6 +2659,7 @@ void nir_dump_cfg(nir_shader *shader, FILE *fp); int nir_gs_count_vertices(const nir_shader *shader); +bool nir_shrink_vec_array_vars(nir_shader *shader, nir_variable_mode modes); bool nir_split_array_vars(nir_shader *shader, nir_variable_mode modes); bool nir_split_var_copies(nir_shader *shader); bool nir_split_per_member_structs(nir_shader *shader); diff --git a/src/compiler/nir/nir_split_vars.c b/src/compiler/nir/nir_split_vars.c index 788ca83b5e5..6cd835cb013 100644 --- a/src/compiler/nir/nir_split_vars.c +++ b/src/compiler/nir/nir_split_vars.c @@ -26,6 +26,9 @@ #include "nir_deref.h" #include "nir_vla.h" +/* Needed for _mesa_bitcount() */ +#include "main/macros.h" + struct split_var_state { void *mem_ctx; @@ -856,3 +859,717 @@ nir_split_array_vars(nir_shader *shader, nir_variable_mode modes) return progress; } + +struct array_level_usage { + unsigned array_len; + + /* The value UINT_MAX will be used to indicate an indirect */ + unsigned max_read; + unsigned max_written; + + /* True if there is a copy that isn't to/from a shrinkable array */ + bool has_external_copy; + struct set *levels_copied; +}; + +struct vec_var_usage { + /* Convenience set of all components this variable has */ + nir_component_mask_t all_comps; + + nir_component_mask_t comps_read; + nir_component_mask_t comps_written; + + nir_component_mask_t comps_kept; + + /* True if there is a copy that isn't to/from a shrinkable vector */ + bool has_external_copy; + struct set *vars_copied; + + unsigned num_levels; + struct array_level_usage levels[0]; +}; + +static struct vec_var_usage * +get_vec_var_usage(nir_variable *var, + struct hash_table *var_usage_map, + bool add_usage_entry, void *mem_ctx) +{ + struct hash_entry *entry = _mesa_hash_table_search(var_usage_map, var); + if (entry) + return entry->data; + + if (!add_usage_entry) + return NULL; + + /* Check to make sure that we are working with an array of vectors. We + * don't bother to shrink single vectors because we figure that we can + * clean it up better with SSA than by inserting piles of vecN instructions + * to compact results. + */ + int num_levels = num_array_levels_in_array_of_vector_type(var->type); + if (num_levels < 1) + return NULL; /* Not an array of vectors */ + + struct vec_var_usage *usage = + rzalloc_size(mem_ctx, sizeof(*usage) + + num_levels * sizeof(usage->levels[0])); + + usage->num_levels = num_levels; + const struct glsl_type *type = var->type; + for (unsigned i = 0; i < num_levels; i++) { + usage->levels[i].array_len = glsl_get_length(type); + type = glsl_get_array_element(type); + } + assert(glsl_type_is_vector_or_scalar(type)); + + usage->all_comps = (1 << glsl_get_components(type)) - 1; + + _mesa_hash_table_insert(var_usage_map, var, usage); + + return usage; +} + +static struct vec_var_usage * +get_vec_deref_usage(nir_deref_instr *deref, + struct hash_table *var_usage_map, + nir_variable_mode modes, + bool add_usage_entry, void *mem_ctx) +{ + if (!(deref->mode & modes)) + return NULL; + + return get_vec_var_usage(nir_deref_instr_get_variable(deref), + var_usage_map, add_usage_entry, mem_ctx); +} + +static void +mark_deref_used(nir_deref_instr *deref, + nir_component_mask_t comps_read, + nir_component_mask_t comps_written, + nir_deref_instr *copy_deref, + struct hash_table *var_usage_map, + nir_variable_mode modes, + void *mem_ctx) +{ + if (!(deref->mode & modes)) + return; + + nir_variable *var = nir_deref_instr_get_variable(deref); + + struct vec_var_usage *usage = + get_vec_var_usage(var, var_usage_map, true, mem_ctx); + if (!usage) + return; + + usage->comps_read |= comps_read & usage->all_comps; + usage->comps_written |= comps_written & usage->all_comps; + + struct vec_var_usage *copy_usage = NULL; + if (copy_deref) { + copy_usage = get_vec_deref_usage(copy_deref, var_usage_map, modes, + true, mem_ctx); + if (copy_usage) { + if (usage->vars_copied == NULL) { + usage->vars_copied = _mesa_set_create(mem_ctx, _mesa_hash_pointer, + _mesa_key_pointer_equal); + } + _mesa_set_add(usage->vars_copied, copy_usage); + } else { + usage->has_external_copy = true; + } + } + + nir_deref_path path; + nir_deref_path_init(&path, deref, mem_ctx); + + nir_deref_path copy_path; + if (copy_usage) + nir_deref_path_init(©_path, copy_deref, mem_ctx); + + unsigned copy_i = 0; + for (unsigned i = 0; i < usage->num_levels; i++) { + struct array_level_usage *level = &usage->levels[i]; + nir_deref_instr *deref = path.path[i + 1]; + assert(deref->deref_type == nir_deref_type_array || + deref->deref_type == nir_deref_type_array_wildcard); + + unsigned max_used; + if (deref->deref_type == nir_deref_type_array) { + nir_const_value *const_index = + nir_src_as_const_value(deref->arr.index); + max_used = const_index ? const_index->u32[0] : UINT_MAX; + } else { + /* For wildcards, we read or wrote the whole thing. */ + assert(deref->deref_type == nir_deref_type_array_wildcard); + max_used = level->array_len - 1; + + if (copy_usage) { + /* Match each wildcard level with the level on copy_usage */ + for (; copy_path.path[copy_i + 1]; copy_i++) { + if (copy_path.path[copy_i + 1]->deref_type == + nir_deref_type_array_wildcard) + break; + } + struct array_level_usage *copy_level = + ©_usage->levels[copy_i++]; + + if (level->levels_copied == NULL) { + level->levels_copied = + _mesa_set_create(mem_ctx, _mesa_hash_pointer, + _mesa_key_pointer_equal); + } + _mesa_set_add(level->levels_copied, copy_level); + } else { + /* We have a wildcard and it comes from a variable we aren't + * tracking; flag it and we'll know to not shorten this array. + */ + level->has_external_copy = true; + } + } + + if (comps_written) + level->max_written = MAX2(level->max_written, max_used); + if (comps_read) + level->max_read = MAX2(level->max_read, max_used); + } +} + +static bool +src_is_load_deref(nir_src src, nir_src deref_src) +{ + assert(src.is_ssa); + assert(deref_src.is_ssa); + + if (src.ssa->parent_instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *load = nir_instr_as_intrinsic(src.ssa->parent_instr); + if (load->intrinsic != nir_intrinsic_load_deref) + return false; + + assert(load->src[0].is_ssa); + + return load->src[0].ssa == deref_src.ssa; +} + +/* Returns all non-self-referential components of a store instruction. A + * component is self-referential if it comes from the same component of a load + * instruction on the same deref. If the only data in a particular component + * of a variable came directly from that component then it's undefined. The + * only way to get defined data into a component of a variable is for it to + * get written there by something outside or from a different component. + * + * This is a fairly common pattern in shaders that come from either GLSL IR or + * GLSLang because both glsl_to_nir and GLSLang implement write-masking with + * load-vec-store. + */ +static nir_component_mask_t +get_non_self_referential_store_comps(nir_intrinsic_instr *store) +{ + nir_component_mask_t comps = nir_intrinsic_write_mask(store); + + assert(store->src[1].is_ssa); + nir_instr *src_instr = store->src[1].ssa->parent_instr; + if (src_instr->type != nir_instr_type_alu) + return comps; + + nir_alu_instr *src_alu = nir_instr_as_alu(src_instr); + + if (src_alu->op == nir_op_imov || + src_alu->op == nir_op_fmov) { + /* If it's just a swizzle of a load from the same deref, discount any + * channels that don't move in the swizzle. + */ + if (src_is_load_deref(src_alu->src[0].src, store->src[0])) { + for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) { + if (src_alu->src[0].swizzle[i] == i) + comps &= ~(1u << i); + } + } + } else if (src_alu->op == nir_op_vec2 || + src_alu->op == nir_op_vec3 || + src_alu->op == nir_op_vec4) { + /* If it's a vec, discount any channels that are just loads from the + * same deref put in the same spot. + */ + for (unsigned i = 0; i < nir_op_infos[src_alu->op].num_inputs; i++) { + if (src_is_load_deref(src_alu->src[i].src, store->src[0]) && + src_alu->src[i].swizzle[0] == i) + comps &= ~(1u << i); + } + } + + return comps; +} + +static void +find_used_components_impl(nir_function_impl *impl, + struct hash_table *var_usage_map, + nir_variable_mode modes, + void *mem_ctx) +{ + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_deref: + mark_deref_used(nir_src_as_deref(intrin->src[0]), + nir_ssa_def_components_read(&intrin->dest.ssa), 0, + NULL, var_usage_map, modes, mem_ctx); + break; + + case nir_intrinsic_store_deref: + mark_deref_used(nir_src_as_deref(intrin->src[0]), + 0, get_non_self_referential_store_comps(intrin), + NULL, var_usage_map, modes, mem_ctx); + break; + + case nir_intrinsic_copy_deref: { + /* Just mark everything used for copies. */ + nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]); + nir_deref_instr *src = nir_src_as_deref(intrin->src[1]); + mark_deref_used(dst, 0, ~0, src, var_usage_map, modes, mem_ctx); + mark_deref_used(src, ~0, 0, dst, var_usage_map, modes, mem_ctx); + break; + } + + default: + break; + } + } + } +} + +static bool +shrink_vec_var_list(struct exec_list *vars, + struct hash_table *var_usage_map) +{ + /* Initialize the components kept field of each variable. This is the + * AND of the components written and components read. If a component is + * written but never read, it's dead. If it is read but never written, + * then all values read are undefined garbage and we may as well not read + * them. + * + * The same logic applies to the array length. We make the array length + * the minimum needed required length between read and write and plan to + * discard any OOB access. The one exception here is indirect writes + * because we don't know where they will land and we can't shrink an array + * with indirect writes because previously in-bounds writes may become + * out-of-bounds and have undefined behavior. + * + * Also, if we have a copy that to/from something we can't shrink, we need + * to leave components and array_len of any wildcards alone. + */ + nir_foreach_variable(var, vars) { + struct vec_var_usage *usage = + get_vec_var_usage(var, var_usage_map, false, NULL); + if (!usage) + continue; + + assert(usage->comps_kept == 0); + if (usage->has_external_copy) + usage->comps_kept = usage->all_comps; + else + usage->comps_kept = usage->comps_read & usage->comps_written; + + for (unsigned i = 0; i < usage->num_levels; i++) { + struct array_level_usage *level = &usage->levels[i]; + assert(level->array_len > 0); + + if (level->max_written == UINT_MAX || level->has_external_copy) + continue; /* Can't shrink */ + + unsigned max_used = MIN2(level->max_read, level->max_written); + level->array_len = MIN2(max_used, level->array_len - 1) + 1; + } + } + + /* In order for variable copies to work, we have to have the same data type + * on the source and the destination. In order to satisfy this, we run a + * little fixed-point algorithm to transitively ensure that we get enough + * components and array elements for this to hold for all copies. + */ + bool fp_progress; + do { + fp_progress = false; + nir_foreach_variable(var, vars) { + struct vec_var_usage *var_usage = + get_vec_var_usage(var, var_usage_map, false, NULL); + if (!var_usage || !var_usage->vars_copied) + continue; + + struct set_entry *copy_entry; + set_foreach(var_usage->vars_copied, copy_entry) { + struct vec_var_usage *copy_usage = (void *)copy_entry->key; + if (copy_usage->comps_kept != var_usage->comps_kept) { + nir_component_mask_t comps_kept = + (var_usage->comps_kept | copy_usage->comps_kept); + var_usage->comps_kept = comps_kept; + copy_usage->comps_kept = comps_kept; + fp_progress = true; + } + } + + for (unsigned i = 0; i < var_usage->num_levels; i++) { + struct array_level_usage *var_level = &var_usage->levels[i]; + if (!var_level->levels_copied) + continue; + + set_foreach(var_level->levels_copied, copy_entry) { + struct array_level_usage *copy_level = (void *)copy_entry->key; + if (var_level->array_len != copy_level->array_len) { + unsigned array_len = + MAX2(var_level->array_len, copy_level->array_len); + var_level->array_len = array_len; + copy_level->array_len = array_len; + fp_progress = true; + } + } + } + } + } while (fp_progress); + + bool vars_shrunk = false; + nir_foreach_variable_safe(var, vars) { + struct vec_var_usage *usage = + get_vec_var_usage(var, var_usage_map, false, NULL); + if (!usage) + continue; + + bool shrunk = false; + const struct glsl_type *vec_type = var->type; + for (unsigned i = 0; i < usage->num_levels; i++) { + /* If we've reduced the array to zero elements at some level, just + * set comps_kept to 0 and delete the variable. + */ + if (usage->levels[i].array_len == 0) { + usage->comps_kept = 0; + break; + } + + assert(usage->levels[i].array_len <= glsl_get_length(vec_type)); + if (usage->levels[i].array_len < glsl_get_length(vec_type)) + shrunk = true; + vec_type = glsl_get_array_element(vec_type); + } + assert(glsl_type_is_vector_or_scalar(vec_type)); + + assert(usage->comps_kept == (usage->comps_kept & usage->all_comps)); + if (usage->comps_kept != usage->all_comps) + shrunk = true; + + if (usage->comps_kept == 0) { + /* This variable is dead, remove it */ + vars_shrunk = true; + exec_node_remove(&var->node); + continue; + } + + if (!shrunk) { + /* This variable doesn't need to be shrunk. Remove it from the + * hash table so later steps will ignore it. + */ + _mesa_hash_table_remove_key(var_usage_map, var); + continue; + } + + /* Build the new var type */ + unsigned new_num_comps = _mesa_bitcount(usage->comps_kept); + const struct glsl_type *new_type = + glsl_vector_type(glsl_get_base_type(vec_type), new_num_comps); + for (int i = usage->num_levels - 1; i >= 0; i--) { + assert(usage->levels[i].array_len > 0); + /* If the original type was a matrix type, we'd like to keep that so + * we don't convert matrices into arrays. + */ + if (i == usage->num_levels - 1 && + glsl_type_is_matrix(glsl_without_array(var->type)) && + new_num_comps > 1 && usage->levels[i].array_len > 1) { + new_type = glsl_matrix_type(glsl_get_base_type(new_type), + new_num_comps, + usage->levels[i].array_len); + } else { + new_type = glsl_array_type(new_type, usage->levels[i].array_len); + } + } + var->type = new_type; + + vars_shrunk = true; + } + + return vars_shrunk; +} + +static bool +vec_deref_is_oob(nir_deref_instr *deref, + struct vec_var_usage *usage) +{ + nir_deref_path path; + nir_deref_path_init(&path, deref, NULL); + + bool oob = false; + for (unsigned i = 0; i < usage->num_levels; i++) { + nir_deref_instr *p = path.path[i + 1]; + if (p->deref_type == nir_deref_type_array_wildcard) + continue; + + nir_const_value *const_index = nir_src_as_const_value(p->arr.index); + if (const_index && const_index->u32[0] >= usage->levels[i].array_len) { + oob = true; + break; + } + } + + nir_deref_path_finish(&path); + + return oob; +} + +static bool +vec_deref_is_dead_or_oob(nir_deref_instr *deref, + struct hash_table *var_usage_map, + nir_variable_mode modes) +{ + struct vec_var_usage *usage = + get_vec_deref_usage(deref, var_usage_map, modes, false, NULL); + if (!usage) + return false; + + return usage->comps_kept == 0 || vec_deref_is_oob(deref, usage); +} + +static void +shrink_vec_var_access_impl(nir_function_impl *impl, + struct hash_table *var_usage_map, + nir_variable_mode modes) +{ + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + switch (instr->type) { + case nir_instr_type_deref: { + nir_deref_instr *deref = nir_instr_as_deref(instr); + if (!(deref->mode & modes)) + break; + + /* Clean up any dead derefs we find lying around. They may refer + * to variables we've deleted. + */ + if (nir_deref_instr_remove_if_unused(deref)) + break; + + /* Update the type in the deref to keep the types consistent as + * you walk down the chain. We don't need to check if this is one + * of the derefs we're shrinking because this is a no-op if it + * isn't. The worst that could happen is that we accidentally fix + * an invalid deref. + */ + if (deref->deref_type == nir_deref_type_var) { + deref->type = deref->var->type; + } else if (deref->deref_type == nir_deref_type_array || + deref->deref_type == nir_deref_type_array_wildcard) { + nir_deref_instr *parent = nir_deref_instr_parent(deref); + assert(glsl_type_is_array(parent->type) || + glsl_type_is_matrix(parent->type)); + deref->type = glsl_get_array_element(parent->type); + } + break; + } + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + /* If we have a copy whose source or destination has been deleted + * because we determined the variable was dead, then we just + * delete the copy instruction. If the source variable was dead + * then it was writing undefined garbage anyway and if it's the + * destination variable that's dead then the write isn't needed. + */ + if (intrin->intrinsic == nir_intrinsic_copy_deref) { + nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]); + nir_deref_instr *src = nir_src_as_deref(intrin->src[1]); + if (vec_deref_is_dead_or_oob(dst, var_usage_map, modes) || + vec_deref_is_dead_or_oob(src, var_usage_map, modes)) { + nir_instr_remove(&intrin->instr); + nir_deref_instr_remove_if_unused(dst); + nir_deref_instr_remove_if_unused(src); + } + continue; + } + + if (intrin->intrinsic != nir_intrinsic_load_deref && + intrin->intrinsic != nir_intrinsic_store_deref) + continue; + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + if (!(deref->mode & modes)) + continue; + + struct vec_var_usage *usage = + get_vec_deref_usage(deref, var_usage_map, modes, false, NULL); + if (!usage) + continue; + + if (usage->comps_kept == 0 || vec_deref_is_oob(deref, usage)) { + if (intrin->intrinsic == nir_intrinsic_load_deref) { + nir_ssa_def *u = + nir_ssa_undef(&b, intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + nir_src_for_ssa(u)); + } + nir_instr_remove(&intrin->instr); + nir_deref_instr_remove_if_unused(deref); + continue; + } + + if (intrin->intrinsic == nir_intrinsic_load_deref) { + b.cursor = nir_after_instr(&intrin->instr); + + nir_ssa_def *undef = + nir_ssa_undef(&b, 1, intrin->dest.ssa.bit_size); + nir_ssa_def *vec_srcs[NIR_MAX_VEC_COMPONENTS]; + unsigned c = 0; + for (unsigned i = 0; i < intrin->num_components; i++) { + if (usage->comps_kept & (1u << i)) + vec_srcs[i] = nir_channel(&b, &intrin->dest.ssa, c++); + else + vec_srcs[i] = undef; + } + nir_ssa_def *vec = nir_vec(&b, vec_srcs, intrin->num_components); + + nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, + nir_src_for_ssa(vec), + vec->parent_instr); + + /* The SSA def is now only used by the swizzle. It's safe to + * shrink the number of components. + */ + assert(list_length(&intrin->dest.ssa.uses) == c); + intrin->num_components = c; + intrin->dest.ssa.num_components = c; + } else { + nir_component_mask_t write_mask = + nir_intrinsic_write_mask(intrin); + + unsigned swizzle[NIR_MAX_VEC_COMPONENTS]; + nir_component_mask_t new_write_mask = 0; + unsigned c = 0; + for (unsigned i = 0; i < intrin->num_components; i++) { + if (usage->comps_kept & (1u << i)) { + swizzle[c] = i; + if (write_mask & (1u << i)) + new_write_mask |= 1u << c; + c++; + } + } + + b.cursor = nir_before_instr(&intrin->instr); + + nir_ssa_def *swizzled = + nir_swizzle(&b, intrin->src[1].ssa, swizzle, c, false); + + /* Rewrite to use the compacted source */ + nir_instr_rewrite_src(&intrin->instr, &intrin->src[1], + nir_src_for_ssa(swizzled)); + nir_intrinsic_set_write_mask(intrin, new_write_mask); + intrin->num_components = c; + } + break; + } + + default: + break; + } + } + } +} + +static bool +function_impl_has_vars_with_modes(nir_function_impl *impl, + nir_variable_mode modes) +{ + nir_shader *shader = impl->function->shader; + + if ((modes & nir_var_global) && !exec_list_is_empty(&shader->globals)) + return true; + + if ((modes & nir_var_local) && !exec_list_is_empty(&impl->locals)) + return true; + + return false; +} + +/** Attempt to shrink arrays of vectors + * + * This pass looks at variables which contain a vector or an array (possibly + * multiple dimensions) of vectors and attempts to lower to a smaller vector + * or array. If the pass can prove that a component of a vector (or array of + * vectors) is never really used, then that component will be removed. + * Similarly, the pass attempts to shorten arrays based on what elements it + * can prove are never read or never contain valid data. + */ +bool +nir_shrink_vec_array_vars(nir_shader *shader, nir_variable_mode modes) +{ + assert((modes & (nir_var_global | nir_var_local)) == modes); + + void *mem_ctx = ralloc_context(NULL); + + struct hash_table *var_usage_map = + _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + bool has_vars_to_shrink = false; + nir_foreach_function(function, shader) { + if (!function->impl) + continue; + + /* Don't even bother crawling the IR if we don't have any variables. + * Given that this pass deletes any unused variables, it's likely that + * we will be in this scenario eventually. + */ + if (function_impl_has_vars_with_modes(function->impl, modes)) { + has_vars_to_shrink = true; + find_used_components_impl(function->impl, var_usage_map, + modes, mem_ctx); + } + } + if (!has_vars_to_shrink) { + ralloc_free(mem_ctx); + return false; + } + + bool globals_shrunk = false; + if (modes & nir_var_global) + globals_shrunk = shrink_vec_var_list(&shader->globals, var_usage_map); + + bool progress = false; + nir_foreach_function(function, shader) { + if (!function->impl) + continue; + + bool locals_shrunk = false; + if (modes & nir_var_local) { + locals_shrunk = shrink_vec_var_list(&function->impl->locals, + var_usage_map); + } + + if (globals_shrunk || locals_shrunk) { + shrink_vec_var_access_impl(function->impl, var_usage_map, modes); + + nir_metadata_preserve(function->impl, nir_metadata_block_index | + nir_metadata_dominance); + progress = true; + } + } + + ralloc_free(mem_ctx); + + return progress; +}