From: Rhys Perry
Date: Thu, 23 Apr 2020 15:13:10 +0000 (+0100)
Subject: nir/load_store_vectorize: rework alignment calculation
X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=commitdiff_plain;h=91a8a07a5f0ffdd9eb85634fc9c0cb13812210b2

nir/load_store_vectorize: rework alignment calculation

The pass now tracks alignment as an align_mul/align_offset pair instead
of a single best_align value: it also updates align_offset when
rewriting intrinsics, and it creates better alignment information for
accesses with a constant 0 offset.
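
To illustrate with hypothetical values (not taken from the patch): for
an access whose byte offset is 16*x + 4, i.e. offset_defs_mul = {16}
with a constant offset of 4, the old get_best_align() folded everything
into a single power of two and always wrote align_offset as 0:

    best_align = gcd(4, 16) = 4

whereas the new calc_alignment() keeps both components:

    align_mul    = 1 << (ffsll(16) - 1) = 16
    align_offset = 4 % 16           = 4
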
shader-db (Navi):
Totals from 63 (0.05% of 127638) affected shaders:
SGPRs: 3072 -> 3064 (-0.26%)
VGPRs: 2736 -> 2740 (+0.15%)
CodeSize: 325180 -> 324336 (-0.26%); split: -0.27%, +0.01%
Instrs: 63555 -> 63413 (-0.22%); split: -0.24%, +0.02%

Signed-off-by: Rhys Perry
Reviewed-by: Eric Anholt
Part-of:
---

diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c
index 224323532ac..147b88c3594 100644
--- a/src/compiler/nir/nir_opt_load_store_vectorize.c
+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c
@@ -173,7 +173,8 @@ struct entry {
       uint64_t offset; /* sign-extended */
       int64_t offset_signed;
    };
-   uint32_t best_align;
+   uint32_t align_mul;
+   uint32_t align_offset;
 
    nir_instr *instr;
    nir_intrinsic_instr *intrin;
@@ -537,6 +538,25 @@ aliasing_modes(nir_variable_mode modes)
    return modes;
 }
 
+static void
+calc_alignment(struct entry *entry)
+{
+   uint32_t align_mul = 31;
+   for (unsigned i = 0; i < entry->key->offset_def_count; i++) {
+      if (entry->key->offset_defs_mul[i])
+         align_mul = MIN2(align_mul, ffsll(entry->key->offset_defs_mul[i]));
+   }
+
+   entry->align_mul = 1u << (align_mul - 1);
+   bool has_align = nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL];
+   if (!has_align || entry->align_mul >= nir_intrinsic_align_mul(entry->intrin)) {
+      entry->align_offset = entry->offset % entry->align_mul;
+   } else {
+      entry->align_mul = nir_intrinsic_align_mul(entry->intrin);
+      entry->align_offset = nir_intrinsic_align_offset(entry->intrin);
+   }
+}
+
 static struct entry *
 create_entry(struct vectorize_ctx *ctx,
              const struct intrinsic_info *info,
@@ -546,7 +566,6 @@ create_entry(struct vectorize_ctx *ctx,
    entry->intrin = intrin;
    entry->instr = &intrin->instr;
    entry->info = info;
-   entry->best_align = UINT32_MAX;
    entry->is_store = entry->info->value_src >= 0;
 
    if (entry->info->deref_src >= 0) {
@@ -583,6 +602,8 @@ create_entry(struct vectorize_ctx *ctx,
    if (get_variable_mode(entry) & restrict_modes)
       entry->access |= ACCESS_RESTRICT;
 
+   calc_alignment(entry);
+
    return entry;
 }
 
@@ -623,40 +644,6 @@ writemask_representable(unsigned write_mask, unsigned old_bit_size, unsigned new
    return true;
 }
 
-static uint64_t
-gcd(uint64_t a, uint64_t b)
-{
-   while (b) {
-      uint64_t old_b = b;
-      b = a % b;
-      a = old_b;
-   }
-   return a;
-}
-
-static uint32_t
-get_best_align(struct entry *entry)
-{
-   if (entry->best_align != UINT32_MAX)
-      return entry->best_align;
-
-   uint64_t best_align = entry->offset;
-   for (unsigned i = 0; i < entry->key->offset_def_count; i++) {
-      if (!best_align)
-         best_align = entry->key->offset_defs_mul[i];
-      else if (entry->key->offset_defs_mul[i])
-         best_align = gcd(best_align, entry->key->offset_defs_mul[i]);
-   }
-
-   if (nir_intrinsic_has_align_mul(entry->intrin))
-      best_align = MAX2(best_align, nir_intrinsic_align(entry->intrin));
-
-   /* ensure the result is a power of two that fits in a int32_t */
-   entry->best_align = gcd(best_align, 1u << 30);
-
-   return entry->best_align;
-}
-
 /* Return true if "new_bit_size" is a usable bit size for a vectorized load/store
  * of "low" and "high". */
 static bool
@@ -680,7 +667,8 @@ new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
    if (new_bit_size / common_bit_size > NIR_MAX_VEC_COMPONENTS)
       return false;
 
-   if (!ctx->callback(get_best_align(low), new_bit_size, new_num_components,
+   uint32_t align = low->align_offset ? 1 << (ffs(low->align_offset) - 1) : low->align_mul;
+   if (!ctx->callback(align, new_bit_size, new_num_components,
                       high_offset, low->intrin, high->intrin))
       return false;
 
@@ -749,18 +737,6 @@ static nir_deref_instr *subtract_deref(nir_builder *b, nir_deref_instr *deref, i
       b, deref, nir_imm_intN_t(b, -offset, deref->dest.ssa.bit_size));
 }
 
-static bool update_align(struct entry *entry)
-{
-   if (nir_intrinsic_has_align_mul(entry->intrin)) {
-      unsigned align = get_best_align(entry);
-      if (align != nir_intrinsic_align(entry->intrin)) {
-         nir_intrinsic_set_align(entry->intrin, align, 0);
-         return true;
-      }
-   }
-   return false;
-}
-
 static void
 vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
                 struct entry *low, struct entry *high,
@@ -838,9 +814,9 @@ vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
 
    first->key = low->key;
    first->offset = low->offset;
-   first->best_align = get_best_align(low);
 
-   update_align(first);
+   first->align_mul = low->align_mul;
+   first->align_offset = low->align_offset;
 
    nir_instr_remove(second->instr);
 }
@@ -920,9 +896,9 @@ vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx,
 
    second->key = low->key;
    second->offset = low->offset;
-   second->best_align = get_best_align(low);
 
-   update_align(second);
+   second->align_mul = low->align_mul;
+   second->align_offset = low->align_offset;
 
    list_del(&first->head);
    nir_instr_remove(first->instr);
@@ -1130,6 +1106,18 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
    return true;
 }
 
+static bool
+update_align(struct entry *entry)
+{
+   if (nir_intrinsic_has_align_mul(entry->intrin) &&
+       (entry->align_mul != nir_intrinsic_align_mul(entry->intrin) ||
+        entry->align_offset != nir_intrinsic_align_offset(entry->intrin))) {
+      nir_intrinsic_set_align(entry->intrin, entry->align_mul, entry->align_offset);
+      return true;
+   }
+   return false;
+}
+
 static bool
 vectorize_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct hash_table *ht)
 {
@@ -1152,10 +1140,8 @@ vectorize_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct has
          struct entry *high = *util_dynarray_element(arr, struct entry *, i + 1);
 
          uint64_t diff = high->offset_signed - low->offset_signed;
-         if (diff > get_bit_size(low) / 8u * low->intrin->num_components) {
-            progress |= update_align(low);
+         if (diff > get_bit_size(low) / 8u * low->intrin->num_components)
             continue;
-         }
 
          struct entry *first = low->index < high->index ? low : high;
          struct entry *second = low->index < high->index ? high : low;
@@ -1164,13 +1150,13 @@ vectorize_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct has
             *util_dynarray_element(arr, struct entry *, i) = NULL;
             *util_dynarray_element(arr, struct entry *, i + 1) = low->is_store ? second : first;
             progress = true;
-         } else {
-            progress |= update_align(low);
          }
       }
 
-      struct entry *last = *util_dynarray_element(arr, struct entry *, i);
-      progress |= update_align(last);
+      util_dynarray_foreach(arr, struct entry *, elem) {
+         if (*elem)
+            progress |= update_align(*elem);
+      }
    }
 
    _mesa_hash_table_clear(ht, delete_entry_dynarray);
diff --git a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
index 37664387737..e5ee999d7e4 100644
--- a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
+++ b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
@@ -605,7 +605,7 @@ TEST_F(nir_load_store_vectorize_test, ssbo_load_identical_store_intersecting)
    nir_validate_shader(b->shader, NULL);
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 
-   EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo));
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo));
 
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 }
@@ -619,7 +619,7 @@ TEST_F(nir_load_store_vectorize_test, ssbo_load_identical_store_identical)
    nir_validate_shader(b->shader, NULL);
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 
-   EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo));
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo));
 
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 }
@@ -633,7 +633,7 @@ TEST_F(nir_load_store_vectorize_test, ssbo_store_identical_load_identical)
    nir_validate_shader(b->shader, NULL);
    ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2);
 
-   EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo));
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo));
 
    ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2);
 }
@@ -776,7 +776,7 @@ TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent_memory_barrier)
    nir_validate_shader(b->shader, NULL);
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 
-   EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo));
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo));
 
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 }
@@ -1043,7 +1043,7 @@ TEST_F(nir_load_store_vectorize_test, ssbo_store_adjacent_32_64)
    nir_validate_shader(b->shader, NULL);
    ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2);
 
-   EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo));
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo));
 
    ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2);
 }
@@ -1521,7 +1521,7 @@ TEST_F(nir_load_store_vectorize_test, ssbo_alias0)
    nir_validate_shader(b->shader, NULL);
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 
-   EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo));
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo));
 
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 }
@@ -1619,7 +1619,7 @@ TEST_F(nir_load_store_vectorize_test, ssbo_alias5)
    nir_validate_shader(b->shader, NULL);
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 
-   EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo));
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo));
 
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 }
@@ -1743,7 +1743,7 @@ TEST_F(nir_load_store_vectorize_test, ssbo_offset_overflow_robust)
    nir_validate_shader(b->shader, NULL);
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 
-   EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo, false, nir_var_mem_ssbo));
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo, false, nir_var_mem_ssbo));
 
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 }