From: Connor Abbott Date: Wed, 28 Aug 2019 14:56:57 +0000 (+0200) Subject: nir/opt_if: Fix undef handling in opt_split_alu_of_phi() X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=57e0bb8ccc81ece56fd3081cb9a282b088124c6e;p=mesa.git nir/opt_if: Fix undef handling in opt_split_alu_of_phi() The pass assumed that "Most ALU ops produce an undefined result if any source is undef" which is completely untrue. Due to how we lower if statements to selects and then optimize on those selects later, we simply cannot make that assumption. In particular this pass tried to replace an ior of undef and true, which had been generated by optimizing a select which itself came from flattening an if statement, to undef causing a miscompilation for a CTS test with radeonsi NIR. We fix this by always doing what the non-undef path did, i.e. duplicate the instruction twice. If there are cases where the instruction before the loop can be folded away due to having an undef source, we should add these to opt_undef instead. The comment above the pass says that if the phi source from before the loop is undef, and we can fold the instruction before the loop to undef, then we can ignore sources of the original instruction that don't dominate the block before the loop because we don't need them to create the instruction before the loop. This is incorrect, because the instruction at the bottom of the loop would get those sources from the wrong loop iteration. The code never actually did what the comment said, so we only have to update the comment to match what the pass actually does. We also update the example to more closely match what most actual loops look like after vtn and peephole_select. There are no shader-db changes with i965, radeonsi NIR, or radv. With anv and my vkpipeline-db there's only one change: total instructions in shared programs: 14125290 -> 14125300 (<.01%) instructions in affected programs: 2598 -> 2608 (0.38%) helped: 0 HURT: 1 total cycles in shared programs: 2051473437 -> 2051473397 (<.01%) cycles in affected programs: 36697 -> 36657 (-0.11%) helped: 1 HURT: 0 Fixes KHR-GL45.shader_subroutine.control_flow_and_returned_subroutine_values_used_as_subroutine_input with radeonsi NIR. --- diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c index e360c274bc9..4d391325050 100644 --- a/src/compiler/nir/nir_opt_if.c +++ b/src/compiler/nir/nir_opt_if.c @@ -309,35 +309,29 @@ alu_instr_is_type_conversion(const nir_alu_instr *alu) * * - At least one source of the instruction is a phi node from the header block. * - * and either this rule + * - The phi node selects a constant or undef from the block before the loop. * - * - The phi node selects undef from the block before the loop and a value - * from the continue block of the loop. - * - * or these two rules - * - * - The phi node selects a constant from the block before the loop. - * - * - The non-phi source of the ALU instruction comes from a block that + * - Any non-phi sources of the ALU instruction come from a block that * dominates the block before the loop. The most common failure mode for * this check is sources that are generated in the loop header block. * - * The split process moves the original ALU instruction to the bottom of the - * loop. The phi node source is replaced with the value from the phi node - * selected from the continue block (i.e., the non-undef value). A new phi - * node is added to the header block that selects either undef from the block - * before the loop or the result of the (moved) ALU instruction. + * The split process splits the original ALU instruction into two, one at the + * bottom of the loop and one at the block before the loop. The instruction + * before the loop computes the value on the first iteration, and the + * instruction at the bottom computes the value on the second, third, and so + * on. A new phi node is added to the header block that selects either the + * instruction before the loop or the one at the end, and uses of the original + * instruction are replaced by this phi. * * The splitting transforms a loop like: * - * vec1 32 ssa_7 = undefined * vec1 32 ssa_8 = load_const (0x00000001) * vec1 32 ssa_10 = load_const (0x00000000) * // succs: block_1 * loop { * block block_1: * // preds: block_0 block_4 - * vec1 32 ssa_11 = phi block_0: ssa_7, block_4: ssa_15 + * vec1 32 ssa_11 = phi block_0: ssa_10, block_4: ssa_15 * vec1 32 ssa_12 = phi block_0: ssa_1, block_4: ssa_15 * vec1 32 ssa_13 = phi block_0: ssa_10, block_4: ssa_16 * vec1 32 ssa_14 = iadd ssa_11, ssa_8 @@ -348,27 +342,22 @@ alu_instr_is_type_conversion(const nir_alu_instr *alu) * * into: * - * vec1 32 ssa_7 = undefined * vec1 32 ssa_8 = load_const (0x00000001) * vec1 32 ssa_10 = load_const (0x00000000) + * vec1 32 ssa_22 = iadd ssa_10, ssa_8 * // succs: block_1 * loop { * block block_1: * // preds: block_0 block_4 - * vec1 32 ssa_11 = phi block_0: ssa_7, block_4: ssa_15 + * vec1 32 ssa_11 = phi block_0: ssa_10, block_4: ssa_15 * vec1 32 ssa_12 = phi block_0: ssa_1, block_4: ssa_15 * vec1 32 ssa_13 = phi block_0: ssa_10, block_4: ssa_16 - * vec1 32 ssa_21 = phi block_0: sss_7, block_4: ssa_20 + * vec1 32 ssa_21 = phi block_0: ssa_22, block_4: ssa_20 * vec1 32 ssa_15 = b32csel ssa_13, ssa_21, ssa_12 * ... * vec1 32 ssa_20 = iadd ssa_15, ssa_8 * // succs: block_1 * } - * - * If the phi does not select an undef, the instruction is duplicated in the - * loop continue block (as in the undef case) and in the previous block. When - * the ALU instruction is duplicated in the previous block, the correct source - * must be selected from the phi node. */ static bool opt_split_alu_of_phi(nir_builder *b, nir_loop *loop) @@ -394,19 +383,12 @@ opt_split_alu_of_phi(nir_builder *b, nir_loop *loop) nir_alu_instr *const alu = nir_instr_as_alu(instr); - /* Most ALU ops produce an undefined result if any source is undef. - * However, operations like bcsel only produce undefined results of the - * first operand is undef. Even in the undefined case, the result - * should be one of the other two operands, so the result of the bcsel - * should never be replaced with undef. - * - * nir_op_vec{2,3,4} and nir_op_mov are excluded because they can easily - * lead to infinite optimization loops. + /* nir_op_vec{2,3,4} and nir_op_mov are excluded because they can easily + * lead to infinite optimization loops. Splitting comparisons can lead + * to loop unrolling not recognizing loop termintators, and type + * conversions also lead to regressions. */ - if (alu->op == nir_op_bcsel || - alu->op == nir_op_b32csel || - alu->op == nir_op_fcsel || - alu->op == nir_op_vec2 || + if (alu->op == nir_op_vec2 || alu->op == nir_op_vec3 || alu->op == nir_op_vec4 || alu->op == nir_op_mov || @@ -477,26 +459,9 @@ opt_split_alu_of_phi(nir_builder *b, nir_loop *loop) if (has_phi_src_from_prev_block && all_non_phi_exist_in_prev_block && (is_prev_result_undef || is_prev_result_const)) { nir_block *const continue_block = find_continue_block(loop); - nir_ssa_def *prev_value; - if (!is_prev_result_undef) { - b->cursor = nir_after_block(prev_block); - prev_value = clone_alu_and_replace_src_defs(b, alu, prev_srcs); - } else { - /* Since the undef used as the source of the original ALU - * instruction may have different number of components or - * bit size than the result of that instruction, a new - * undef must be created. - */ - nir_ssa_undef_instr *undef = - nir_ssa_undef_instr_create(b->shader, - alu->dest.dest.ssa.num_components, - alu->dest.dest.ssa.bit_size); - - nir_instr_insert_after_block(prev_block, &undef->instr); - - prev_value = &undef->def; - } + b->cursor = nir_after_block(prev_block); + nir_ssa_def *prev_value = clone_alu_and_replace_src_defs(b, alu, prev_srcs); /* Make a copy of the original ALU instruction. Replace the sources * of the new instruction that read a phi with an undef source from