/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24 #include "nir_instr_set.h"
25 #include "nir_search_helpers.h"
26 #include "nir_builder.h"
27 #include "util/u_vector.h"
/* Partial redundancy elimination of compares
 *
 * Searches for comparisons of the form 'a cmp b' that dominate arithmetic
 * instructions like 'b - a'.  The comparison is replaced by the arithmetic
 * instruction, and the result is compared with zero.  For example,
 *
 *       vec1 32 ssa_111 = flt 0.37, ssa_110.w
 *       ...
 *       vec1 32 ssa_112 = fadd ssa_110.w, -0.37
 *       ...
 *
 * becomes
 *
 *       vec1 32 ssa_111 = fadd ssa_110.w, -0.37
 *       vec1 32 ssa_112 = flt 0.0, ssa_111
 *       ...
 */
52 * Stack of blocks from the current location in the CFG to the entry point
55 * This is sort of a poor man's dominator tree.
57 struct exec_list blocks
;
59 /** List of freed block_instructions structures that can be reused. */
60 struct exec_list reusable_blocks
;
63 struct block_instructions
{
64 struct exec_node node
;
67 * Set of comparison instructions from the block that are candidates for
68 * being replaced by add instructions.
70 struct u_vector instructions
;
74 block_queue_init(struct block_queue
*bq
)
76 exec_list_make_empty(&bq
->blocks
);
77 exec_list_make_empty(&bq
->reusable_blocks
);
81 block_queue_finish(struct block_queue
*bq
)
83 struct block_instructions
*n
;
85 while ((n
= (struct block_instructions
*) exec_list_pop_head(&bq
->blocks
)) != NULL
) {
86 u_vector_finish(&n
->instructions
);
90 while ((n
= (struct block_instructions
*) exec_list_pop_head(&bq
->reusable_blocks
)) != NULL
) {
95 static struct block_instructions
*
96 push_block(struct block_queue
*bq
)
98 struct block_instructions
*bi
=
99 (struct block_instructions
*) exec_list_pop_head(&bq
->reusable_blocks
);
102 bi
= calloc(1, sizeof(struct block_instructions
));
108 if (!u_vector_init(&bi
->instructions
,
109 sizeof(struct nir_alu_instr
*),
110 8 * sizeof(struct nir_alu_instr
*)))
113 exec_list_push_tail(&bq
->blocks
, &bi
->node
);
119 pop_block(struct block_queue
*bq
, struct block_instructions
*bi
)
121 u_vector_finish(&bi
->instructions
);
122 exec_node_remove(&bi
->node
);
123 exec_list_push_head(&bq
->reusable_blocks
, &bi
->node
);
127 add_instruction_for_block(struct block_instructions
*bi
,
128 struct nir_alu_instr
*alu
)
130 struct nir_alu_instr
**data
=
131 u_vector_add(&bi
->instructions
);
137 rewrite_compare_instruction(nir_builder
*bld
, nir_alu_instr
*orig_cmp
,
138 nir_alu_instr
*orig_add
, bool zero_on_left
)
140 void *const mem_ctx
= ralloc_parent(orig_cmp
);
142 bld
->cursor
= nir_before_instr(&orig_cmp
->instr
);
144 /* This is somewhat tricky. The compare instruction may be something like
145 * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
146 * b). This is problematic because the SSA value for the fneg(a) may not
147 * exist yet at the compare instruction.
149 * We fabricate the operands of the new add. This is done using
150 * information provided by zero_on_left. If zero_on_left is true, we know
151 * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)). If the
152 * original compare instruction was (fcmp, a, b), x = b and y = -a. If
153 * zero_on_left is false, the resulting compare instruction is (fcmp,
154 * (fadd, x, y), 0.0) and x = a and y = -b.
156 nir_ssa_def
*const a
= nir_ssa_for_alu_src(bld
, orig_cmp
, 0);
157 nir_ssa_def
*const b
= nir_ssa_for_alu_src(bld
, orig_cmp
, 1);
159 nir_ssa_def
*const fadd
= zero_on_left
160 ? nir_fadd(bld
, b
, nir_fneg(bld
, a
))
161 : nir_fadd(bld
, a
, nir_fneg(bld
, b
));
163 nir_ssa_def
*const zero
=
164 nir_imm_floatN_t(bld
, 0.0, orig_add
->dest
.dest
.ssa
.bit_size
);
166 nir_ssa_def
*const cmp
= zero_on_left
167 ? nir_build_alu(bld
, orig_cmp
->op
, zero
, fadd
, NULL
, NULL
)
168 : nir_build_alu(bld
, orig_cmp
->op
, fadd
, zero
, NULL
, NULL
);
170 /* Generating extra moves of the results is the easy way to make sure the
171 * writemasks match the original instructions. Later optimization passes
172 * will clean these up. This is similar to nir_replace_instr (in
175 nir_alu_instr
*mov_add
= nir_alu_instr_create(mem_ctx
, nir_op_imov
);
176 mov_add
->dest
.write_mask
= orig_add
->dest
.write_mask
;
177 nir_ssa_dest_init(&mov_add
->instr
, &mov_add
->dest
.dest
,
178 orig_add
->dest
.dest
.ssa
.num_components
,
179 orig_add
->dest
.dest
.ssa
.bit_size
, NULL
);
180 mov_add
->src
[0].src
= nir_src_for_ssa(fadd
);
182 nir_builder_instr_insert(bld
, &mov_add
->instr
);
184 nir_alu_instr
*mov_cmp
= nir_alu_instr_create(mem_ctx
, nir_op_imov
);
185 mov_cmp
->dest
.write_mask
= orig_cmp
->dest
.write_mask
;
186 nir_ssa_dest_init(&mov_cmp
->instr
, &mov_cmp
->dest
.dest
,
187 orig_cmp
->dest
.dest
.ssa
.num_components
,
188 orig_cmp
->dest
.dest
.ssa
.bit_size
, NULL
);
189 mov_cmp
->src
[0].src
= nir_src_for_ssa(cmp
);
191 nir_builder_instr_insert(bld
, &mov_cmp
->instr
);
193 nir_ssa_def_rewrite_uses(&orig_cmp
->dest
.dest
.ssa
,
194 nir_src_for_ssa(&mov_cmp
->dest
.dest
.ssa
));
195 nir_ssa_def_rewrite_uses(&orig_add
->dest
.dest
.ssa
,
196 nir_src_for_ssa(&mov_add
->dest
.dest
.ssa
));
198 /* We know these have no more uses because we just rewrote them all, so we
201 nir_instr_remove(&orig_cmp
->instr
);
202 nir_instr_remove(&orig_add
->instr
);
206 comparison_pre_block(nir_block
*block
, struct block_queue
*bq
, nir_builder
*bld
)
208 bool progress
= false;
210 struct block_instructions
*bi
= push_block(bq
);
214 /* Starting with the current block, examine each instruction. If the
215 * instruction is a comparison that matches the '±a cmp ±b' pattern, add it
216 * to the block_instructions::instructions set. If the instruction is an
217 * add instruction, walk up the block queue looking at the stored
218 * instructions. If a matching comparison is found, move the addition and
219 * replace the comparison with a different comparison based on the result
220 * of the addition. All of the blocks in the queue are guaranteed to be
221 * dominators of the current block.
223 * After processing the current block, recurse into the blocks dominated by
226 nir_foreach_instr_safe(instr
, block
) {
227 if (instr
->type
!= nir_instr_type_alu
)
230 struct nir_alu_instr
*const alu
= nir_instr_as_alu(instr
);
232 if (alu
->dest
.dest
.ssa
.num_components
!= 1)
235 if (alu
->dest
.saturate
)
238 static const uint8_t swizzle
[4] = { 0, 0, 0, 0 };
242 /* If the instruction is fadd, check it against comparison
243 * instructions that dominate it.
245 struct block_instructions
*b
=
246 (struct block_instructions
*) exec_list_get_head_raw(&bq
->blocks
);
248 while (b
->node
.next
!= NULL
) {
250 bool rewrote_compare
= false;
252 u_vector_foreach(a
, &b
->instructions
) {
253 nir_alu_instr
*const cmp
= *a
;
258 /* The operands of both instructions are, with some liberty,
259 * commutative. Check all four permutations. The third and
260 * fourth permutations are negations of the first two.
262 if ((nir_alu_srcs_equal(cmp
, alu
, 0, 0) &&
263 nir_alu_srcs_negative_equal(cmp
, alu
, 1, 1)) ||
264 (nir_alu_srcs_equal(cmp
, alu
, 0, 1) &&
265 nir_alu_srcs_negative_equal(cmp
, alu
, 1, 0))) {
266 /* These are the cases where (A cmp B) matches either (A +
269 * A cmp B <=> A + -B cmp 0
271 rewrite_compare_instruction(bld
, cmp
, alu
, false);
274 rewrote_compare
= true;
276 } else if ((nir_alu_srcs_equal(cmp
, alu
, 1, 0) &&
277 nir_alu_srcs_negative_equal(cmp
, alu
, 0, 1)) ||
278 (nir_alu_srcs_equal(cmp
, alu
, 1, 1) &&
279 nir_alu_srcs_negative_equal(cmp
, alu
, 0, 0))) {
280 /* This is the case where (A cmp B) matches (B + -A) or (-A
283 * A cmp B <=> 0 cmp B + -A
285 rewrite_compare_instruction(bld
, cmp
, alu
, true);
288 rewrote_compare
= true;
293 /* Bail after a compare in the most dominating block is found.
294 * This is necessary because 'alu' has been removed from the
295 * instruction stream. Should there be a matching compare in
296 * another block, calling rewrite_compare_instruction again will
297 * try to operate on a node that is not in the list as if it were
300 * FINISHME: There may be opportunity for additional optimization
301 * here. I discovered this problem due to a shader in Guacamelee.
302 * It may be possible to rewrite the matching compares that are
303 * encountered later to reuse the result from the compare that was
304 * first rewritten. It's also possible that this is just taken
305 * care of by calling the optimization pass repeatedly.
307 if (rewrote_compare
) {
312 b
= (struct block_instructions
*) b
->node
.next
;
322 /* If the instruction is a comparison that is used by an if-statement
323 * and neither operand is immediate value 0, add it to the set.
325 if (is_used_by_if(alu
) &&
326 is_not_const_zero(alu
, 0, 1, swizzle
) &&
327 is_not_const_zero(alu
, 1, 1, swizzle
))
328 add_instruction_for_block(bi
, alu
);
337 for (unsigned i
= 0; i
< block
->num_dom_children
; i
++) {
338 nir_block
*child
= block
->dom_children
[i
];
340 if (comparison_pre_block(child
, bq
, bld
))
350 nir_opt_comparison_pre_impl(nir_function_impl
*impl
)
352 struct block_queue bq
;
355 block_queue_init(&bq
);
356 nir_builder_init(&bld
, impl
);
358 nir_metadata_require(impl
, nir_metadata_dominance
);
360 const bool progress
=
361 comparison_pre_block(nir_start_block(impl
), &bq
, &bld
);
363 block_queue_finish(&bq
);
366 nir_metadata_preserve(impl
, nir_metadata_block_index
|
367 nir_metadata_dominance
);
373 nir_opt_comparison_pre(nir_shader
*shader
)
375 bool progress
= false;
377 nir_foreach_function(function
, shader
) {
379 progress
|= nir_opt_comparison_pre_impl(function
->impl
);