/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <math.h>

#include "nir.h"
#include "nir_builder.h"
#include "util/u_vector.h"
/**
 * Lower flrp instructions.
 *
 * Unlike the lowerings that are possible in nir_opt_algebraic, this pass can
 * examine more global information to determine a possibly more efficient
 * lowering for each flrp.
 */
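
/**
 * Remember a lowered flrp so it can be removed after the whole shader has
 * been processed.  The instruction is left in place for now because later
 * lowering decisions look at the other uses of its sources.
 */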
static void
append_flrp_to_dead_list(struct u_vector *dead_flrp, struct nir_alu_instr *alu)
{
   struct nir_alu_instr **tail = u_vector_add(dead_flrp);
   *tail = alu;
}
/**
 * Replace flrp(a, b, c) with ffma(b, c, ffma(-a, c, a)).
 */
static void
replace_with_strict_ffma(struct nir_builder *bld, struct u_vector *dead_flrp,
                         struct nir_alu_instr *alu)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   nir_ssa_def *const neg_a = nir_fneg(bld, a);
   nir_instr_as_alu(neg_a->parent_instr)->exact = alu->exact;

   nir_ssa_def *const inner_ffma = nir_ffma(bld, neg_a, c, a);
   nir_instr_as_alu(inner_ffma->parent_instr)->exact = alu->exact;

   nir_ssa_def *const outer_ffma = nir_ffma(bld, b, c, inner_ffma);
   nir_instr_as_alu(outer_ffma->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(outer_ffma));

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}
/**
 * Replace flrp(a, b, c) with ffma(a, (1 - c), bc).
 */
static void
replace_with_single_ffma(struct nir_builder *bld, struct u_vector *dead_flrp,
                         struct nir_alu_instr *alu)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   nir_ssa_def *const neg_c = nir_fneg(bld, c);
   nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *const one_minus_c =
      nir_fadd(bld, nir_imm_floatN_t(bld, 1.0f, c->bit_size), neg_c);
   nir_instr_as_alu(one_minus_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *const b_times_c = nir_fmul(bld, b, c);
   nir_instr_as_alu(b_times_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *const final_ffma = nir_ffma(bld, a, one_minus_c, b_times_c);
   nir_instr_as_alu(final_ffma->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(final_ffma));

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}
/**
 * Replace flrp(a, b, c) with a(1-c) + bc.
 */
static void
replace_with_strict(struct nir_builder *bld, struct u_vector *dead_flrp,
                    struct nir_alu_instr *alu)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   nir_ssa_def *const neg_c = nir_fneg(bld, c);
   nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *const one_minus_c =
      nir_fadd(bld, nir_imm_floatN_t(bld, 1.0f, c->bit_size), neg_c);
   nir_instr_as_alu(one_minus_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *const first_product = nir_fmul(bld, a, one_minus_c);
   nir_instr_as_alu(first_product->parent_instr)->exact = alu->exact;

   nir_ssa_def *const second_product = nir_fmul(bld, b, c);
   nir_instr_as_alu(second_product->parent_instr)->exact = alu->exact;

   nir_ssa_def *const sum = nir_fadd(bld, first_product, second_product);
   nir_instr_as_alu(sum->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum));

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}
/**
 * Replace flrp(a, b, c) with a + c(b-a).
 */
static void
replace_with_fast(struct nir_builder *bld, struct u_vector *dead_flrp,
                  struct nir_alu_instr *alu)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   nir_ssa_def *const neg_a = nir_fneg(bld, a);
   nir_instr_as_alu(neg_a->parent_instr)->exact = alu->exact;

   nir_ssa_def *const b_minus_a = nir_fadd(bld, b, neg_a);
   nir_instr_as_alu(b_minus_a->parent_instr)->exact = alu->exact;

   nir_ssa_def *const product = nir_fmul(bld, c, b_minus_a);
   nir_instr_as_alu(product->parent_instr)->exact = alu->exact;

   nir_ssa_def *const sum = nir_fadd(bld, a, product);
   nir_instr_as_alu(sum->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum));

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}
/**
 * Replace flrp(a, b, c) with (b*c ± c) + a => b*c + (a ± c)
 *
 * \note This only works if a = ±1.
 */
static void
replace_with_expanded_ffma_and_add(struct nir_builder *bld,
                                   struct u_vector *dead_flrp,
                                   struct nir_alu_instr *alu, bool subtract_c)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   nir_ssa_def *const b_times_c = nir_fmul(bld, b, c);
   nir_instr_as_alu(b_times_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *inner_sum;

   if (subtract_c) {
      nir_ssa_def *const neg_c = nir_fneg(bld, c);
      nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;

      inner_sum = nir_fadd(bld, a, neg_c);
   } else {
      inner_sum = nir_fadd(bld, a, c);
   }

   nir_instr_as_alu(inner_sum->parent_instr)->exact = alu->exact;

   nir_ssa_def *const outer_sum = nir_fadd(bld, inner_sum, b_times_c);
   nir_instr_as_alu(outer_sum->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(outer_sum));

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}
/**
 * Determines whether a swizzled source is constant w/ all components the same.
 *
 * The value of the constant is stored in \c result.
 *
 * \return
 * True if all components of the swizzled source are the same constant.
 * Otherwise false is returned.
 */
static bool
all_same_constant(const nir_alu_instr *instr, unsigned src, double *result)
{
   nir_const_value *val = nir_src_as_const_value(instr->src[src].src);

   if (!val)
      return false;

   const uint8_t *const swizzle = instr->src[src].swizzle;
   const unsigned num_components = nir_dest_num_components(instr->dest.dest);

   if (instr->dest.dest.ssa.bit_size == 32) {
      const float first = val[swizzle[0]].f32;

      for (unsigned i = 1; i < num_components; i++) {
         if (val[swizzle[i]].f32 != first)
            return false;
      }

      *result = first;
   } else {
      const double first = val[swizzle[0]].f64;

      for (unsigned i = 1; i < num_components; i++) {
         if (val[swizzle[i]].f64 != first)
            return false;
      }

      *result = first;
   }

   return true;
}
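
/**
 * Determines whether sources 0 and 1 are both constants whose exponents are
 * close enough that computing their difference does not lose too much
 * precision.  This is used to decide when the cheap x + t(y - x) lowering is
 * acceptable for constant endpoints.
 */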
static bool
sources_are_constants_with_similar_magnitudes(const nir_alu_instr *instr)
{
   nir_const_value *val0 = nir_src_as_const_value(instr->src[0].src);
   nir_const_value *val1 = nir_src_as_const_value(instr->src[1].src);

   if (val0 == NULL || val1 == NULL)
      return false;

   const uint8_t *const swizzle0 = instr->src[0].swizzle;
   const uint8_t *const swizzle1 = instr->src[1].swizzle;
   const unsigned num_components = nir_dest_num_components(instr->dest.dest);

   if (instr->dest.dest.ssa.bit_size == 32) {
      for (unsigned i = 0; i < num_components; i++) {
         int exp0;
         int exp1;

         frexpf(val0[swizzle0[i]].f32, &exp0);
         frexpf(val1[swizzle1[i]].f32, &exp1);

         /* If the difference between exponents is >= 24, then A+B will always
          * have the value whichever between A and B has the largest absolute
          * value.  So, [0, 23] is the valid range.  The smaller the limit
          * value, the more precision will be maintained at a potential
          * performance cost.  Somewhat arbitrarily split the range in half.
          */
         if (abs(exp0 - exp1) > (23 / 2))
            return false;
      }
   } else {
      for (unsigned i = 0; i < num_components; i++) {
         int exp0;
         int exp1;

         frexp(val0[swizzle0[i]].f64, &exp0);
         frexp(val1[swizzle1[i]].f64, &exp1);

         /* If the difference between exponents is >= 53, then A+B will always
          * have the value whichever between A and B has the largest absolute
          * value.  So, [0, 52] is the valid range.  The smaller the limit
          * value, the more precision will be maintained at a potential
          * performance cost.  Somewhat arbitrarily split the range in half.
          */
         if (abs(exp0 - exp1) > (52 / 2))
            return false;
      }
   }

   return true;
}
/**
 * Counts of similar types of nir_op_flrp instructions
 *
 * If a similar instruction fits into more than one category, it will only be
 * counted once.  The assumption is that no other instruction will have all
 * sources the same, or CSE would have removed one of the instructions.
 */
struct similar_flrp_stats {
   unsigned src2;
   unsigned src0_and_src2;
   unsigned src1_and_src2;
};
/**
 * Collect counts of similar FLRP instructions.
 *
 * This function only cares about similar instructions that have src2 in
 * common.
 */
static void
get_similar_flrp_stats(nir_alu_instr *alu, struct similar_flrp_stats *st)
{
   memset(st, 0, sizeof(*st));

   nir_foreach_use(other_use, alu->src[2].src.ssa) {
      /* Is the use also a flrp? */
      nir_instr *const other_instr = other_use->parent_instr;
      if (other_instr->type != nir_instr_type_alu)
         continue;

      /* Ahem... don't match the instruction with itself. */
      if (other_instr == &alu->instr)
         continue;

      nir_alu_instr *const other_alu = nir_instr_as_alu(other_instr);
      if (other_alu->op != nir_op_flrp)
         continue;

      /* Does the other flrp use source 2 from the first flrp as its source 2
       * as well?
       */
      if (!nir_alu_srcs_equal(alu, other_alu, 2, 2))
         continue;

      if (nir_alu_srcs_equal(alu, other_alu, 0, 0))
         st->src0_and_src2++;
      else if (nir_alu_srcs_equal(alu, other_alu, 1, 1))
         st->src1_and_src2++;
      else
         st->src2++;
   }
}
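
/**
 * Lower a single flrp instruction, selecting the cheapest replacement that
 * still satisfies the precision requirements.  See the block comment below
 * for the full decision tree.
 */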
static void
convert_flrp_instruction(nir_builder *bld,
                         struct u_vector *dead_flrp,
                         nir_alu_instr *alu,
                         bool always_precise,
                         bool have_ffma)
{
   bld->cursor = nir_before_instr(&alu->instr);

   /* There are two methods to implement flrp(x, y, t).  The strictly correct
    * implementation according to the GLSL spec is:
    *
    *    x(1 - t) + yt
    *
    * This can also be implemented using two chained FMAs:
    *
    *    fma(y, t, fma(-x, t, x))
    *
    * This method, using either formulation, has better precision when the
    * difference between x and y is very large.  It guarantees that flrp(x, y,
    * 1) = y.  For example, flrp(1e38, 1.0, 1.0) is 1.0.  This is correct.
    *
    * The other possible implementation is:
    *
    *    x + t(y - x)
    *
    * This can also be formulated as an FMA:
    *
    *    fma(y - x, t, x)
    *
    * For this implementation, flrp(1e38, 1.0, 1.0) is 0.0.  Since 1.0 was
    * expected, that's a pretty significant error.
    *
    * The choice made for lowering depends on a number of factors.
    *
    * - If the flrp is marked precise and FMA is supported:
    *
    *        fma(y, t, fma(-x, t, x))
    *
    *   This is strictly correct (maybe?), and the cost is two FMA
    *   instructions.  It at least maintains the flrp(x, y, 1.0) == y
    *   condition.
    *
    * - If the flrp is marked precise and FMA is not supported:
    *
    *        x(1 - t) + yt
    *
    *   This is strictly correct, and the cost is 4 instructions.  If FMA is
    *   supported, this may or may not be reduced to 3 instructions (a
    *   subtract, a multiply, and an FMA)... but in that case the other
    *   formulation should have been used.
    */
   if (alu->exact) {
      if (have_ffma)
         replace_with_strict_ffma(bld, dead_flrp, alu);
      else
         replace_with_strict(bld, dead_flrp, alu);

      return;
   }

   /*
    * - If x and y are both immediates and the relative magnitude of the
    *   values is similar (such that x-y does not lose too much precision):
    *
    *        x + t(y - x)
    *
    *   We rely on constant folding to eliminate x-y, and we rely on
    *   nir_opt_algebraic to possibly generate an FMA.  The cost is either one
    *   FMA or two instructions.
    */
   if (sources_are_constants_with_similar_magnitudes(alu)) {
      replace_with_fast(bld, dead_flrp, alu);
      return;
   }

   /*
    * - If x = 1:
    *
    *        yt + (x - t)
    *
    * - If x = -1:
    *
    *        yt + (x + t)
    *
    *   In both cases, x is used in place of ±1 for simplicity.  Both forms
    *   lend to ffma generation on platforms that support ffma.
    */
   double src0_as_constant;
   if (all_same_constant(alu, 0, &src0_as_constant)) {
      if (src0_as_constant == 1.0) {
         replace_with_expanded_ffma_and_add(bld, dead_flrp, alu,
                                            true /* subtract t */);
         return;
      } else if (src0_as_constant == -1.0) {
         replace_with_expanded_ffma_and_add(bld, dead_flrp, alu,
                                            false /* add t */);
         return;
      }
   }

   /*
    * - If y = ±1:
    *
    *        x(1 - t) + yt
    *
    *   In this case the multiply in yt will be eliminated by
    *   nir_opt_algebraic.  If FMA is supported, this results in fma(x, (1 -
    *   t), ±t) for two instructions.  If FMA is not supported, then the cost
    *   is 3 instructions.  We rely on nir_opt_algebraic to generate the FMA
    *   instructions as well.
    *
    *   Another possible replacement grouping of this expression may be better
    *   on some platforms in some circumstances, but it is probably dependent
    *   on scheduling.  Further investigation may be required.
    */
   double src1_as_constant;
   if ((all_same_constant(alu, 1, &src1_as_constant) &&
        (src1_as_constant == -1.0 || src1_as_constant == 1.0))) {
      replace_with_strict(bld, dead_flrp, alu);
      return;
   }

   if (have_ffma) {
      if (always_precise) {
         replace_with_strict_ffma(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is supported and another flrp(x, _, t) exists:
       *
       *        fma(y, t, fma(-x, t, x))
       *
       *   The hope is that the inner FMA calculation will be shared with the
       *   other lowered flrp.  This results in two FMA instructions for the
       *   first flrp and one FMA instruction for each additional flrp.  It
       *   also means that the live range for x might be complete after the
       *   inner ffma instead of after the last flrp.
       */
      struct similar_flrp_stats st;

      get_similar_flrp_stats(alu, &st);
      if (st.src0_and_src2 > 0) {
         replace_with_strict_ffma(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is supported and another flrp(_, y, t) exists:
       *
       *        fma(x, (1 - t), yt)
       *
       *   The hope is that the (1 - t) and the yt will be shared with the
       *   other lowered flrp.  This results in 3 instructions for the first
       *   flrp and 1 for each additional flrp.
       */
      if (st.src1_and_src2 > 0) {
         replace_with_single_ffma(bld, dead_flrp, alu);
         return;
      }
   } else {
      if (always_precise) {
         replace_with_strict(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is not supported and another flrp(x, _, t) exists:
       *
       *        x(1 - t) + yt
       *
       *   The hope is that the x(1 - t) will be shared with the other lowered
       *   flrp.  This results in 4 instructions for the first flrp and 2 for
       *   each additional flrp.
       *
       * - If FMA is not supported and another flrp(_, y, t) exists:
       *
       *        x(1 - t) + yt
       *
       *   The hope is that the (1 - t) and the yt will be shared with the
       *   other lowered flrp.  This results in 4 instructions for the first
       *   flrp and 2 for each additional flrp.
       */
      struct similar_flrp_stats st;

      get_similar_flrp_stats(alu, &st);
      if (st.src0_and_src2 > 0 || st.src1_and_src2 > 0) {
         replace_with_strict(bld, dead_flrp, alu);
         return;
      }
   }

   /*
    * - If t is constant:
    *
    *        x(1 - t) + yt
    *
    *   The cost is three instructions without FMA or two instructions with
    *   FMA.  This is the same cost as the imprecise lowering, but it gives
    *   the instruction scheduler a little more freedom.
    *
    *   There is no need to handle t = 0.5 specially.  nir_opt_algebraic
    *   already has optimizations to convert 0.5x + 0.5y to 0.5(x + y).
    */
   if (alu->src[2].src.ssa->parent_instr->type == nir_instr_type_load_const) {
      replace_with_strict(bld, dead_flrp, alu);
      return;
   }

   /*
    * - Otherwise, fall back to the cheap lowering:
    *
    *        x + t(y - x)
    */
   replace_with_fast(bld, dead_flrp, alu);
}
static void
lower_flrp_impl(nir_function_impl *impl,
                struct u_vector *dead_flrp,
                unsigned lowering_mask,
                bool always_precise,
                bool have_ffma)
{
   nir_builder b;

   nir_builder_init(&b, impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type == nir_instr_type_alu) {
            nir_alu_instr *const alu = nir_instr_as_alu(instr);

            if (alu->op == nir_op_flrp &&
                (alu->dest.dest.ssa.bit_size & lowering_mask)) {
               convert_flrp_instruction(&b, dead_flrp, alu, always_precise,
                                        have_ffma);
            }
         }
      }
   }

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);
}
/**
 * \param lowering_mask - Bitwise-or of the bit sizes that need to be lowered
 *                        (e.g., 16 | 64 if only 16-bit and 64-bit flrp need
 *                        lowering).
 * \param always_precise - Always require precise lowering for flrp.  This
 *                         will always lower flrp to (a * (1 - c)) + (b * c).
 * \param have_ffma - Set to true if the GPU has an FFMA instruction that
 *                    should be used.
 */
bool
nir_lower_flrp(nir_shader *shader,
               unsigned lowering_mask,
               bool always_precise,
               bool have_ffma)
{
   struct u_vector dead_flrp;

   if (!u_vector_init(&dead_flrp, sizeof(struct nir_alu_instr *), 64))
      return false;

   nir_foreach_function(function, shader) {
      if (function->impl) {
         lower_flrp_impl(function->impl, &dead_flrp, lowering_mask,
                         always_precise, have_ffma);
      }
   }

   /* Progress was made if the dead list is not empty.  Remove all the
    * instructions from the dead list.
    */
   const bool progress = u_vector_length(&dead_flrp) != 0;

   struct nir_alu_instr **instr;
   u_vector_foreach(instr, &dead_flrp)
      nir_instr_remove(&(*instr)->instr);

   u_vector_finish(&dead_flrp);

   return progress;
}
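
/*
 * Example usage (illustrative sketch only; the mask and flags shown here are
 * placeholders that a real backend would derive from its capabilities):
 *
 *    bool progress = false;
 *    NIR_PASS(progress, shader, nir_lower_flrp,
 *             16 | 32 | 64,   // lowering_mask: lower flrp at all bit sizes
 *             false,          // always_precise
 *             true);          // have_ffma
 */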