/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <math.h>
#include <stdlib.h>

#include "nir_builder.h"
#include "util/u_vector.h"
/**
 * Lower flrp instructions.
 *
 * Unlike the lowerings that are possible in nir_opt_algebraic, this pass can
 * examine more global information to determine a possibly more efficient
 * lowering for each flrp.
 */
/**
 * Queue a flrp instruction on the list of instructions awaiting removal.
 *
 * \param dead_flrp  Vector of \c nir_alu_instr pointers to append to.
 * \param alu        Instruction to record.
 *
 * NOTE(review): the slot returned by u_vector_add() is written through
 * without a NULL check - confirm it cannot fail (e.g. on allocation failure)
 * for this vector.
 */
static void
append_flrp_to_dead_list(struct u_vector *dead_flrp, struct nir_alu_instr *alu)
{
   struct nir_alu_instr **tail = u_vector_add(dead_flrp);
   *tail = alu;
}
44 * Replace flrp(a, b, c) with ffma(b, c, ffma(-a, c, a)).
47 replace_with_strict_ffma(struct nir_builder
*bld
, struct u_vector
*dead_flrp
,
48 struct nir_alu_instr
*alu
)
50 nir_ssa_def
*const a
= nir_ssa_for_alu_src(bld
, alu
, 0);
51 nir_ssa_def
*const b
= nir_ssa_for_alu_src(bld
, alu
, 1);
52 nir_ssa_def
*const c
= nir_ssa_for_alu_src(bld
, alu
, 2);
54 nir_ssa_def
*const neg_a
= nir_fneg(bld
, a
);
55 nir_instr_as_alu(neg_a
->parent_instr
)->exact
= alu
->exact
;
57 nir_ssa_def
*const inner_ffma
= nir_ffma(bld
, neg_a
, c
, a
);
58 nir_instr_as_alu(inner_ffma
->parent_instr
)->exact
= alu
->exact
;
60 nir_ssa_def
*const outer_ffma
= nir_ffma(bld
, b
, c
, inner_ffma
);
61 nir_instr_as_alu(outer_ffma
->parent_instr
)->exact
= alu
->exact
;
63 nir_ssa_def_rewrite_uses(&alu
->dest
.dest
.ssa
, nir_src_for_ssa(outer_ffma
));
65 /* DO NOT REMOVE the original flrp yet. Many of the lowering choices are
66 * based on other uses of the sources. Removing the flrp may cause the
67 * last flrp in a sequence to make a different, incorrect choice.
69 append_flrp_to_dead_list(dead_flrp
, alu
);
73 * Replace flrp(a, b, c) with a(1-c) + bc.
76 replace_with_strict(struct nir_builder
*bld
, struct u_vector
*dead_flrp
,
77 struct nir_alu_instr
*alu
)
79 nir_ssa_def
*const a
= nir_ssa_for_alu_src(bld
, alu
, 0);
80 nir_ssa_def
*const b
= nir_ssa_for_alu_src(bld
, alu
, 1);
81 nir_ssa_def
*const c
= nir_ssa_for_alu_src(bld
, alu
, 2);
83 nir_ssa_def
*const neg_c
= nir_fneg(bld
, c
);
84 nir_instr_as_alu(neg_c
->parent_instr
)->exact
= alu
->exact
;
86 nir_ssa_def
*const one_minus_c
=
87 nir_fadd(bld
, nir_imm_float(bld
, 1.0f
), neg_c
);
88 nir_instr_as_alu(one_minus_c
->parent_instr
)->exact
= alu
->exact
;
90 nir_ssa_def
*const first_product
= nir_fmul(bld
, a
, one_minus_c
);
91 nir_instr_as_alu(first_product
->parent_instr
)->exact
= alu
->exact
;
93 nir_ssa_def
*const second_product
= nir_fmul(bld
, b
, c
);
94 nir_instr_as_alu(second_product
->parent_instr
)->exact
= alu
->exact
;
96 nir_ssa_def
*const sum
= nir_fadd(bld
, first_product
, second_product
);
97 nir_instr_as_alu(sum
->parent_instr
)->exact
= alu
->exact
;
99 nir_ssa_def_rewrite_uses(&alu
->dest
.dest
.ssa
, nir_src_for_ssa(sum
));
101 /* DO NOT REMOVE the original flrp yet. Many of the lowering choices are
102 * based on other uses of the sources. Removing the flrp may cause the
103 * last flrp in a sequence to make a different, incorrect choice.
105 append_flrp_to_dead_list(dead_flrp
, alu
);
109 * Replace flrp(a, b, c) with a + c(b-a).
112 replace_with_fast(struct nir_builder
*bld
, struct u_vector
*dead_flrp
,
113 struct nir_alu_instr
*alu
)
115 nir_ssa_def
*const a
= nir_ssa_for_alu_src(bld
, alu
, 0);
116 nir_ssa_def
*const b
= nir_ssa_for_alu_src(bld
, alu
, 1);
117 nir_ssa_def
*const c
= nir_ssa_for_alu_src(bld
, alu
, 2);
119 nir_ssa_def
*const neg_a
= nir_fneg(bld
, a
);
120 nir_instr_as_alu(neg_a
->parent_instr
)->exact
= alu
->exact
;
122 nir_ssa_def
*const b_minus_a
= nir_fadd(bld
, b
, neg_a
);
123 nir_instr_as_alu(b_minus_a
->parent_instr
)->exact
= alu
->exact
;
125 nir_ssa_def
*const product
= nir_fmul(bld
, c
, b_minus_a
);
126 nir_instr_as_alu(product
->parent_instr
)->exact
= alu
->exact
;
128 nir_ssa_def
*const sum
= nir_fadd(bld
, a
, product
);
129 nir_instr_as_alu(sum
->parent_instr
)->exact
= alu
->exact
;
131 nir_ssa_def_rewrite_uses(&alu
->dest
.dest
.ssa
, nir_src_for_ssa(sum
));
133 /* DO NOT REMOVE the original flrp yet. Many of the lowering choices are
134 * based on other uses of the sources. Removing the flrp may cause the
135 * last flrp in a sequence to make a different, incorrect choice.
137 append_flrp_to_dead_list(dead_flrp
, alu
);
141 sources_are_constants_with_similar_magnitudes(const nir_alu_instr
*instr
)
143 nir_const_value
*val0
= nir_src_as_const_value(instr
->src
[0].src
);
144 nir_const_value
*val1
= nir_src_as_const_value(instr
->src
[1].src
);
146 if (val0
== NULL
|| val1
== NULL
)
149 const uint8_t *const swizzle0
= instr
->src
[0].swizzle
;
150 const uint8_t *const swizzle1
= instr
->src
[1].swizzle
;
151 const unsigned num_components
= nir_dest_num_components(instr
->dest
.dest
);
153 if (instr
->dest
.dest
.ssa
.bit_size
== 32) {
154 for (unsigned i
= 0; i
< num_components
; i
++) {
158 frexpf(val0
[swizzle0
[i
]].f32
, &exp0
);
159 frexpf(val1
[swizzle1
[i
]].f32
, &exp1
);
161 /* If the difference between exponents is >= 24, then A+B will always
162 * have the value whichever between A and B has the largest absolute
163 * value. So, [0, 23] is the valid range. The smaller the limit
164 * value, the more precision will be maintained at a potential
165 * performance cost. Somewhat arbitrarilly split the range in half.
167 if (abs(exp0
- exp1
) > (23 / 2))
171 for (unsigned i
= 0; i
< num_components
; i
++) {
175 frexp(val0
[swizzle0
[i
]].f64
, &exp0
);
176 frexp(val1
[swizzle1
[i
]].f64
, &exp1
);
178 /* If the difference between exponents is >= 53, then A+B will always
179 * have the value whichever between A and B has the largest absolute
180 * value. So, [0, 52] is the valid range. The smaller the limit
181 * value, the more precision will be maintained at a potential
182 * performance cost. Somewhat arbitrarilly split the range in half.
184 if (abs(exp0
- exp1
) > (52 / 2))
193 convert_flrp_instruction(nir_builder
*bld
,
194 struct u_vector
*dead_flrp
,
199 bld
->cursor
= nir_before_instr(&alu
->instr
);
201 /* There are two methods to implement flrp(x, y, t). The strictly correct
202 * implementation according to the GLSL spec is:
206 * This can also be implemented using two chained FMAs
208 * fma(y, t, fma(-x, t, x))
210 * This method, using either formulation, has better precision when the
211 * difference between x and y is very large. It guarantess that flrp(x, y,
212 * 1) = y. For example, flrp(1e38, 1.0, 1.0) is 1.0. This is correct.
214 * The other possible implementation is:
218 * This can also be formuated as an FMA:
222 * For this implementation, flrp(1e38, 1.0, 1.0) is 0.0. Since 1.0 was
223 * expected, that's a pretty significant error.
225 * The choice made for lowering depends on a number of factors.
227 * - If the flrp is marked precise and FMA is supported:
229 * fma(y, t, fma(-x, t, x))
231 * This is strictly correct (maybe?), and the cost is two FMA
232 * instructions. It at least maintains the flrp(x, y, 1.0) == y
235 * - If the flrp is marked precise and FMA is not supported:
239 * This is strictly correct, and the cost is 4 instructions. If FMA is
240 * supported, this may or may not be reduced to 3 instructions (a
241 * subtract, a multiply, and an FMA)... but in that case the other
242 * formulation should have been used.
246 replace_with_strict_ffma(bld
, dead_flrp
, alu
);
248 replace_with_strict(bld
, dead_flrp
, alu
);
254 * - If x and y are both immediates and the relative magnitude of the
255 * values is similar (such that x-y does not lose too much precision):
259 * We rely on constant folding to eliminate x-y, and we rely on
260 * nir_opt_algebraic to possibly generate an FMA. The cost is either one
261 * FMA or two instructions.
263 if (sources_are_constants_with_similar_magnitudes(alu
)) {
264 replace_with_fast(bld
, dead_flrp
, alu
);
269 if (always_precise
) {
270 replace_with_strict_ffma(bld
, dead_flrp
, alu
);
274 if (always_precise
) {
275 replace_with_strict(bld
, dead_flrp
, alu
);
285 replace_with_fast(bld
, dead_flrp
, alu
);
289 lower_flrp_impl(nir_function_impl
*impl
,
290 struct u_vector
*dead_flrp
,
291 unsigned lowering_mask
,
296 nir_builder_init(&b
, impl
);
298 nir_foreach_block(block
, impl
) {
299 nir_foreach_instr_safe(instr
, block
) {
300 if (instr
->type
== nir_instr_type_alu
) {
301 nir_alu_instr
*const alu
= nir_instr_as_alu(instr
);
303 if (alu
->op
== nir_op_flrp
&&
304 (alu
->dest
.dest
.ssa
.bit_size
& lowering_mask
)) {
305 convert_flrp_instruction(&b
, dead_flrp
, alu
, always_precise
,
312 nir_metadata_preserve(impl
, nir_metadata_block_index
|
313 nir_metadata_dominance
);
317 * \param lowering_mask - Bitwise-or of the bit sizes that need to be lowered
318 * (e.g., 16 | 64 if only 16-bit and 64-bit flrp need
320 * \param always_precise - Always require precise lowering for flrp. This
321 * will always lower flrp to (a * (1 - c)) + (b * c).
322 * \param have_ffma - Set to true if the GPU has an FFMA instruction that
326 nir_lower_flrp(nir_shader
*shader
,
327 unsigned lowering_mask
,
331 struct u_vector dead_flrp
;
333 if (!u_vector_init(&dead_flrp
, sizeof(struct nir_alu_instr
*), 64))
336 nir_foreach_function(function
, shader
) {
337 if (function
->impl
) {
338 lower_flrp_impl(function
->impl
, &dead_flrp
, lowering_mask
,
339 always_precise
, have_ffma
);
343 /* Progress was made if the dead list is not empty. Remove all the
344 * instructions from the dead list.
346 const bool progress
= u_vector_length(&dead_flrp
) != 0;
348 struct nir_alu_instr
**instr
;
349 u_vector_foreach(instr
, &dead_flrp
)
350 nir_instr_remove(&(*instr
)->instr
);
352 u_vector_finish(&dead_flrp
);