/*
 * Copyright © 2018 Red Hat Inc.
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <math.h>

#include "nir.h"
#include "nir_builtin_builder.h"

nir_ssa_def*
nir_cross3(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   unsigned yzx[3] = { 1, 2, 0 };
   unsigned zxy[3] = { 2, 0, 1 };

   return nir_fsub(b, nir_fmul(b, nir_swizzle(b, x, yzx, 3),
                               nir_swizzle(b, y, zxy, 3)),
                   nir_fmul(b, nir_swizzle(b, x, zxy, 3),
                            nir_swizzle(b, y, yzx, 3)));
}

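/* For reference, the scalar expansion the swizzles above compute; a minimal
 * plain-C sketch (illustrative, not part of the NIR API):
 */
static inline void
cross3_sketch(const float x[3], const float y[3], float out[3])
{
   out[0] = x[1] * y[2] - x[2] * y[1]; /* comp 0 of x.yzx*y.zxy - x.zxy*y.yzx */
   out[1] = x[2] * y[0] - x[0] * y[2];
   out[2] = x[0] * y[1] - x[1] * y[0];
}
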
nir_ssa_def*
nir_cross4(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *cross = nir_cross3(b, x, y);

   return nir_vec4(b,
                   nir_channel(b, cross, 0),
                   nir_channel(b, cross, 1),
                   nir_channel(b, cross, 2),
                   nir_imm_intN_t(b, 0, cross->bit_size));
}

nir_ssa_def*
nir_length(nir_builder *b, nir_ssa_def *vec)
{
   nir_ssa_def *finf = nir_imm_floatN_t(b, INFINITY, vec->bit_size);

   nir_ssa_def *abs = nir_fabs(b, vec);
   if (vec->num_components == 1)
      return abs;

   nir_ssa_def *maxc = nir_fmax_abs_vec_comp(b, abs);
   abs = nir_fdiv(b, abs, maxc);
   nir_ssa_def *res = nir_fmul(b, nir_fsqrt(b, nir_fdot(b, abs, abs)), maxc);
   return nir_bcsel(b, nir_feq(b, maxc, finf), maxc, res);
}

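/* A scalar C sketch of the same overflow-safe strategy for three components
 * (illustrative only): dividing through by the largest magnitude keeps the
 * squares from overflowing, and the bcsel above exists because an infinite
 * maxc would otherwise turn into inf/inf = NaN.
 */
static inline float
length3_sketch(const float v[3])
{
   float maxc = fmaxf(fabsf(v[0]), fmaxf(fabsf(v[1]), fabsf(v[2])));
   float x = v[0] / maxc;
   float y = v[1] / maxc;
   float z = v[2] / maxc;
   float res = sqrtf(x * x + y * y + z * z) * maxc;
   return maxc == INFINITY ? maxc : res;
}
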
nir_ssa_def*
nir_fast_length(nir_builder *b, nir_ssa_def *vec)
{
   switch (vec->num_components) {
   case 1: return nir_fsqrt(b, nir_fmul(b, vec, vec));
   case 2: return nir_fsqrt(b, nir_fdot2(b, vec, vec));
   case 3: return nir_fsqrt(b, nir_fdot3(b, vec, vec));
   case 4: return nir_fsqrt(b, nir_fdot4(b, vec, vec));
   default:
      unreachable("Invalid number of components");
   }
}

nir_ssa_def*
nir_nextafter(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *zero = nir_imm_intN_t(b, 0, x->bit_size);
   nir_ssa_def *one = nir_imm_intN_t(b, 1, x->bit_size);

   nir_ssa_def *condeq = nir_feq(b, x, y);
   nir_ssa_def *conddir = nir_flt(b, x, y);
   nir_ssa_def *condzero = nir_feq(b, x, zero);

   /* beware of: +/-0.0 - 1 == NaN */
   nir_ssa_def *xn =
      nir_bcsel(b,
                condzero,
                nir_imm_intN_t(b, (1ull << (x->bit_size - 1)) + 1, x->bit_size),
                nir_isub(b, x, one));

   /* beware of -0.0 + 1 == -0x1p-149 */
   nir_ssa_def *xp = nir_bcsel(b, condzero, one, nir_iadd(b, x, one));

   /* nextafter can be implemented by just +/- 1 on the int value */
   nir_ssa_def *res =
      nir_bcsel(b, nir_ixor(b, conddir, nir_flt(b, x, zero)), xp, xn);

   return nir_nan_check2(b, x, y, nir_bcsel(b, condeq, x, res));
}

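/* The "+/- 1 on the int value" trick above, sketched in scalar C for 32-bit
 * floats moving towards +infinity (illustrative; the zero and NaN cases that
 * the builder version selects around are deliberately omitted here):
 */
static inline float
nextafter_up_sketch(float x)
{
   union { float f; uint32_t u; } v = { .f = x };
   /* IEEE floats of one sign are ordered like their integer encodings, so
    * one ulp towards +inf is u + 1 for positive x and u - 1 for negative x.
    */
   v.u = v.f < 0.0f ? v.u - 1 : v.u + 1;
   return v.f;
}
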
nir_ssa_def*
nir_normalize(nir_builder *b, nir_ssa_def *vec)
{
   if (vec->num_components == 1)
      return nir_fsign(b, vec);

   nir_ssa_def *f0 = nir_imm_floatN_t(b, 0.0, vec->bit_size);
   nir_ssa_def *f1 = nir_imm_floatN_t(b, 1.0, vec->bit_size);
   nir_ssa_def *finf = nir_imm_floatN_t(b, INFINITY, vec->bit_size);

   /* scale the input to increase precision */
   nir_ssa_def *maxc = nir_fmax_abs_vec_comp(b, vec);
   nir_ssa_def *svec = nir_fdiv(b, vec, maxc);
   /* for infinite inputs: +/-1.0 in the infinite components, +/-0.0 in the
    * rest; comparing |vec| against +inf keeps -inf components from being
    * dropped, and copysign restores the original signs
    */
   nir_ssa_def *finfvec =
      nir_copysign(b, nir_bcsel(b, nir_feq(b, nir_fabs(b, vec), finf), f1, f0),
                   vec);

   nir_ssa_def *temp = nir_bcsel(b, nir_feq(b, maxc, finf), finfvec, svec);
   nir_ssa_def *res = nir_fmul(b, temp, nir_frsq(b, nir_fdot(b, temp, temp)));

   return nir_bcsel(b, nir_feq(b, maxc, f0), vec, res);
}

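/* Scalar C sketch of the same scheme for three components (illustrative):
 * scale, substitute the +/-1 / +/-0 pattern when the input has infinities,
 * then multiply by 1/sqrt(dot).
 */
static inline void
normalize3_sketch(const float v[3], float out[3])
{
   float maxc = fmaxf(fabsf(v[0]), fmaxf(fabsf(v[1]), fabsf(v[2])));
   float t[3], dot = 0.0f;
   for (int i = 0; i < 3; i++) {
      t[i] = maxc == INFINITY
                ? copysignf(fabsf(v[i]) == INFINITY ? 1.0f : 0.0f, v[i])
                : v[i] / maxc;
      dot += t[i] * t[i];
   }
   for (int i = 0; i < 3; i++)
      out[i] = maxc == 0.0f ? v[i] : t[i] / sqrtf(dot);
}
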
nir_ssa_def*
nir_rotate(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *shift_mask = nir_imm_int(b, x->bit_size - 1);

   if (y->bit_size != 32)
      y = nir_u2u32(b, y);

   nir_ssa_def *lshift = nir_iand(b, y, shift_mask);
   nir_ssa_def *rshift = nir_isub(b, nir_imm_int(b, x->bit_size), lshift);

   nir_ssa_def *hi = nir_ishl(b, x, lshift);
   nir_ssa_def *lo = nir_ushr(b, x, rshift);

   return nir_ior(b, hi, lo);
}

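/* The same rotate-left in portable scalar C (a sketch for 32 bits; the
 * "-amount & 31" form sidesteps the undefined x >> 32 that a literal
 * translation would hit when amount % 32 == 0):
 */
static inline uint32_t
rotl32_sketch(uint32_t x, uint32_t amount)
{
   amount &= 31;
   return (x << amount) | (x >> (-amount & 31));
}
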
nir_ssa_def*
nir_smoothstep(nir_builder *b, nir_ssa_def *edge0, nir_ssa_def *edge1, nir_ssa_def *x)
{
   nir_ssa_def *f2 = nir_imm_floatN_t(b, 2.0, x->bit_size);
   nir_ssa_def *f3 = nir_imm_floatN_t(b, 3.0, x->bit_size);

   /* t = clamp((x - edge0) / (edge1 - edge0), 0, 1) */
   nir_ssa_def *t =
      nir_fsat(b, nir_fdiv(b, nir_fsub(b, x, edge0),
                           nir_fsub(b, edge1, edge0)));

   /* result = t * t * (3 - 2 * t) */
   return nir_fmul(b, t, nir_fmul(b, t, nir_fsub(b, f3, nir_fmul(b, f2, t))));
}

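/* Scalar reference of the same polynomial (a sketch; the fmin/fmax pair
 * plays the role of fsat):
 */
static inline float
smoothstep_sketch(float edge0, float edge1, float x)
{
   float t = fminf(fmaxf((x - edge0) / (edge1 - edge0), 0.0f), 1.0f);
   return t * t * (3.0f - 2.0f * t);
}
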
nir_ssa_def*
nir_upsample(nir_builder *b, nir_ssa_def *hi, nir_ssa_def *lo)
{
   assert(lo->num_components == hi->num_components);
   assert(lo->bit_size == hi->bit_size);

   nir_ssa_def *res[NIR_MAX_VEC_COMPONENTS];
   for (unsigned i = 0; i < lo->num_components; ++i) {
      nir_ssa_def *vec = nir_vec2(b, nir_channel(b, lo, i), nir_channel(b, hi, i));
      res[i] = nir_pack_bits(b, vec, vec->bit_size * 2);
   }

   return nir_vec(b, res, lo->num_components);
}

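/* Per component this amounts to concatenating hi and lo into a value of
 * twice the bit size; a scalar C sketch for the 32-bit case (the low word
 * lands in the low half because it is component 0 of the vec2 above):
 */
static inline uint64_t
upsample32_sketch(uint32_t hi, uint32_t lo)
{
   return ((uint64_t)hi << 32) | lo;
}
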
/**
 * Compute xs[0] + xs[1] + xs[2] + ... using fadd.
 */
static nir_ssa_def *
build_fsum(nir_builder *b, nir_ssa_def **xs, int terms)
{
   nir_ssa_def *accum = xs[0];

   for (int i = 1; i < terms; i++)
      accum = nir_fadd(b, accum, xs[i]);

   return accum;
}

nir_ssa_def *
nir_atan(nir_builder *b, nir_ssa_def *y_over_x)
{
   const uint32_t bit_size = y_over_x->bit_size;

   nir_ssa_def *abs_y_over_x = nir_fabs(b, y_over_x);
   nir_ssa_def *one = nir_imm_floatN_t(b, 1.0f, bit_size);

   /*
    * range-reduction, first step:
    *
    *      / y_over_x         if |y_over_x| <= 1.0;
    * x = <
    *      \ 1.0 / y_over_x   otherwise
    */
   nir_ssa_def *x = nir_fdiv(b, nir_fmin(b, abs_y_over_x, one),
                             nir_fmax(b, abs_y_over_x, one));

   /*
    * approximate atan by evaluating polynomial:
    *
    * x   * 0.9999793128310355 - x^3  * 0.3326756418091246 +
    * x^5 * 0.1938924977115610 - x^7  * 0.1173503194786851 +
    * x^9 * 0.0536813784310406 - x^11 * 0.0121323213173444
    */
   nir_ssa_def *x_2 = nir_fmul(b, x, x);
   nir_ssa_def *x_3 = nir_fmul(b, x_2, x);
   nir_ssa_def *x_5 = nir_fmul(b, x_3, x_2);
   nir_ssa_def *x_7 = nir_fmul(b, x_5, x_2);
   nir_ssa_def *x_9 = nir_fmul(b, x_7, x_2);
   nir_ssa_def *x_11 = nir_fmul(b, x_9, x_2);

   nir_ssa_def *polynomial_terms[] = {
      nir_fmul_imm(b, x, 0.9999793128310355f),
      nir_fmul_imm(b, x_3, -0.3326756418091246f),
      nir_fmul_imm(b, x_5, 0.1938924977115610f),
      nir_fmul_imm(b, x_7, -0.1173503194786851f),
      nir_fmul_imm(b, x_9, 0.0536813784310406f),
      nir_fmul_imm(b, x_11, -0.0121323213173444f),
   };

   nir_ssa_def *tmp =
      build_fsum(b, polynomial_terms, ARRAY_SIZE(polynomial_terms));

   /* range-reduction fixup */
   tmp = nir_fadd(b, tmp,
                  nir_fmul(b, nir_b2f(b, nir_flt(b, one, abs_y_over_x), bit_size),
                           nir_fadd_imm(b, nir_fmul_imm(b, tmp, -2.0f), M_PI_2)));

   /* sign fixup */
   return nir_fmul(b, tmp, nir_fsign(b, y_over_x));
}

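/* The polynomial above on the range-reduced argument |x| <= 1, written as a
 * scalar C sketch in Horner form (same coefficients; illustrative only):
 */
static inline float
atan_poly_sketch(float x)
{
   float x2 = x * x;
   float p = -0.0121323213173444f;
   p = p * x2 + 0.0536813784310406f;
   p = p * x2 - 0.1173503194786851f;
   p = p * x2 + 0.1938924977115610f;
   p = p * x2 - 0.3326756418091246f;
   p = p * x2 + 0.9999793128310355f;
   return x * p;
}
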
nir_ssa_def *
nir_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x)
{
   assert(y->bit_size == x->bit_size);
   const uint32_t bit_size = x->bit_size;

   nir_ssa_def *zero = nir_imm_floatN_t(b, 0, bit_size);
   nir_ssa_def *one = nir_imm_floatN_t(b, 1, bit_size);

   /* If we're on the left half-plane rotate the coordinates π/2 clock-wise
    * for the y=0 discontinuity to end up aligned with the vertical
    * discontinuity of atan(s/t) along t=0.  This also makes sure that we
    * don't attempt to divide by zero along the vertical line, which may give
    * unspecified results on non-GLSL 4.1-capable hardware.
    */
   nir_ssa_def *flip = nir_fge(b, zero, x);
   nir_ssa_def *s = nir_bcsel(b, flip, nir_fabs(b, x), y);
   nir_ssa_def *t = nir_bcsel(b, flip, y, nir_fabs(b, x));

   /* If the magnitude of the denominator exceeds some huge value, scale down
    * the arguments in order to prevent the reciprocal operation from flushing
    * its result to zero, which would cause precision problems, and for s
    * infinite would cause us to return a NaN instead of the correct finite
    * value.
    *
    * If fmin and fmax are respectively the smallest and largest positive
    * normalized floating point values representable by the implementation,
    * the constants below should be in agreement with:
    *
    *    huge <= 1 / fmin
    *    scale <= 1 / fmin / fmax  (for |t| >= huge)
    *
    * In addition scale should be a negative power of two in order to avoid
    * loss of precision.  The values chosen below should work for most usual
    * floating point representations with at least the dynamic range of ATI's
    * 24-bit representation.
    */
   const double huge_val = bit_size >= 32 ? 1e18 : 16384;
   nir_ssa_def *huge = nir_imm_floatN_t(b, huge_val, bit_size);
   nir_ssa_def *scale = nir_bcsel(b, nir_fge(b, nir_fabs(b, t), huge),
                                  nir_imm_floatN_t(b, 0.25, bit_size), one);
   nir_ssa_def *rcp_scaled_t = nir_frcp(b, nir_fmul(b, t, scale));
   nir_ssa_def *s_over_t = nir_fmul(b, nir_fmul(b, s, scale), rcp_scaled_t);

   /* For |x| = |y| assume tan = 1 even if infinite (i.e. pretend momentarily
    * that ∞/∞ = 1) in order to comply with the rather artificial rules
    * inherited from IEEE 754-2008, namely:
    *
    *    "atan2(±∞, −∞) is ±3π/4
    *     atan2(±∞, +∞) is ±π/4"
    *
    * Note that this is inconsistent with the rules for the neighborhood of
    * zero that are based on iterated limits:
    *
    *    "atan2(±0, −0) is ±π
    *     atan2(±0, +0) is ±0"
    *
    * but GLSL specifically allows implementations to deviate from IEEE rules
    * at (0,0), so we take that license (i.e. pretend that 0/0 = 1 here as
    * well).
    */
   nir_ssa_def *tan = nir_bcsel(b, nir_feq(b, nir_fabs(b, x), nir_fabs(b, y)),
                                one, nir_fabs(b, s_over_t));

   /* Calculate the arctangent and fix up the result if we had flipped the
    * coordinate system.
    */
   nir_ssa_def *arc =
      nir_fadd(b, nir_fmul_imm(b, nir_b2f(b, flip, bit_size), M_PI_2),
               nir_atan(b, tan));

   /* Rather convoluted calculation of the sign of the result.  When x < 0 we
    * cannot use fsign because we need to be able to distinguish between
    * negative and positive zero.  We don't use bitwise arithmetic tricks for
    * consistency with the GLSL front-end.  When x >= 0 rcp_scaled_t will
    * always be non-negative so this won't be able to distinguish between
    * negative and positive zero, but we don't care because atan2 is
    * continuous along the whole positive y = 0 half-line, so it won't affect
    * the result significantly.
    */
   return nir_bcsel(b, nir_flt(b, nir_fmin(b, y, rcp_scaled_t), zero),
                    nir_fneg(b, arc), arc);
}

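/* The flip/sign plumbing above, condensed into a scalar C sketch (leaning on
 * libm atanf instead of the polynomial and omitting the huge/scale
 * machinery; illustrative only):
 */
static inline float
atan2_sketch(float y, float x)
{
   bool flip = x <= 0.0f;
   float s = flip ? fabsf(x) : y;
   float t = flip ? y : fabsf(x);
   /* pretend |x| == |y| divides to exactly 1 so ±inf inputs land on the
    * ±π/4 and ±3π/4 diagonals */
   float tan = fabsf(x) == fabsf(y) ? 1.0f : fabsf(s / t);
   float arc = (flip ? (float)M_PI_2 : 0.0f) + atanf(tan);
   /* signbit() stands in for the fmin(y, rcp_scaled_t) test above: it can
    * tell -0.0 from +0.0 on the left half-plane, which fsign cannot */
   return y < 0.0f || (flip && signbit(y)) ? -arc : arc;
}
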
nir_ssa_def *
nir_get_texture_size(nir_builder *b, nir_tex_instr *tex)
{
   b->cursor = nir_before_instr(&tex->instr);

   nir_tex_instr *txs;

   unsigned num_srcs = 1; /* One for the LOD */
   for (unsigned i = 0; i < tex->num_srcs; i++) {
      if (tex->src[i].src_type == nir_tex_src_texture_deref ||
          tex->src[i].src_type == nir_tex_src_sampler_deref ||
          tex->src[i].src_type == nir_tex_src_texture_offset ||
          tex->src[i].src_type == nir_tex_src_sampler_offset ||
          tex->src[i].src_type == nir_tex_src_texture_handle ||
          tex->src[i].src_type == nir_tex_src_sampler_handle)
         num_srcs++;
   }

   txs = nir_tex_instr_create(b->shader, num_srcs);
   txs->op = nir_texop_txs;
   txs->sampler_dim = tex->sampler_dim;
   txs->is_array = tex->is_array;
   txs->is_shadow = tex->is_shadow;
   txs->is_new_style_shadow = tex->is_new_style_shadow;
   txs->texture_index = tex->texture_index;
   txs->sampler_index = tex->sampler_index;
   txs->dest_type = nir_type_int;

   unsigned idx = 0;
   for (unsigned i = 0; i < tex->num_srcs; i++) {
      if (tex->src[i].src_type == nir_tex_src_texture_deref ||
          tex->src[i].src_type == nir_tex_src_sampler_deref ||
          tex->src[i].src_type == nir_tex_src_texture_offset ||
          tex->src[i].src_type == nir_tex_src_sampler_offset ||
          tex->src[i].src_type == nir_tex_src_texture_handle ||
          tex->src[i].src_type == nir_tex_src_sampler_handle) {
         nir_src_copy(&txs->src[idx].src, &tex->src[i].src, txs);
         txs->src[idx].src_type = tex->src[i].src_type;
         idx++;
      }
   }
   /* Add in an LOD because some back-ends require it */
   txs->src[idx].src = nir_src_for_ssa(nir_imm_int(b, 0));
   txs->src[idx].src_type = nir_tex_src_lod;

   nir_ssa_dest_init(&txs->instr, &txs->dest,
                     nir_tex_instr_dest_size(txs), 32, NULL);
   nir_builder_instr_insert(b, &txs->instr);

   return &txs->dest.ssa;
}

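/* Sketch of a typical caller in a lowering pass (illustrative names, not
 * part of this file): scaling unnormalized texel coordinates down by the
 * texture size.  Assumes coord has the same number of components as the
 * txs result.
 */
static inline nir_ssa_def *
normalize_coords_sketch(nir_builder *b, nir_tex_instr *tex, nir_ssa_def *coord)
{
   nir_ssa_def *size = nir_get_texture_size(b, tex);
   return nir_fmul(b, coord, nir_frcp(b, nir_i2f32(b, size)));
}
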
nir_ssa_def *
nir_get_texture_lod(nir_builder *b, nir_tex_instr *tex)
{
   b->cursor = nir_before_instr(&tex->instr);

   nir_tex_instr *tql;

   unsigned num_srcs = 0;
   for (unsigned i = 0; i < tex->num_srcs; i++) {
      if (tex->src[i].src_type == nir_tex_src_coord ||
          tex->src[i].src_type == nir_tex_src_texture_deref ||
          tex->src[i].src_type == nir_tex_src_sampler_deref ||
          tex->src[i].src_type == nir_tex_src_texture_offset ||
          tex->src[i].src_type == nir_tex_src_sampler_offset ||
          tex->src[i].src_type == nir_tex_src_texture_handle ||
          tex->src[i].src_type == nir_tex_src_sampler_handle)
         num_srcs++;
   }

   tql = nir_tex_instr_create(b->shader, num_srcs);
   tql->op = nir_texop_lod;
   tql->coord_components = tex->coord_components;
   tql->sampler_dim = tex->sampler_dim;
   tql->is_array = tex->is_array;
   tql->is_shadow = tex->is_shadow;
   tql->is_new_style_shadow = tex->is_new_style_shadow;
   tql->texture_index = tex->texture_index;
   tql->sampler_index = tex->sampler_index;
   tql->dest_type = nir_type_float;

   unsigned idx = 0;
   for (unsigned i = 0; i < tex->num_srcs; i++) {
      if (tex->src[i].src_type == nir_tex_src_coord ||
          tex->src[i].src_type == nir_tex_src_texture_deref ||
          tex->src[i].src_type == nir_tex_src_sampler_deref ||
          tex->src[i].src_type == nir_tex_src_texture_offset ||
          tex->src[i].src_type == nir_tex_src_sampler_offset ||
          tex->src[i].src_type == nir_tex_src_texture_handle ||
          tex->src[i].src_type == nir_tex_src_sampler_handle) {
         nir_src_copy(&tql->src[idx].src, &tex->src[i].src, tql);
         tql->src[idx].src_type = tex->src[i].src_type;
         idx++;
      }
   }

   nir_ssa_dest_init(&tql->instr, &tql->dest, 2, 32, NULL);
   nir_builder_instr_insert(b, &tql->instr);

   /* The LOD is the y component of the result */
   return nir_channel(b, &tql->dest.ssa, 1);
}
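
/* Sketch of one way a pass could consume this helper (illustrative only):
 * query the LOD the hardware would have picked and rewrite the implicit-LOD
 * sample into an explicit txl.
 */
static inline void
lower_implicit_lod_sketch(nir_builder *b, nir_tex_instr *tex)
{
   nir_ssa_def *lod = nir_get_texture_lod(b, tex);
   nir_tex_instr_add_src(tex, nir_tex_src_lod, nir_src_for_ssa(lod));
   tex->op = nir_texop_txl;
}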