2 * Copyright © 2015 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 * Jason Ekstrand (jason@jlekstrand.net)
30 #include "nir/nir_builtin_builder.h"
32 #include "vtn_private.h"
33 #include "GLSL.std.450.h"
35 #define M_PIf ((float) M_PI)
36 #define M_PI_2f ((float) M_PI_2)
37 #define M_PI_4f ((float) M_PI_4)
40 build_mat2_det(nir_builder
*b
, nir_ssa_def
*col
[2])
42 unsigned swiz
[2] = {1, 0 };
43 nir_ssa_def
*p
= nir_fmul(b
, col
[0], nir_swizzle(b
, col
[1], swiz
, 2, true));
44 return nir_fsub(b
, nir_channel(b
, p
, 0), nir_channel(b
, p
, 1));
48 build_mat3_det(nir_builder
*b
, nir_ssa_def
*col
[3])
50 unsigned yzx
[3] = {1, 2, 0 };
51 unsigned zxy
[3] = {2, 0, 1 };
55 nir_fmul(b
, nir_swizzle(b
, col
[1], yzx
, 3, true),
56 nir_swizzle(b
, col
[2], zxy
, 3, true)));
59 nir_fmul(b
, nir_swizzle(b
, col
[1], zxy
, 3, true),
60 nir_swizzle(b
, col
[2], yzx
, 3, true)));
62 nir_ssa_def
*diff
= nir_fsub(b
, prod0
, prod1
);
64 return nir_fadd(b
, nir_channel(b
, diff
, 0),
65 nir_fadd(b
, nir_channel(b
, diff
, 1),
66 nir_channel(b
, diff
, 2)));
70 build_mat4_det(nir_builder
*b
, nir_ssa_def
**col
)
72 nir_ssa_def
*subdet
[4];
73 for (unsigned i
= 0; i
< 4; i
++) {
75 for (unsigned j
= 0; j
< 3; j
++)
76 swiz
[j
] = j
+ (j
>= i
);
78 nir_ssa_def
*subcol
[3];
79 subcol
[0] = nir_swizzle(b
, col
[1], swiz
, 3, true);
80 subcol
[1] = nir_swizzle(b
, col
[2], swiz
, 3, true);
81 subcol
[2] = nir_swizzle(b
, col
[3], swiz
, 3, true);
83 subdet
[i
] = build_mat3_det(b
, subcol
);
86 nir_ssa_def
*prod
= nir_fmul(b
, col
[0], nir_vec(b
, subdet
, 4));
88 return nir_fadd(b
, nir_fsub(b
, nir_channel(b
, prod
, 0),
89 nir_channel(b
, prod
, 1)),
90 nir_fsub(b
, nir_channel(b
, prod
, 2),
91 nir_channel(b
, prod
, 3)));
95 build_mat_det(struct vtn_builder
*b
, struct vtn_ssa_value
*src
)
97 unsigned size
= glsl_get_vector_elements(src
->type
);
100 for (unsigned i
= 0; i
< size
; i
++)
101 cols
[i
] = src
->elems
[i
]->def
;
104 case 2: return build_mat2_det(&b
->nb
, cols
);
105 case 3: return build_mat3_det(&b
->nb
, cols
);
106 case 4: return build_mat4_det(&b
->nb
, cols
);
108 vtn_fail("Invalid matrix size");
112 /* Computes the determinate of the submatrix given by taking src and
113 * removing the specified row and column.
116 build_mat_subdet(struct nir_builder
*b
, struct vtn_ssa_value
*src
,
117 unsigned size
, unsigned row
, unsigned col
)
119 assert(row
< size
&& col
< size
);
121 return nir_channel(b
, src
->elems
[1 - col
]->def
, 1 - row
);
123 /* Swizzle to get all but the specified row */
125 for (unsigned j
= 0; j
< 3; j
++)
126 swiz
[j
] = j
+ (j
>= row
);
128 /* Grab all but the specified column */
129 nir_ssa_def
*subcol
[3];
130 for (unsigned j
= 0; j
< size
; j
++) {
132 subcol
[j
- (j
> col
)] = nir_swizzle(b
, src
->elems
[j
]->def
,
133 swiz
, size
- 1, true);
138 return build_mat2_det(b
, subcol
);
141 return build_mat3_det(b
, subcol
);
146 static struct vtn_ssa_value
*
147 matrix_inverse(struct vtn_builder
*b
, struct vtn_ssa_value
*src
)
149 nir_ssa_def
*adj_col
[4];
150 unsigned size
= glsl_get_vector_elements(src
->type
);
152 /* Build up an adjugate matrix */
153 for (unsigned c
= 0; c
< size
; c
++) {
154 nir_ssa_def
*elem
[4];
155 for (unsigned r
= 0; r
< size
; r
++) {
156 elem
[r
] = build_mat_subdet(&b
->nb
, src
, size
, c
, r
);
159 elem
[r
] = nir_fneg(&b
->nb
, elem
[r
]);
162 adj_col
[c
] = nir_vec(&b
->nb
, elem
, size
);
165 nir_ssa_def
*det_inv
= nir_frcp(&b
->nb
, build_mat_det(b
, src
));
167 struct vtn_ssa_value
*val
= vtn_create_ssa_value(b
, src
->type
);
168 for (unsigned i
= 0; i
< size
; i
++)
169 val
->elems
[i
]->def
= nir_fmul(&b
->nb
, adj_col
[i
], det_inv
);
178 build_exp(nir_builder
*b
, nir_ssa_def
*x
)
180 return nir_fexp2(b
, nir_fmul_imm(b
, x
, M_LOG2E
));
184 * Return ln(x) - the natural logarithm of x.
187 build_log(nir_builder
*b
, nir_ssa_def
*x
)
189 return nir_fmul_imm(b
, nir_flog2(b
, x
), 1.0 / M_LOG2E
);
193 * Approximate asin(x) by the formula:
194 * asin~(x) = sign(x) * (pi/2 - sqrt(1 - |x|) * (pi/2 + |x|(pi/4 - 1 + |x|(p0 + |x|p1))))
196 * which is correct to first order at x=0 and x=±1 regardless of the p
197 * coefficients but can be made second-order correct at both ends by selecting
198 * the fit coefficients appropriately. Different p coefficients can be used
199 * in the asin and acos implementation to minimize some relative error metric
203 build_asin(nir_builder
*b
, nir_ssa_def
*x
, float p0
, float p1
)
205 nir_ssa_def
*one
= nir_imm_floatN_t(b
, 1.0f
, x
->bit_size
);
206 nir_ssa_def
*abs_x
= nir_fabs(b
, x
);
208 nir_ssa_def
*p0_plus_xp1
= nir_fadd_imm(b
, nir_fmul_imm(b
, abs_x
, p1
), p0
);
210 nir_ssa_def
*expr_tail
=
211 nir_fadd_imm(b
, nir_fmul(b
, abs_x
,
212 nir_fadd_imm(b
, nir_fmul(b
, abs_x
,
217 return nir_fmul(b
, nir_fsign(b
, x
),
218 nir_fsub(b
, nir_imm_floatN_t(b
, M_PI_2f
, x
->bit_size
),
219 nir_fmul(b
, nir_fsqrt(b
, nir_fsub(b
, one
, abs_x
)),
224 * Compute xs[0] + xs[1] + xs[2] + ... using fadd.
227 build_fsum(nir_builder
*b
, nir_ssa_def
**xs
, int terms
)
229 nir_ssa_def
*accum
= xs
[0];
231 for (int i
= 1; i
< terms
; i
++)
232 accum
= nir_fadd(b
, accum
, xs
[i
]);
238 build_atan(nir_builder
*b
, nir_ssa_def
*y_over_x
)
240 const uint32_t bit_size
= y_over_x
->bit_size
;
242 nir_ssa_def
*abs_y_over_x
= nir_fabs(b
, y_over_x
);
243 nir_ssa_def
*one
= nir_imm_floatN_t(b
, 1.0f
, bit_size
);
246 * range-reduction, first step:
248 * / y_over_x if |y_over_x| <= 1.0;
250 * \ 1.0 / y_over_x otherwise
252 nir_ssa_def
*x
= nir_fdiv(b
, nir_fmin(b
, abs_y_over_x
, one
),
253 nir_fmax(b
, abs_y_over_x
, one
));
256 * approximate atan by evaluating polynomial:
258 * x * 0.9999793128310355 - x^3 * 0.3326756418091246 +
259 * x^5 * 0.1938924977115610 - x^7 * 0.1173503194786851 +
260 * x^9 * 0.0536813784310406 - x^11 * 0.0121323213173444
262 nir_ssa_def
*x_2
= nir_fmul(b
, x
, x
);
263 nir_ssa_def
*x_3
= nir_fmul(b
, x_2
, x
);
264 nir_ssa_def
*x_5
= nir_fmul(b
, x_3
, x_2
);
265 nir_ssa_def
*x_7
= nir_fmul(b
, x_5
, x_2
);
266 nir_ssa_def
*x_9
= nir_fmul(b
, x_7
, x_2
);
267 nir_ssa_def
*x_11
= nir_fmul(b
, x_9
, x_2
);
269 nir_ssa_def
*polynomial_terms
[] = {
270 nir_fmul_imm(b
, x
, 0.9999793128310355f
),
271 nir_fmul_imm(b
, x_3
, -0.3326756418091246f
),
272 nir_fmul_imm(b
, x_5
, 0.1938924977115610f
),
273 nir_fmul_imm(b
, x_7
, -0.1173503194786851f
),
274 nir_fmul_imm(b
, x_9
, 0.0536813784310406f
),
275 nir_fmul_imm(b
, x_11
, -0.0121323213173444f
),
279 build_fsum(b
, polynomial_terms
, ARRAY_SIZE(polynomial_terms
));
281 /* range-reduction fixup */
282 tmp
= nir_fadd(b
, tmp
,
283 nir_fmul(b
, nir_b2f(b
, nir_flt(b
, one
, abs_y_over_x
), bit_size
),
284 nir_fadd_imm(b
, nir_fmul_imm(b
, tmp
, -2.0f
), M_PI_2f
)));
287 return nir_fmul(b
, tmp
, nir_fsign(b
, y_over_x
));
291 build_atan2(nir_builder
*b
, nir_ssa_def
*y
, nir_ssa_def
*x
)
293 assert(y
->bit_size
== x
->bit_size
);
294 const uint32_t bit_size
= x
->bit_size
;
296 nir_ssa_def
*zero
= nir_imm_floatN_t(b
, 0, bit_size
);
297 nir_ssa_def
*one
= nir_imm_floatN_t(b
, 1, bit_size
);
299 /* If we're on the left half-plane rotate the coordinates π/2 clock-wise
300 * for the y=0 discontinuity to end up aligned with the vertical
301 * discontinuity of atan(s/t) along t=0. This also makes sure that we
302 * don't attempt to divide by zero along the vertical line, which may give
303 * unspecified results on non-GLSL 4.1-capable hardware.
305 nir_ssa_def
*flip
= nir_fge(b
, zero
, x
);
306 nir_ssa_def
*s
= nir_bcsel(b
, flip
, nir_fabs(b
, x
), y
);
307 nir_ssa_def
*t
= nir_bcsel(b
, flip
, y
, nir_fabs(b
, x
));
309 /* If the magnitude of the denominator exceeds some huge value, scale down
310 * the arguments in order to prevent the reciprocal operation from flushing
311 * its result to zero, which would cause precision problems, and for s
312 * infinite would cause us to return a NaN instead of the correct finite
315 * If fmin and fmax are respectively the smallest and largest positive
316 * normalized floating point values representable by the implementation,
317 * the constants below should be in agreement with:
320 * scale <= 1 / fmin / fmax (for |t| >= huge)
322 * In addition scale should be a negative power of two in order to avoid
323 * loss of precision. The values chosen below should work for most usual
324 * floating point representations with at least the dynamic range of ATI's
325 * 24-bit representation.
327 const double huge_val
= bit_size
>= 32 ? 1e18
: 16384;
328 nir_ssa_def
*huge
= nir_imm_floatN_t(b
, huge_val
, bit_size
);
329 nir_ssa_def
*scale
= nir_bcsel(b
, nir_fge(b
, nir_fabs(b
, t
), huge
),
330 nir_imm_floatN_t(b
, 0.25, bit_size
), one
);
331 nir_ssa_def
*rcp_scaled_t
= nir_frcp(b
, nir_fmul(b
, t
, scale
));
332 nir_ssa_def
*s_over_t
= nir_fmul(b
, nir_fmul(b
, s
, scale
), rcp_scaled_t
);
334 /* For |x| = |y| assume tan = 1 even if infinite (i.e. pretend momentarily
335 * that ∞/∞ = 1) in order to comply with the rather artificial rules
336 * inherited from IEEE 754-2008, namely:
338 * "atan2(±∞, −∞) is ±3π/4
339 * atan2(±∞, +∞) is ±π/4"
341 * Note that this is inconsistent with the rules for the neighborhood of
342 * zero that are based on iterated limits:
344 * "atan2(±0, −0) is ±π
345 * atan2(±0, +0) is ±0"
347 * but GLSL specifically allows implementations to deviate from IEEE rules
348 * at (0,0), so we take that license (i.e. pretend that 0/0 = 1 here as
351 nir_ssa_def
*tan
= nir_bcsel(b
, nir_feq(b
, nir_fabs(b
, x
), nir_fabs(b
, y
)),
352 one
, nir_fabs(b
, s_over_t
));
354 /* Calculate the arctangent and fix up the result if we had flipped the
358 nir_fadd(b
, nir_fmul_imm(b
, nir_b2f(b
, flip
, bit_size
), M_PI_2f
),
361 /* Rather convoluted calculation of the sign of the result. When x < 0 we
362 * cannot use fsign because we need to be able to distinguish between
363 * negative and positive zero. We don't use bitwise arithmetic tricks for
364 * consistency with the GLSL front-end. When x >= 0 rcp_scaled_t will
365 * always be non-negative so this won't be able to distinguish between
366 * negative and positive zero, but we don't care because atan2 is
367 * continuous along the whole positive y = 0 half-line, so it won't affect
368 * the result significantly.
370 return nir_bcsel(b
, nir_flt(b
, nir_fmin(b
, y
, rcp_scaled_t
), zero
),
371 nir_fneg(b
, arc
), arc
);
375 build_frexp16(nir_builder
*b
, nir_ssa_def
*x
, nir_ssa_def
**exponent
)
377 assert(x
->bit_size
== 16);
379 nir_ssa_def
*abs_x
= nir_fabs(b
, x
);
380 nir_ssa_def
*zero
= nir_imm_floatN_t(b
, 0, 16);
382 /* Half-precision floating-point values are stored as
387 * An exponent shift of 10 will shift the mantissa out, leaving only the
388 * exponent and sign bit (which itself may be zero, if the absolute value
389 * was taken before the bitcast and shift).
391 nir_ssa_def
*exponent_shift
= nir_imm_int(b
, 10);
392 nir_ssa_def
*exponent_bias
= nir_imm_intN_t(b
, -14, 16);
394 nir_ssa_def
*sign_mantissa_mask
= nir_imm_intN_t(b
, 0x83ffu
, 16);
396 /* Exponent of floating-point values in the range [0.5, 1.0). */
397 nir_ssa_def
*exponent_value
= nir_imm_intN_t(b
, 0x3800u
, 16);
399 nir_ssa_def
*is_not_zero
= nir_fne(b
, abs_x
, zero
);
401 /* Significand return must be of the same type as the input, but the
402 * exponent must be a 32-bit integer.
406 nir_iadd(b
, nir_ushr(b
, abs_x
, exponent_shift
),
407 nir_bcsel(b
, is_not_zero
, exponent_bias
, zero
)));
409 return nir_ior(b
, nir_iand(b
, x
, sign_mantissa_mask
),
410 nir_bcsel(b
, is_not_zero
, exponent_value
, zero
));
414 build_frexp32(nir_builder
*b
, nir_ssa_def
*x
, nir_ssa_def
**exponent
)
416 nir_ssa_def
*abs_x
= nir_fabs(b
, x
);
417 nir_ssa_def
*zero
= nir_imm_float(b
, 0.0f
);
419 /* Single-precision floating-point values are stored as
424 * An exponent shift of 23 will shift the mantissa out, leaving only the
425 * exponent and sign bit (which itself may be zero, if the absolute value
426 * was taken before the bitcast and shift.
428 nir_ssa_def
*exponent_shift
= nir_imm_int(b
, 23);
429 nir_ssa_def
*exponent_bias
= nir_imm_int(b
, -126);
431 nir_ssa_def
*sign_mantissa_mask
= nir_imm_int(b
, 0x807fffffu
);
433 /* Exponent of floating-point values in the range [0.5, 1.0). */
434 nir_ssa_def
*exponent_value
= nir_imm_int(b
, 0x3f000000u
);
436 nir_ssa_def
*is_not_zero
= nir_fne(b
, abs_x
, zero
);
439 nir_iadd(b
, nir_ushr(b
, abs_x
, exponent_shift
),
440 nir_bcsel(b
, is_not_zero
, exponent_bias
, zero
));
442 return nir_ior(b
, nir_iand(b
, x
, sign_mantissa_mask
),
443 nir_bcsel(b
, is_not_zero
, exponent_value
, zero
));
447 build_frexp64(nir_builder
*b
, nir_ssa_def
*x
, nir_ssa_def
**exponent
)
449 nir_ssa_def
*abs_x
= nir_fabs(b
, x
);
450 nir_ssa_def
*zero
= nir_imm_double(b
, 0.0);
451 nir_ssa_def
*zero32
= nir_imm_float(b
, 0.0f
);
453 /* Double-precision floating-point values are stored as
458 * We only need to deal with the exponent so first we extract the upper 32
459 * bits using nir_unpack_64_2x32_split_y.
461 nir_ssa_def
*upper_x
= nir_unpack_64_2x32_split_y(b
, x
);
462 nir_ssa_def
*abs_upper_x
= nir_unpack_64_2x32_split_y(b
, abs_x
);
464 /* An exponent shift of 20 will shift the remaining mantissa bits out,
465 * leaving only the exponent and sign bit (which itself may be zero, if the
466 * absolute value was taken before the bitcast and shift.
468 nir_ssa_def
*exponent_shift
= nir_imm_int(b
, 20);
469 nir_ssa_def
*exponent_bias
= nir_imm_int(b
, -1022);
471 nir_ssa_def
*sign_mantissa_mask
= nir_imm_int(b
, 0x800fffffu
);
473 /* Exponent of floating-point values in the range [0.5, 1.0). */
474 nir_ssa_def
*exponent_value
= nir_imm_int(b
, 0x3fe00000u
);
476 nir_ssa_def
*is_not_zero
= nir_fne(b
, abs_x
, zero
);
479 nir_iadd(b
, nir_ushr(b
, abs_upper_x
, exponent_shift
),
480 nir_bcsel(b
, is_not_zero
, exponent_bias
, zero32
));
482 nir_ssa_def
*new_upper
=
483 nir_ior(b
, nir_iand(b
, upper_x
, sign_mantissa_mask
),
484 nir_bcsel(b
, is_not_zero
, exponent_value
, zero32
));
486 nir_ssa_def
*lower_x
= nir_unpack_64_2x32_split_x(b
, x
);
488 return nir_pack_64_2x32_split(b
, lower_x
, new_upper
);
492 vtn_nir_alu_op_for_spirv_glsl_opcode(struct vtn_builder
*b
,
493 enum GLSLstd450 opcode
)
496 case GLSLstd450Round
: return nir_op_fround_even
;
497 case GLSLstd450RoundEven
: return nir_op_fround_even
;
498 case GLSLstd450Trunc
: return nir_op_ftrunc
;
499 case GLSLstd450FAbs
: return nir_op_fabs
;
500 case GLSLstd450SAbs
: return nir_op_iabs
;
501 case GLSLstd450FSign
: return nir_op_fsign
;
502 case GLSLstd450SSign
: return nir_op_isign
;
503 case GLSLstd450Floor
: return nir_op_ffloor
;
504 case GLSLstd450Ceil
: return nir_op_fceil
;
505 case GLSLstd450Fract
: return nir_op_ffract
;
506 case GLSLstd450Sin
: return nir_op_fsin
;
507 case GLSLstd450Cos
: return nir_op_fcos
;
508 case GLSLstd450Pow
: return nir_op_fpow
;
509 case GLSLstd450Exp2
: return nir_op_fexp2
;
510 case GLSLstd450Log2
: return nir_op_flog2
;
511 case GLSLstd450Sqrt
: return nir_op_fsqrt
;
512 case GLSLstd450InverseSqrt
: return nir_op_frsq
;
513 case GLSLstd450NMin
: return nir_op_fmin
;
514 case GLSLstd450FMin
: return nir_op_fmin
;
515 case GLSLstd450UMin
: return nir_op_umin
;
516 case GLSLstd450SMin
: return nir_op_imin
;
517 case GLSLstd450NMax
: return nir_op_fmax
;
518 case GLSLstd450FMax
: return nir_op_fmax
;
519 case GLSLstd450UMax
: return nir_op_umax
;
520 case GLSLstd450SMax
: return nir_op_imax
;
521 case GLSLstd450FMix
: return nir_op_flrp
;
522 case GLSLstd450Fma
: return nir_op_ffma
;
523 case GLSLstd450Ldexp
: return nir_op_ldexp
;
524 case GLSLstd450FindILsb
: return nir_op_find_lsb
;
525 case GLSLstd450FindSMsb
: return nir_op_ifind_msb
;
526 case GLSLstd450FindUMsb
: return nir_op_ufind_msb
;
528 /* Packing/Unpacking functions */
529 case GLSLstd450PackSnorm4x8
: return nir_op_pack_snorm_4x8
;
530 case GLSLstd450PackUnorm4x8
: return nir_op_pack_unorm_4x8
;
531 case GLSLstd450PackSnorm2x16
: return nir_op_pack_snorm_2x16
;
532 case GLSLstd450PackUnorm2x16
: return nir_op_pack_unorm_2x16
;
533 case GLSLstd450PackHalf2x16
: return nir_op_pack_half_2x16
;
534 case GLSLstd450PackDouble2x32
: return nir_op_pack_64_2x32
;
535 case GLSLstd450UnpackSnorm4x8
: return nir_op_unpack_snorm_4x8
;
536 case GLSLstd450UnpackUnorm4x8
: return nir_op_unpack_unorm_4x8
;
537 case GLSLstd450UnpackSnorm2x16
: return nir_op_unpack_snorm_2x16
;
538 case GLSLstd450UnpackUnorm2x16
: return nir_op_unpack_unorm_2x16
;
539 case GLSLstd450UnpackHalf2x16
: return nir_op_unpack_half_2x16
;
540 case GLSLstd450UnpackDouble2x32
: return nir_op_unpack_64_2x32
;
543 vtn_fail("No NIR equivalent");
547 #define NIR_IMM_FP(n, v) (nir_imm_floatN_t(n, v, src[0]->bit_size))
550 handle_glsl450_alu(struct vtn_builder
*b
, enum GLSLstd450 entrypoint
,
551 const uint32_t *w
, unsigned count
)
553 struct nir_builder
*nb
= &b
->nb
;
554 const struct glsl_type
*dest_type
=
555 vtn_value(b
, w
[1], vtn_value_type_type
)->type
->type
;
557 struct vtn_value
*val
= vtn_push_value(b
, w
[2], vtn_value_type_ssa
);
558 val
->ssa
= vtn_create_ssa_value(b
, dest_type
);
560 /* Collect the various SSA sources */
561 unsigned num_inputs
= count
- 5;
562 nir_ssa_def
*src
[3] = { NULL
, };
563 for (unsigned i
= 0; i
< num_inputs
; i
++) {
564 /* These are handled specially below */
565 if (vtn_untyped_value(b
, w
[i
+ 5])->value_type
== vtn_value_type_pointer
)
568 src
[i
] = vtn_ssa_value(b
, w
[i
+ 5])->def
;
571 switch (entrypoint
) {
572 case GLSLstd450Radians
:
573 val
->ssa
->def
= nir_radians(nb
, src
[0]);
575 case GLSLstd450Degrees
:
576 val
->ssa
->def
= nir_degrees(nb
, src
[0]);
579 val
->ssa
->def
= nir_fdiv(nb
, nir_fsin(nb
, src
[0]),
580 nir_fcos(nb
, src
[0]));
583 case GLSLstd450Modf
: {
584 nir_ssa_def
*sign
= nir_fsign(nb
, src
[0]);
585 nir_ssa_def
*abs
= nir_fabs(nb
, src
[0]);
586 val
->ssa
->def
= nir_fmul(nb
, sign
, nir_ffract(nb
, abs
));
587 nir_store_deref(nb
, vtn_nir_deref(b
, w
[6]),
588 nir_fmul(nb
, sign
, nir_ffloor(nb
, abs
)), 0xf);
592 case GLSLstd450ModfStruct
: {
593 nir_ssa_def
*sign
= nir_fsign(nb
, src
[0]);
594 nir_ssa_def
*abs
= nir_fabs(nb
, src
[0]);
595 vtn_assert(glsl_type_is_struct(val
->ssa
->type
));
596 val
->ssa
->elems
[0]->def
= nir_fmul(nb
, sign
, nir_ffract(nb
, abs
));
597 val
->ssa
->elems
[1]->def
= nir_fmul(nb
, sign
, nir_ffloor(nb
, abs
));
602 val
->ssa
->def
= nir_sge(nb
, src
[1], src
[0]);
605 case GLSLstd450Length
:
606 val
->ssa
->def
= nir_fast_length(nb
, src
[0]);
608 case GLSLstd450Distance
:
609 val
->ssa
->def
= nir_fast_distance(nb
, src
[0], src
[1]);
611 case GLSLstd450Normalize
:
612 val
->ssa
->def
= nir_fast_normalize(nb
, src
[0]);
616 val
->ssa
->def
= build_exp(nb
, src
[0]);
620 val
->ssa
->def
= build_log(nb
, src
[0]);
623 case GLSLstd450FClamp
:
624 case GLSLstd450NClamp
:
625 val
->ssa
->def
= nir_fclamp(nb
, src
[0], src
[1], src
[2]);
627 case GLSLstd450UClamp
:
628 val
->ssa
->def
= nir_uclamp(nb
, src
[0], src
[1], src
[2]);
630 case GLSLstd450SClamp
:
631 val
->ssa
->def
= nir_iclamp(nb
, src
[0], src
[1], src
[2]);
634 case GLSLstd450Cross
: {
635 val
->ssa
->def
= nir_cross(nb
, src
[0], src
[1]);
639 case GLSLstd450SmoothStep
: {
640 val
->ssa
->def
= nir_smoothstep(nb
, src
[0], src
[1], src
[2]);
644 case GLSLstd450FaceForward
:
646 nir_bcsel(nb
, nir_flt(nb
, nir_fdot(nb
, src
[2], src
[1]),
647 NIR_IMM_FP(nb
, 0.0)),
648 src
[0], nir_fneg(nb
, src
[0]));
651 case GLSLstd450Reflect
:
652 /* I - 2 * dot(N, I) * N */
654 nir_fsub(nb
, src
[0], nir_fmul(nb
, NIR_IMM_FP(nb
, 2.0),
655 nir_fmul(nb
, nir_fdot(nb
, src
[0], src
[1]),
659 case GLSLstd450Refract
: {
660 nir_ssa_def
*I
= src
[0];
661 nir_ssa_def
*N
= src
[1];
662 nir_ssa_def
*eta
= src
[2];
663 nir_ssa_def
*n_dot_i
= nir_fdot(nb
, N
, I
);
664 nir_ssa_def
*one
= NIR_IMM_FP(nb
, 1.0);
665 nir_ssa_def
*zero
= NIR_IMM_FP(nb
, 0.0);
666 /* According to the SPIR-V and GLSL specs, eta is always a float
667 * regardless of the type of the other operands. However in practice it
668 * seems that if you try to pass it a float then glslang will just
669 * promote it to a double and generate invalid SPIR-V. In order to
670 * support a hypothetical fixed version of glslang we’ll promote eta to
671 * double if the other operands are double also.
673 if (I
->bit_size
!= eta
->bit_size
) {
674 nir_op conversion_op
=
675 nir_type_conversion_op(nir_type_float
| eta
->bit_size
,
676 nir_type_float
| I
->bit_size
,
677 nir_rounding_mode_undef
);
678 eta
= nir_build_alu(nb
, conversion_op
, eta
, NULL
, NULL
, NULL
);
680 /* k = 1.0 - eta * eta * (1.0 - dot(N, I) * dot(N, I)) */
682 nir_fsub(nb
, one
, nir_fmul(nb
, eta
, nir_fmul(nb
, eta
,
683 nir_fsub(nb
, one
, nir_fmul(nb
, n_dot_i
, n_dot_i
)))));
684 nir_ssa_def
*result
=
685 nir_fsub(nb
, nir_fmul(nb
, eta
, I
),
686 nir_fmul(nb
, nir_fadd(nb
, nir_fmul(nb
, eta
, n_dot_i
),
687 nir_fsqrt(nb
, k
)), N
));
688 /* XXX: bcsel, or if statement? */
689 val
->ssa
->def
= nir_bcsel(nb
, nir_flt(nb
, k
, zero
), zero
, result
);
694 /* 0.5 * (e^x - e^(-x)) */
696 nir_fmul_imm(nb
, nir_fsub(nb
, build_exp(nb
, src
[0]),
697 build_exp(nb
, nir_fneg(nb
, src
[0]))),
702 /* 0.5 * (e^x + e^(-x)) */
704 nir_fmul_imm(nb
, nir_fadd(nb
, build_exp(nb
, src
[0]),
705 build_exp(nb
, nir_fneg(nb
, src
[0]))),
709 case GLSLstd450Tanh
: {
710 /* tanh(x) := (0.5 * (e^x - e^(-x))) / (0.5 * (e^x + e^(-x)))
712 * With a little algebra this reduces to (e^2x - 1) / (e^2x + 1)
714 * We clamp x to (-inf, +10] to avoid precision problems. When x > 10,
715 * e^2x is so much larger than 1.0 that 1.0 gets flushed to zero in the
716 * computation e^2x +/- 1 so it can be ignored.
718 * For 16-bit precision we clamp x to (-inf, +4.2] since the maximum
719 * representable number is only 65,504 and e^(2*6) exceeds that. Also,
720 * if x > 4.2, tanh(x) will return 1.0 in fp16.
722 const uint32_t bit_size
= src
[0]->bit_size
;
723 const double clamped_x
= bit_size
> 16 ? 10.0 : 4.2;
724 nir_ssa_def
*x
= nir_fmin(nb
, src
[0],
725 nir_imm_floatN_t(nb
, clamped_x
, bit_size
));
726 nir_ssa_def
*exp2x
= build_exp(nb
, nir_fmul_imm(nb
, x
, 2.0));
727 val
->ssa
->def
= nir_fdiv(nb
, nir_fadd_imm(nb
, exp2x
, -1.0),
728 nir_fadd_imm(nb
, exp2x
, 1.0));
732 case GLSLstd450Asinh
:
733 val
->ssa
->def
= nir_fmul(nb
, nir_fsign(nb
, src
[0]),
734 build_log(nb
, nir_fadd(nb
, nir_fabs(nb
, src
[0]),
735 nir_fsqrt(nb
, nir_fadd_imm(nb
, nir_fmul(nb
, src
[0], src
[0]),
738 case GLSLstd450Acosh
:
739 val
->ssa
->def
= build_log(nb
, nir_fadd(nb
, src
[0],
740 nir_fsqrt(nb
, nir_fadd_imm(nb
, nir_fmul(nb
, src
[0], src
[0]),
743 case GLSLstd450Atanh
: {
744 nir_ssa_def
*one
= nir_imm_floatN_t(nb
, 1.0, src
[0]->bit_size
);
746 nir_fmul_imm(nb
, build_log(nb
, nir_fdiv(nb
, nir_fadd(nb
, src
[0], one
),
747 nir_fsub(nb
, one
, src
[0]))),
753 val
->ssa
->def
= build_asin(nb
, src
[0], 0.086566724, -0.03102955);
758 nir_fsub(nb
, nir_imm_floatN_t(nb
, M_PI_2f
, src
[0]->bit_size
),
759 build_asin(nb
, src
[0], 0.08132463, -0.02363318));
763 val
->ssa
->def
= build_atan(nb
, src
[0]);
766 case GLSLstd450Atan2
:
767 val
->ssa
->def
= build_atan2(nb
, src
[0], src
[1]);
770 case GLSLstd450Frexp
: {
771 nir_ssa_def
*exponent
;
772 if (src
[0]->bit_size
== 64)
773 val
->ssa
->def
= build_frexp64(nb
, src
[0], &exponent
);
774 else if (src
[0]->bit_size
== 32)
775 val
->ssa
->def
= build_frexp32(nb
, src
[0], &exponent
);
777 val
->ssa
->def
= build_frexp16(nb
, src
[0], &exponent
);
778 nir_store_deref(nb
, vtn_nir_deref(b
, w
[6]), exponent
, 0xf);
782 case GLSLstd450FrexpStruct
: {
783 vtn_assert(glsl_type_is_struct(val
->ssa
->type
));
784 if (src
[0]->bit_size
== 64)
785 val
->ssa
->elems
[0]->def
= build_frexp64(nb
, src
[0],
786 &val
->ssa
->elems
[1]->def
);
787 else if (src
[0]->bit_size
== 32)
788 val
->ssa
->elems
[0]->def
= build_frexp32(nb
, src
[0],
789 &val
->ssa
->elems
[1]->def
);
791 val
->ssa
->elems
[0]->def
= build_frexp16(nb
, src
[0],
792 &val
->ssa
->elems
[1]->def
);
798 nir_build_alu(&b
->nb
,
799 vtn_nir_alu_op_for_spirv_glsl_opcode(b
, entrypoint
),
800 src
[0], src
[1], src
[2], NULL
);
806 handle_glsl450_interpolation(struct vtn_builder
*b
, enum GLSLstd450 opcode
,
807 const uint32_t *w
, unsigned count
)
809 const struct glsl_type
*dest_type
=
810 vtn_value(b
, w
[1], vtn_value_type_type
)->type
->type
;
812 struct vtn_value
*val
= vtn_push_value(b
, w
[2], vtn_value_type_ssa
);
813 val
->ssa
= vtn_create_ssa_value(b
, dest_type
);
817 case GLSLstd450InterpolateAtCentroid
:
818 op
= nir_intrinsic_interp_deref_at_centroid
;
820 case GLSLstd450InterpolateAtSample
:
821 op
= nir_intrinsic_interp_deref_at_sample
;
823 case GLSLstd450InterpolateAtOffset
:
824 op
= nir_intrinsic_interp_deref_at_offset
;
827 vtn_fail("Invalid opcode");
830 nir_intrinsic_instr
*intrin
= nir_intrinsic_instr_create(b
->nb
.shader
, op
);
832 struct vtn_pointer
*ptr
=
833 vtn_value(b
, w
[5], vtn_value_type_pointer
)->pointer
;
834 nir_deref_instr
*deref
= vtn_pointer_to_deref(b
, ptr
);
836 /* If the value we are interpolating has an index into a vector then
837 * interpolate the vector and index the result of that instead. This is
838 * necessary because the index will get generated as a series of nir_bcsel
839 * instructions so it would no longer be an input variable.
841 const bool vec_array_deref
= deref
->deref_type
== nir_deref_type_array
&&
842 glsl_type_is_vector(nir_deref_instr_parent(deref
)->type
);
844 nir_deref_instr
*vec_deref
= NULL
;
845 if (vec_array_deref
) {
847 deref
= nir_deref_instr_parent(deref
);
849 intrin
->src
[0] = nir_src_for_ssa(&deref
->dest
.ssa
);
852 case GLSLstd450InterpolateAtCentroid
:
854 case GLSLstd450InterpolateAtSample
:
855 case GLSLstd450InterpolateAtOffset
:
856 intrin
->src
[1] = nir_src_for_ssa(vtn_ssa_value(b
, w
[6])->def
);
859 vtn_fail("Invalid opcode");
862 intrin
->num_components
= glsl_get_vector_elements(deref
->type
);
863 nir_ssa_dest_init(&intrin
->instr
, &intrin
->dest
,
864 glsl_get_vector_elements(deref
->type
),
865 glsl_get_bit_size(deref
->type
), NULL
);
867 nir_builder_instr_insert(&b
->nb
, &intrin
->instr
);
869 if (vec_array_deref
) {
871 nir_const_value
*const_index
= nir_src_as_const_value(vec_deref
->arr
.index
);
873 val
->ssa
->def
= vtn_vector_extract(b
, &intrin
->dest
.ssa
,
874 const_index
->u32
[0]);
876 val
->ssa
->def
= vtn_vector_extract_dynamic(b
, &intrin
->dest
.ssa
,
877 vec_deref
->arr
.index
.ssa
);
880 val
->ssa
->def
= &intrin
->dest
.ssa
;
885 vtn_handle_glsl450_instruction(struct vtn_builder
*b
, SpvOp ext_opcode
,
886 const uint32_t *w
, unsigned count
)
888 switch ((enum GLSLstd450
)ext_opcode
) {
889 case GLSLstd450Determinant
: {
890 struct vtn_value
*val
= vtn_push_value(b
, w
[2], vtn_value_type_ssa
);
891 val
->ssa
= rzalloc(b
, struct vtn_ssa_value
);
892 val
->ssa
->type
= vtn_value(b
, w
[1], vtn_value_type_type
)->type
->type
;
893 val
->ssa
->def
= build_mat_det(b
, vtn_ssa_value(b
, w
[5]));
897 case GLSLstd450MatrixInverse
: {
898 struct vtn_value
*val
= vtn_push_value(b
, w
[2], vtn_value_type_ssa
);
899 val
->ssa
= matrix_inverse(b
, vtn_ssa_value(b
, w
[5]));
903 case GLSLstd450InterpolateAtCentroid
:
904 case GLSLstd450InterpolateAtSample
:
905 case GLSLstd450InterpolateAtOffset
:
906 handle_glsl450_interpolation(b
, ext_opcode
, w
, count
);
910 handle_glsl450_alu(b
, (enum GLSLstd450
)ext_opcode
, w
, count
);