/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Jason Ekstrand (jason@jlekstrand.net)
 */
29 #include "vtn_private.h"
30 #include "GLSL.std.450.h"
32 #define M_PIf ((float) M_PI)
33 #define M_PI_2f ((float) M_PI_2)
34 #define M_PI_4f ((float) M_PI_4)
37 build_mat2_det(nir_builder
*b
, nir_ssa_def
*col
[2])
39 unsigned swiz
[4] = {1, 0, 0, 0};
40 nir_ssa_def
*p
= nir_fmul(b
, col
[0], nir_swizzle(b
, col
[1], swiz
, 2, true));
41 return nir_fsub(b
, nir_channel(b
, p
, 0), nir_channel(b
, p
, 1));
45 build_mat3_det(nir_builder
*b
, nir_ssa_def
*col
[3])
47 unsigned yzx
[4] = {1, 2, 0, 0};
48 unsigned zxy
[4] = {2, 0, 1, 0};
52 nir_fmul(b
, nir_swizzle(b
, col
[1], yzx
, 3, true),
53 nir_swizzle(b
, col
[2], zxy
, 3, true)));
56 nir_fmul(b
, nir_swizzle(b
, col
[1], zxy
, 3, true),
57 nir_swizzle(b
, col
[2], yzx
, 3, true)));
59 nir_ssa_def
*diff
= nir_fsub(b
, prod0
, prod1
);
61 return nir_fadd(b
, nir_channel(b
, diff
, 0),
62 nir_fadd(b
, nir_channel(b
, diff
, 1),
63 nir_channel(b
, diff
, 2)));
67 build_mat4_det(nir_builder
*b
, nir_ssa_def
**col
)
69 nir_ssa_def
*subdet
[4];
70 for (unsigned i
= 0; i
< 4; i
++) {
72 for (unsigned j
= 0; j
< 3; j
++)
73 swiz
[j
] = j
+ (j
>= i
);
75 nir_ssa_def
*subcol
[3];
76 subcol
[0] = nir_swizzle(b
, col
[1], swiz
, 3, true);
77 subcol
[1] = nir_swizzle(b
, col
[2], swiz
, 3, true);
78 subcol
[2] = nir_swizzle(b
, col
[3], swiz
, 3, true);
80 subdet
[i
] = build_mat3_det(b
, subcol
);
83 nir_ssa_def
*prod
= nir_fmul(b
, col
[0], nir_vec(b
, subdet
, 4));
85 return nir_fadd(b
, nir_fsub(b
, nir_channel(b
, prod
, 0),
86 nir_channel(b
, prod
, 1)),
87 nir_fsub(b
, nir_channel(b
, prod
, 2),
88 nir_channel(b
, prod
, 3)));
92 build_mat_det(struct vtn_builder
*b
, struct vtn_ssa_value
*src
)
94 unsigned size
= glsl_get_vector_elements(src
->type
);
97 for (unsigned i
= 0; i
< size
; i
++)
98 cols
[i
] = src
->elems
[i
]->def
;
101 case 2: return build_mat2_det(&b
->nb
, cols
);
102 case 3: return build_mat3_det(&b
->nb
, cols
);
103 case 4: return build_mat4_det(&b
->nb
, cols
);
105 vtn_fail("Invalid matrix size");
109 /* Computes the determinate of the submatrix given by taking src and
110 * removing the specified row and column.
113 build_mat_subdet(struct nir_builder
*b
, struct vtn_ssa_value
*src
,
114 unsigned size
, unsigned row
, unsigned col
)
116 assert(row
< size
&& col
< size
);
118 return nir_channel(b
, src
->elems
[1 - col
]->def
, 1 - row
);
120 /* Swizzle to get all but the specified row */
122 for (unsigned j
= 0; j
< 3; j
++)
123 swiz
[j
] = j
+ (j
>= row
);
125 /* Grab all but the specified column */
126 nir_ssa_def
*subcol
[3];
127 for (unsigned j
= 0; j
< size
; j
++) {
129 subcol
[j
- (j
> col
)] = nir_swizzle(b
, src
->elems
[j
]->def
,
130 swiz
, size
- 1, true);
135 return build_mat2_det(b
, subcol
);
138 return build_mat3_det(b
, subcol
);
143 static struct vtn_ssa_value
*
144 matrix_inverse(struct vtn_builder
*b
, struct vtn_ssa_value
*src
)
146 nir_ssa_def
*adj_col
[4];
147 unsigned size
= glsl_get_vector_elements(src
->type
);
149 /* Build up an adjugate matrix */
150 for (unsigned c
= 0; c
< size
; c
++) {
151 nir_ssa_def
*elem
[4];
152 for (unsigned r
= 0; r
< size
; r
++) {
153 elem
[r
] = build_mat_subdet(&b
->nb
, src
, size
, c
, r
);
156 elem
[r
] = nir_fneg(&b
->nb
, elem
[r
]);
159 adj_col
[c
] = nir_vec(&b
->nb
, elem
, size
);
162 nir_ssa_def
*det_inv
= nir_frcp(&b
->nb
, build_mat_det(b
, src
));
164 struct vtn_ssa_value
*val
= vtn_create_ssa_value(b
, src
->type
);
165 for (unsigned i
= 0; i
< size
; i
++)
166 val
->elems
[i
]->def
= nir_fmul(&b
->nb
, adj_col
[i
], det_inv
);
172 build_length(nir_builder
*b
, nir_ssa_def
*vec
)
174 switch (vec
->num_components
) {
175 case 1: return nir_fsqrt(b
, nir_fmul(b
, vec
, vec
));
176 case 2: return nir_fsqrt(b
, nir_fdot2(b
, vec
, vec
));
177 case 3: return nir_fsqrt(b
, nir_fdot3(b
, vec
, vec
));
178 case 4: return nir_fsqrt(b
, nir_fdot4(b
, vec
, vec
));
180 unreachable("Invalid number of components");
184 static inline nir_ssa_def
*
185 build_fclamp(nir_builder
*b
,
186 nir_ssa_def
*x
, nir_ssa_def
*min_val
, nir_ssa_def
*max_val
)
188 return nir_fmin(b
, nir_fmax(b
, x
, min_val
), max_val
);
195 build_exp(nir_builder
*b
, nir_ssa_def
*x
)
197 return nir_fexp2(b
, nir_fmul(b
, x
, nir_imm_float(b
, M_LOG2E
)));
201 * Return ln(x) - the natural logarithm of x.
204 build_log(nir_builder
*b
, nir_ssa_def
*x
)
206 return nir_fmul(b
, nir_flog2(b
, x
), nir_imm_float(b
, 1.0 / M_LOG2E
));
210 * Approximate asin(x) by the formula:
211 * asin~(x) = sign(x) * (pi/2 - sqrt(1 - |x|) * (pi/2 + |x|(pi/4 - 1 + |x|(p0 + |x|p1))))
213 * which is correct to first order at x=0 and x=±1 regardless of the p
214 * coefficients but can be made second-order correct at both ends by selecting
215 * the fit coefficients appropriately. Different p coefficients can be used
216 * in the asin and acos implementation to minimize some relative error metric
220 build_asin(nir_builder
*b
, nir_ssa_def
*x
, float p0
, float p1
)
222 nir_ssa_def
*abs_x
= nir_fabs(b
, x
);
223 return nir_fmul(b
, nir_fsign(b
, x
),
224 nir_fsub(b
, nir_imm_float(b
, M_PI_2f
),
225 nir_fmul(b
, nir_fsqrt(b
, nir_fsub(b
, nir_imm_float(b
, 1.0f
), abs_x
)),
226 nir_fadd(b
, nir_imm_float(b
, M_PI_2f
),
228 nir_fadd(b
, nir_imm_float(b
, M_PI_4f
- 1.0f
),
230 nir_fadd(b
, nir_imm_float(b
, p0
),
232 nir_imm_float(b
, p1
))))))))));
236 * Compute xs[0] + xs[1] + xs[2] + ... using fadd.
239 build_fsum(nir_builder
*b
, nir_ssa_def
**xs
, int terms
)
241 nir_ssa_def
*accum
= xs
[0];
243 for (int i
= 1; i
< terms
; i
++)
244 accum
= nir_fadd(b
, accum
, xs
[i
]);
250 build_atan(nir_builder
*b
, nir_ssa_def
*y_over_x
)
252 nir_ssa_def
*abs_y_over_x
= nir_fabs(b
, y_over_x
);
253 nir_ssa_def
*one
= nir_imm_float(b
, 1.0f
);
256 * range-reduction, first step:
258 * / y_over_x if |y_over_x| <= 1.0;
260 * \ 1.0 / y_over_x otherwise
262 nir_ssa_def
*x
= nir_fdiv(b
, nir_fmin(b
, abs_y_over_x
, one
),
263 nir_fmax(b
, abs_y_over_x
, one
));
266 * approximate atan by evaluating polynomial:
268 * x * 0.9999793128310355 - x^3 * 0.3326756418091246 +
269 * x^5 * 0.1938924977115610 - x^7 * 0.1173503194786851 +
270 * x^9 * 0.0536813784310406 - x^11 * 0.0121323213173444
272 nir_ssa_def
*x_2
= nir_fmul(b
, x
, x
);
273 nir_ssa_def
*x_3
= nir_fmul(b
, x_2
, x
);
274 nir_ssa_def
*x_5
= nir_fmul(b
, x_3
, x_2
);
275 nir_ssa_def
*x_7
= nir_fmul(b
, x_5
, x_2
);
276 nir_ssa_def
*x_9
= nir_fmul(b
, x_7
, x_2
);
277 nir_ssa_def
*x_11
= nir_fmul(b
, x_9
, x_2
);
279 nir_ssa_def
*polynomial_terms
[] = {
280 nir_fmul(b
, x
, nir_imm_float(b
, 0.9999793128310355f
)),
281 nir_fmul(b
, x_3
, nir_imm_float(b
, -0.3326756418091246f
)),
282 nir_fmul(b
, x_5
, nir_imm_float(b
, 0.1938924977115610f
)),
283 nir_fmul(b
, x_7
, nir_imm_float(b
, -0.1173503194786851f
)),
284 nir_fmul(b
, x_9
, nir_imm_float(b
, 0.0536813784310406f
)),
285 nir_fmul(b
, x_11
, nir_imm_float(b
, -0.0121323213173444f
)),
289 build_fsum(b
, polynomial_terms
, ARRAY_SIZE(polynomial_terms
));
291 /* range-reduction fixup */
292 tmp
= nir_fadd(b
, tmp
,
294 nir_b2f(b
, nir_flt(b
, one
, abs_y_over_x
)),
295 nir_fadd(b
, nir_fmul(b
, tmp
,
296 nir_imm_float(b
, -2.0f
)),
297 nir_imm_float(b
, M_PI_2f
))));
300 return nir_fmul(b
, tmp
, nir_fsign(b
, y_over_x
));
304 build_atan2(nir_builder
*b
, nir_ssa_def
*y
, nir_ssa_def
*x
)
306 nir_ssa_def
*zero
= nir_imm_float(b
, 0);
307 nir_ssa_def
*one
= nir_imm_float(b
, 1);
309 /* If we're on the left half-plane rotate the coordinates π/2 clock-wise
310 * for the y=0 discontinuity to end up aligned with the vertical
311 * discontinuity of atan(s/t) along t=0. This also makes sure that we
312 * don't attempt to divide by zero along the vertical line, which may give
313 * unspecified results on non-GLSL 4.1-capable hardware.
315 nir_ssa_def
*flip
= nir_fge(b
, zero
, x
);
316 nir_ssa_def
*s
= nir_bcsel(b
, flip
, nir_fabs(b
, x
), y
);
317 nir_ssa_def
*t
= nir_bcsel(b
, flip
, y
, nir_fabs(b
, x
));
319 /* If the magnitude of the denominator exceeds some huge value, scale down
320 * the arguments in order to prevent the reciprocal operation from flushing
321 * its result to zero, which would cause precision problems, and for s
322 * infinite would cause us to return a NaN instead of the correct finite
325 * If fmin and fmax are respectively the smallest and largest positive
326 * normalized floating point values representable by the implementation,
327 * the constants below should be in agreement with:
330 * scale <= 1 / fmin / fmax (for |t| >= huge)
332 * In addition scale should be a negative power of two in order to avoid
333 * loss of precision. The values chosen below should work for most usual
334 * floating point representations with at least the dynamic range of ATI's
335 * 24-bit representation.
337 nir_ssa_def
*huge
= nir_imm_float(b
, 1e18f
);
338 nir_ssa_def
*scale
= nir_bcsel(b
, nir_fge(b
, nir_fabs(b
, t
), huge
),
339 nir_imm_float(b
, 0.25), one
);
340 nir_ssa_def
*rcp_scaled_t
= nir_frcp(b
, nir_fmul(b
, t
, scale
));
341 nir_ssa_def
*s_over_t
= nir_fmul(b
, nir_fmul(b
, s
, scale
), rcp_scaled_t
);
343 /* For |x| = |y| assume tan = 1 even if infinite (i.e. pretend momentarily
344 * that ∞/∞ = 1) in order to comply with the rather artificial rules
345 * inherited from IEEE 754-2008, namely:
347 * "atan2(±∞, −∞) is ±3π/4
348 * atan2(±∞, +∞) is ±π/4"
350 * Note that this is inconsistent with the rules for the neighborhood of
351 * zero that are based on iterated limits:
353 * "atan2(±0, −0) is ±π
354 * atan2(±0, +0) is ±0"
356 * but GLSL specifically allows implementations to deviate from IEEE rules
357 * at (0,0), so we take that license (i.e. pretend that 0/0 = 1 here as
360 nir_ssa_def
*tan
= nir_bcsel(b
, nir_feq(b
, nir_fabs(b
, x
), nir_fabs(b
, y
)),
361 one
, nir_fabs(b
, s_over_t
));
363 /* Calculate the arctangent and fix up the result if we had flipped the
366 nir_ssa_def
*arc
= nir_fadd(b
, nir_fmul(b
, nir_b2f(b
, flip
),
367 nir_imm_float(b
, M_PI_2f
)),
370 /* Rather convoluted calculation of the sign of the result. When x < 0 we
371 * cannot use fsign because we need to be able to distinguish between
372 * negative and positive zero. We don't use bitwise arithmetic tricks for
373 * consistency with the GLSL front-end. When x >= 0 rcp_scaled_t will
374 * always be non-negative so this won't be able to distinguish between
375 * negative and positive zero, but we don't care because atan2 is
376 * continuous along the whole positive y = 0 half-line, so it won't affect
377 * the result significantly.
379 return nir_bcsel(b
, nir_flt(b
, nir_fmin(b
, y
, rcp_scaled_t
), zero
),
380 nir_fneg(b
, arc
), arc
);
384 build_frexp32(nir_builder
*b
, nir_ssa_def
*x
, nir_ssa_def
**exponent
)
386 nir_ssa_def
*abs_x
= nir_fabs(b
, x
);
387 nir_ssa_def
*zero
= nir_imm_float(b
, 0.0f
);
389 /* Single-precision floating-point values are stored as
394 * An exponent shift of 23 will shift the mantissa out, leaving only the
395 * exponent and sign bit (which itself may be zero, if the absolute value
396 * was taken before the bitcast and shift.
398 nir_ssa_def
*exponent_shift
= nir_imm_int(b
, 23);
399 nir_ssa_def
*exponent_bias
= nir_imm_int(b
, -126);
401 nir_ssa_def
*sign_mantissa_mask
= nir_imm_int(b
, 0x807fffffu
);
403 /* Exponent of floating-point values in the range [0.5, 1.0). */
404 nir_ssa_def
*exponent_value
= nir_imm_int(b
, 0x3f000000u
);
406 nir_ssa_def
*is_not_zero
= nir_fne(b
, abs_x
, zero
);
409 nir_iadd(b
, nir_ushr(b
, abs_x
, exponent_shift
),
410 nir_bcsel(b
, is_not_zero
, exponent_bias
, zero
));
412 return nir_ior(b
, nir_iand(b
, x
, sign_mantissa_mask
),
413 nir_bcsel(b
, is_not_zero
, exponent_value
, zero
));
417 build_frexp64(nir_builder
*b
, nir_ssa_def
*x
, nir_ssa_def
**exponent
)
419 nir_ssa_def
*abs_x
= nir_fabs(b
, x
);
420 nir_ssa_def
*zero
= nir_imm_double(b
, 0.0);
421 nir_ssa_def
*zero32
= nir_imm_float(b
, 0.0f
);
423 /* Double-precision floating-point values are stored as
428 * We only need to deal with the exponent so first we extract the upper 32
429 * bits using nir_unpack_64_2x32_split_y.
431 nir_ssa_def
*upper_x
= nir_unpack_64_2x32_split_y(b
, x
);
432 nir_ssa_def
*abs_upper_x
= nir_unpack_64_2x32_split_y(b
, abs_x
);
434 /* An exponent shift of 20 will shift the remaining mantissa bits out,
435 * leaving only the exponent and sign bit (which itself may be zero, if the
436 * absolute value was taken before the bitcast and shift.
438 nir_ssa_def
*exponent_shift
= nir_imm_int(b
, 20);
439 nir_ssa_def
*exponent_bias
= nir_imm_int(b
, -1022);
441 nir_ssa_def
*sign_mantissa_mask
= nir_imm_int(b
, 0x800fffffu
);
443 /* Exponent of floating-point values in the range [0.5, 1.0). */
444 nir_ssa_def
*exponent_value
= nir_imm_int(b
, 0x3fe00000u
);
446 nir_ssa_def
*is_not_zero
= nir_fne(b
, abs_x
, zero
);
449 nir_iadd(b
, nir_ushr(b
, abs_upper_x
, exponent_shift
),
450 nir_bcsel(b
, is_not_zero
, exponent_bias
, zero32
));
452 nir_ssa_def
*new_upper
=
453 nir_ior(b
, nir_iand(b
, upper_x
, sign_mantissa_mask
),
454 nir_bcsel(b
, is_not_zero
, exponent_value
, zero32
));
456 nir_ssa_def
*lower_x
= nir_unpack_64_2x32_split_x(b
, x
);
458 return nir_pack_64_2x32_split(b
, lower_x
, new_upper
);
462 vtn_nir_alu_op_for_spirv_glsl_opcode(struct vtn_builder
*b
,
463 enum GLSLstd450 opcode
)
466 case GLSLstd450Round
: return nir_op_fround_even
;
467 case GLSLstd450RoundEven
: return nir_op_fround_even
;
468 case GLSLstd450Trunc
: return nir_op_ftrunc
;
469 case GLSLstd450FAbs
: return nir_op_fabs
;
470 case GLSLstd450SAbs
: return nir_op_iabs
;
471 case GLSLstd450FSign
: return nir_op_fsign
;
472 case GLSLstd450SSign
: return nir_op_isign
;
473 case GLSLstd450Floor
: return nir_op_ffloor
;
474 case GLSLstd450Ceil
: return nir_op_fceil
;
475 case GLSLstd450Fract
: return nir_op_ffract
;
476 case GLSLstd450Sin
: return nir_op_fsin
;
477 case GLSLstd450Cos
: return nir_op_fcos
;
478 case GLSLstd450Pow
: return nir_op_fpow
;
479 case GLSLstd450Exp2
: return nir_op_fexp2
;
480 case GLSLstd450Log2
: return nir_op_flog2
;
481 case GLSLstd450Sqrt
: return nir_op_fsqrt
;
482 case GLSLstd450InverseSqrt
: return nir_op_frsq
;
483 case GLSLstd450NMin
: return nir_op_fmin
;
484 case GLSLstd450FMin
: return nir_op_fmin
;
485 case GLSLstd450UMin
: return nir_op_umin
;
486 case GLSLstd450SMin
: return nir_op_imin
;
487 case GLSLstd450NMax
: return nir_op_fmax
;
488 case GLSLstd450FMax
: return nir_op_fmax
;
489 case GLSLstd450UMax
: return nir_op_umax
;
490 case GLSLstd450SMax
: return nir_op_imax
;
491 case GLSLstd450FMix
: return nir_op_flrp
;
492 case GLSLstd450Fma
: return nir_op_ffma
;
493 case GLSLstd450Ldexp
: return nir_op_ldexp
;
494 case GLSLstd450FindILsb
: return nir_op_find_lsb
;
495 case GLSLstd450FindSMsb
: return nir_op_ifind_msb
;
496 case GLSLstd450FindUMsb
: return nir_op_ufind_msb
;
498 /* Packing/Unpacking functions */
499 case GLSLstd450PackSnorm4x8
: return nir_op_pack_snorm_4x8
;
500 case GLSLstd450PackUnorm4x8
: return nir_op_pack_unorm_4x8
;
501 case GLSLstd450PackSnorm2x16
: return nir_op_pack_snorm_2x16
;
502 case GLSLstd450PackUnorm2x16
: return nir_op_pack_unorm_2x16
;
503 case GLSLstd450PackHalf2x16
: return nir_op_pack_half_2x16
;
504 case GLSLstd450PackDouble2x32
: return nir_op_pack_64_2x32
;
505 case GLSLstd450UnpackSnorm4x8
: return nir_op_unpack_snorm_4x8
;
506 case GLSLstd450UnpackUnorm4x8
: return nir_op_unpack_unorm_4x8
;
507 case GLSLstd450UnpackSnorm2x16
: return nir_op_unpack_snorm_2x16
;
508 case GLSLstd450UnpackUnorm2x16
: return nir_op_unpack_unorm_2x16
;
509 case GLSLstd450UnpackHalf2x16
: return nir_op_unpack_half_2x16
;
510 case GLSLstd450UnpackDouble2x32
: return nir_op_unpack_64_2x32
;
513 vtn_fail("No NIR equivalent");
517 #define NIR_IMM_FP(n, v) (src[0]->bit_size == 64 ? nir_imm_double(n, v) : nir_imm_float(n, v))
520 handle_glsl450_alu(struct vtn_builder
*b
, enum GLSLstd450 entrypoint
,
521 const uint32_t *w
, unsigned count
)
523 struct nir_builder
*nb
= &b
->nb
;
524 const struct glsl_type
*dest_type
=
525 vtn_value(b
, w
[1], vtn_value_type_type
)->type
->type
;
527 struct vtn_value
*val
= vtn_push_value(b
, w
[2], vtn_value_type_ssa
);
528 val
->ssa
= vtn_create_ssa_value(b
, dest_type
);
530 /* Collect the various SSA sources */
531 unsigned num_inputs
= count
- 5;
532 nir_ssa_def
*src
[3] = { NULL
, };
533 for (unsigned i
= 0; i
< num_inputs
; i
++) {
534 /* These are handled specially below */
535 if (vtn_untyped_value(b
, w
[i
+ 5])->value_type
== vtn_value_type_pointer
)
538 src
[i
] = vtn_ssa_value(b
, w
[i
+ 5])->def
;
541 switch (entrypoint
) {
542 case GLSLstd450Radians
:
543 val
->ssa
->def
= nir_fmul(nb
, src
[0], nir_imm_float(nb
, 0.01745329251));
545 case GLSLstd450Degrees
:
546 val
->ssa
->def
= nir_fmul(nb
, src
[0], nir_imm_float(nb
, 57.2957795131));
549 val
->ssa
->def
= nir_fdiv(nb
, nir_fsin(nb
, src
[0]),
550 nir_fcos(nb
, src
[0]));
553 case GLSLstd450Modf
: {
554 nir_ssa_def
*sign
= nir_fsign(nb
, src
[0]);
555 nir_ssa_def
*abs
= nir_fabs(nb
, src
[0]);
556 val
->ssa
->def
= nir_fmul(nb
, sign
, nir_ffract(nb
, abs
));
557 nir_store_deref_var(nb
, vtn_nir_deref(b
, w
[6]),
558 nir_fmul(nb
, sign
, nir_ffloor(nb
, abs
)), 0xf);
562 case GLSLstd450ModfStruct
: {
563 nir_ssa_def
*sign
= nir_fsign(nb
, src
[0]);
564 nir_ssa_def
*abs
= nir_fabs(nb
, src
[0]);
565 vtn_assert(glsl_type_is_struct(val
->ssa
->type
));
566 val
->ssa
->elems
[0]->def
= nir_fmul(nb
, sign
, nir_ffract(nb
, abs
));
567 val
->ssa
->elems
[1]->def
= nir_fmul(nb
, sign
, nir_ffloor(nb
, abs
));
572 val
->ssa
->def
= nir_sge(nb
, src
[1], src
[0]);
575 case GLSLstd450Length
:
576 val
->ssa
->def
= build_length(nb
, src
[0]);
578 case GLSLstd450Distance
:
579 val
->ssa
->def
= build_length(nb
, nir_fsub(nb
, src
[0], src
[1]));
581 case GLSLstd450Normalize
:
582 val
->ssa
->def
= nir_fdiv(nb
, src
[0], build_length(nb
, src
[0]));
586 val
->ssa
->def
= build_exp(nb
, src
[0]);
590 val
->ssa
->def
= build_log(nb
, src
[0]);
593 case GLSLstd450FClamp
:
594 case GLSLstd450NClamp
:
595 val
->ssa
->def
= build_fclamp(nb
, src
[0], src
[1], src
[2]);
597 case GLSLstd450UClamp
:
598 val
->ssa
->def
= nir_umin(nb
, nir_umax(nb
, src
[0], src
[1]), src
[2]);
600 case GLSLstd450SClamp
:
601 val
->ssa
->def
= nir_imin(nb
, nir_imax(nb
, src
[0], src
[1]), src
[2]);
604 case GLSLstd450Cross
: {
605 unsigned yzx
[4] = { 1, 2, 0, 0 };
606 unsigned zxy
[4] = { 2, 0, 1, 0 };
608 nir_fsub(nb
, nir_fmul(nb
, nir_swizzle(nb
, src
[0], yzx
, 3, true),
609 nir_swizzle(nb
, src
[1], zxy
, 3, true)),
610 nir_fmul(nb
, nir_swizzle(nb
, src
[0], zxy
, 3, true),
611 nir_swizzle(nb
, src
[1], yzx
, 3, true)));
615 case GLSLstd450SmoothStep
: {
616 /* t = clamp((x - edge0) / (edge1 - edge0), 0, 1) */
618 build_fclamp(nb
, nir_fdiv(nb
, nir_fsub(nb
, src
[2], src
[0]),
619 nir_fsub(nb
, src
[1], src
[0])),
620 NIR_IMM_FP(nb
, 0.0), NIR_IMM_FP(nb
, 1.0));
621 /* result = t * t * (3 - 2 * t) */
623 nir_fmul(nb
, t
, nir_fmul(nb
, t
,
624 nir_fsub(nb
, NIR_IMM_FP(nb
, 3.0),
625 nir_fmul(nb
, NIR_IMM_FP(nb
, 2.0), t
))));
629 case GLSLstd450FaceForward
:
631 nir_bcsel(nb
, nir_flt(nb
, nir_fdot(nb
, src
[2], src
[1]),
632 nir_imm_float(nb
, 0.0)),
633 src
[0], nir_fneg(nb
, src
[0]));
636 case GLSLstd450Reflect
:
637 /* I - 2 * dot(N, I) * N */
639 nir_fsub(nb
, src
[0], nir_fmul(nb
, nir_imm_float(nb
, 2.0),
640 nir_fmul(nb
, nir_fdot(nb
, src
[0], src
[1]),
644 case GLSLstd450Refract
: {
645 nir_ssa_def
*I
= src
[0];
646 nir_ssa_def
*N
= src
[1];
647 nir_ssa_def
*eta
= src
[2];
648 nir_ssa_def
*n_dot_i
= nir_fdot(nb
, N
, I
);
649 nir_ssa_def
*one
= nir_imm_float(nb
, 1.0);
650 nir_ssa_def
*zero
= nir_imm_float(nb
, 0.0);
651 /* k = 1.0 - eta * eta * (1.0 - dot(N, I) * dot(N, I)) */
653 nir_fsub(nb
, one
, nir_fmul(nb
, eta
, nir_fmul(nb
, eta
,
654 nir_fsub(nb
, one
, nir_fmul(nb
, n_dot_i
, n_dot_i
)))));
655 nir_ssa_def
*result
=
656 nir_fsub(nb
, nir_fmul(nb
, eta
, I
),
657 nir_fmul(nb
, nir_fadd(nb
, nir_fmul(nb
, eta
, n_dot_i
),
658 nir_fsqrt(nb
, k
)), N
));
659 /* XXX: bcsel, or if statement? */
660 val
->ssa
->def
= nir_bcsel(nb
, nir_flt(nb
, k
, zero
), zero
, result
);
665 /* 0.5 * (e^x - e^(-x)) */
667 nir_fmul(nb
, nir_imm_float(nb
, 0.5f
),
668 nir_fsub(nb
, build_exp(nb
, src
[0]),
669 build_exp(nb
, nir_fneg(nb
, src
[0]))));
673 /* 0.5 * (e^x + e^(-x)) */
675 nir_fmul(nb
, nir_imm_float(nb
, 0.5f
),
676 nir_fadd(nb
, build_exp(nb
, src
[0]),
677 build_exp(nb
, nir_fneg(nb
, src
[0]))));
680 case GLSLstd450Tanh
: {
681 /* tanh(x) := (0.5 * (e^x - e^(-x))) / (0.5 * (e^x + e^(-x)))
683 * With a little algebra this reduces to (e^2x - 1) / (e^2x + 1)
685 * We clamp x to (-inf, +10] to avoid precision problems. When x > 10,
686 * e^2x is so much larger than 1.0 that 1.0 gets flushed to zero in the
687 * computation e^2x +/- 1 so it can be ignored.
689 nir_ssa_def
*x
= nir_fmin(nb
, src
[0], nir_imm_float(nb
, 10));
690 nir_ssa_def
*exp2x
= build_exp(nb
, nir_fmul(nb
, x
, nir_imm_float(nb
, 2)));
691 val
->ssa
->def
= nir_fdiv(nb
, nir_fsub(nb
, exp2x
, nir_imm_float(nb
, 1)),
692 nir_fadd(nb
, exp2x
, nir_imm_float(nb
, 1)));
696 case GLSLstd450Asinh
:
697 val
->ssa
->def
= nir_fmul(nb
, nir_fsign(nb
, src
[0]),
698 build_log(nb
, nir_fadd(nb
, nir_fabs(nb
, src
[0]),
699 nir_fsqrt(nb
, nir_fadd(nb
, nir_fmul(nb
, src
[0], src
[0]),
700 nir_imm_float(nb
, 1.0f
))))));
702 case GLSLstd450Acosh
:
703 val
->ssa
->def
= build_log(nb
, nir_fadd(nb
, src
[0],
704 nir_fsqrt(nb
, nir_fsub(nb
, nir_fmul(nb
, src
[0], src
[0]),
705 nir_imm_float(nb
, 1.0f
)))));
707 case GLSLstd450Atanh
: {
708 nir_ssa_def
*one
= nir_imm_float(nb
, 1.0);
709 val
->ssa
->def
= nir_fmul(nb
, nir_imm_float(nb
, 0.5f
),
710 build_log(nb
, nir_fdiv(nb
, nir_fadd(nb
, one
, src
[0]),
711 nir_fsub(nb
, one
, src
[0]))));
716 val
->ssa
->def
= build_asin(nb
, src
[0], 0.086566724, -0.03102955);
720 val
->ssa
->def
= nir_fsub(nb
, nir_imm_float(nb
, M_PI_2f
),
721 build_asin(nb
, src
[0], 0.08132463, -0.02363318));
725 val
->ssa
->def
= build_atan(nb
, src
[0]);
728 case GLSLstd450Atan2
:
729 val
->ssa
->def
= build_atan2(nb
, src
[0], src
[1]);
732 case GLSLstd450Frexp
: {
733 nir_ssa_def
*exponent
;
734 if (src
[0]->bit_size
== 64)
735 val
->ssa
->def
= build_frexp64(nb
, src
[0], &exponent
);
737 val
->ssa
->def
= build_frexp32(nb
, src
[0], &exponent
);
738 nir_store_deref_var(nb
, vtn_nir_deref(b
, w
[6]), exponent
, 0xf);
742 case GLSLstd450FrexpStruct
: {
743 vtn_assert(glsl_type_is_struct(val
->ssa
->type
));
744 if (src
[0]->bit_size
== 64)
745 val
->ssa
->elems
[0]->def
= build_frexp64(nb
, src
[0],
746 &val
->ssa
->elems
[1]->def
);
748 val
->ssa
->elems
[0]->def
= build_frexp32(nb
, src
[0],
749 &val
->ssa
->elems
[1]->def
);
755 nir_build_alu(&b
->nb
,
756 vtn_nir_alu_op_for_spirv_glsl_opcode(b
, entrypoint
),
757 src
[0], src
[1], src
[2], NULL
);
763 handle_glsl450_interpolation(struct vtn_builder
*b
, enum GLSLstd450 opcode
,
764 const uint32_t *w
, unsigned count
)
766 const struct glsl_type
*dest_type
=
767 vtn_value(b
, w
[1], vtn_value_type_type
)->type
->type
;
769 struct vtn_value
*val
= vtn_push_value(b
, w
[2], vtn_value_type_ssa
);
770 val
->ssa
= vtn_create_ssa_value(b
, dest_type
);
774 case GLSLstd450InterpolateAtCentroid
:
775 op
= nir_intrinsic_interp_var_at_centroid
;
777 case GLSLstd450InterpolateAtSample
:
778 op
= nir_intrinsic_interp_var_at_sample
;
780 case GLSLstd450InterpolateAtOffset
:
781 op
= nir_intrinsic_interp_var_at_offset
;
784 vtn_fail("Invalid opcode");
787 nir_intrinsic_instr
*intrin
= nir_intrinsic_instr_create(b
->nb
.shader
, op
);
789 nir_deref_var
*deref
= vtn_nir_deref(b
, w
[5]);
790 intrin
->variables
[0] = nir_deref_var_clone(deref
, intrin
);
793 case GLSLstd450InterpolateAtCentroid
:
795 case GLSLstd450InterpolateAtSample
:
796 case GLSLstd450InterpolateAtOffset
:
797 intrin
->src
[0] = nir_src_for_ssa(vtn_ssa_value(b
, w
[6])->def
);
800 vtn_fail("Invalid opcode");
803 intrin
->num_components
= glsl_get_vector_elements(dest_type
);
804 nir_ssa_dest_init(&intrin
->instr
, &intrin
->dest
,
805 glsl_get_vector_elements(dest_type
),
806 glsl_get_bit_size(dest_type
), NULL
);
807 val
->ssa
->def
= &intrin
->dest
.ssa
;
809 nir_builder_instr_insert(&b
->nb
, &intrin
->instr
);
813 vtn_handle_glsl450_instruction(struct vtn_builder
*b
, uint32_t ext_opcode
,
814 const uint32_t *w
, unsigned count
)
816 switch ((enum GLSLstd450
)ext_opcode
) {
817 case GLSLstd450Determinant
: {
818 struct vtn_value
*val
= vtn_push_value(b
, w
[2], vtn_value_type_ssa
);
819 val
->ssa
= rzalloc(b
, struct vtn_ssa_value
);
820 val
->ssa
->type
= vtn_value(b
, w
[1], vtn_value_type_type
)->type
->type
;
821 val
->ssa
->def
= build_mat_det(b
, vtn_ssa_value(b
, w
[5]));
825 case GLSLstd450MatrixInverse
: {
826 struct vtn_value
*val
= vtn_push_value(b
, w
[2], vtn_value_type_ssa
);
827 val
->ssa
= matrix_inverse(b
, vtn_ssa_value(b
, w
[5]));
831 case GLSLstd450InterpolateAtCentroid
:
832 case GLSLstd450InterpolateAtSample
:
833 case GLSLstd450InterpolateAtOffset
:
834 handle_glsl450_interpolation(b
, ext_opcode
, w
, count
);
838 handle_glsl450_alu(b
, (enum GLSLstd450
)ext_opcode
, w
, count
);