2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
25 #include "ir_builder.h"
26 #include "ir_optimization.h"
27 #include "ir_rvalue_visitor.h"
31 using namespace ir_builder
;
34 * A visitor that lowers built-in floating-point pack/unpack expressions
37 class lower_packing_builtins_visitor
: public ir_rvalue_visitor
{
40 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
42 explicit lower_packing_builtins_visitor(int op_mask
)
46 /* Mutually exclusive options. */
47 assert(!((op_mask
& LOWER_PACK_HALF_2x16
) &&
48 (op_mask
& LOWER_PACK_HALF_2x16_TO_SPLIT
)));
50 assert(!((op_mask
& LOWER_UNPACK_HALF_2x16
) &&
51 (op_mask
& LOWER_UNPACK_HALF_2x16_TO_SPLIT
)));
53 factory
.instructions
= &factory_instructions
;
56 virtual ~lower_packing_builtins_visitor()
58 assert(factory_instructions
.is_empty());
/** Accessor: returns the current value of `progress`. */
61 bool get_progress() { return progress
; }
63 void handle_rvalue(ir_rvalue
**rvalue
)
68 ir_expression
*expr
= (*rvalue
)->as_expression();
72 enum lower_packing_builtins_op lowering_op
=
73 choose_lowering_op(expr
->operation
);
75 if (lowering_op
== LOWER_PACK_UNPACK_NONE
)
78 setup_factory(ralloc_parent(expr
));
80 ir_rvalue
*op0
= expr
->operands
[0];
81 ralloc_steal(factory
.mem_ctx
, op0
);
83 switch (lowering_op
) {
84 case LOWER_PACK_SNORM_2x16
:
85 *rvalue
= lower_pack_snorm_2x16(op0
);
87 case LOWER_PACK_SNORM_4x8
:
88 *rvalue
= lower_pack_snorm_4x8(op0
);
90 case LOWER_PACK_UNORM_2x16
:
91 *rvalue
= lower_pack_unorm_2x16(op0
);
93 case LOWER_PACK_UNORM_4x8
:
94 *rvalue
= lower_pack_unorm_4x8(op0
);
96 case LOWER_PACK_HALF_2x16
:
97 *rvalue
= lower_pack_half_2x16(op0
);
99 case LOWER_PACK_HALF_2x16_TO_SPLIT
:
100 *rvalue
= split_pack_half_2x16(op0
);
102 case LOWER_UNPACK_SNORM_2x16
:
103 *rvalue
= lower_unpack_snorm_2x16(op0
);
105 case LOWER_UNPACK_SNORM_4x8
:
106 *rvalue
= lower_unpack_snorm_4x8(op0
);
108 case LOWER_UNPACK_UNORM_2x16
:
109 *rvalue
= lower_unpack_unorm_2x16(op0
);
111 case LOWER_UNPACK_UNORM_4x8
:
112 *rvalue
= lower_unpack_unorm_4x8(op0
);
114 case LOWER_UNPACK_HALF_2x16
:
115 *rvalue
= lower_unpack_half_2x16(op0
);
117 case LOWER_UNPACK_HALF_2x16_TO_SPLIT
:
118 *rvalue
= split_unpack_half_2x16(op0
);
120 case LOWER_PACK_UNPACK_NONE
:
121 assert(!"not reached");
133 exec_list factory_instructions
;
136 * Determine the needed lowering operation by filtering \a expr_op
137 * through \ref op_mask.
139 enum lower_packing_builtins_op
140 choose_lowering_op(ir_expression_operation expr_op
)
142 /* C++ regards int and enum as fundamentally different types.
143 * So, we can't simply return from each case; we must cast the return
149 case ir_unop_pack_snorm_2x16
:
150 result
= op_mask
& LOWER_PACK_SNORM_2x16
;
152 case ir_unop_pack_snorm_4x8
:
153 result
= op_mask
& LOWER_PACK_SNORM_4x8
;
155 case ir_unop_pack_unorm_2x16
:
156 result
= op_mask
& LOWER_PACK_UNORM_2x16
;
158 case ir_unop_pack_unorm_4x8
:
159 result
= op_mask
& LOWER_PACK_UNORM_4x8
;
161 case ir_unop_pack_half_2x16
:
162 result
= op_mask
& (LOWER_PACK_HALF_2x16
| LOWER_PACK_HALF_2x16_TO_SPLIT
);
164 case ir_unop_unpack_snorm_2x16
:
165 result
= op_mask
& LOWER_UNPACK_SNORM_2x16
;
167 case ir_unop_unpack_snorm_4x8
:
168 result
= op_mask
& LOWER_UNPACK_SNORM_4x8
;
170 case ir_unop_unpack_unorm_2x16
:
171 result
= op_mask
& LOWER_UNPACK_UNORM_2x16
;
173 case ir_unop_unpack_unorm_4x8
:
174 result
= op_mask
& LOWER_UNPACK_UNORM_4x8
;
176 case ir_unop_unpack_half_2x16
:
177 result
= op_mask
& (LOWER_UNPACK_HALF_2x16
| LOWER_UNPACK_HALF_2x16_TO_SPLIT
);
180 result
= LOWER_PACK_UNPACK_NONE
;
184 return static_cast<enum lower_packing_builtins_op
>(result
);
188 setup_factory(void *mem_ctx
)
190 assert(factory
.mem_ctx
== NULL
);
191 assert(factory
.instructions
->is_empty());
193 factory
.mem_ctx
= mem_ctx
;
199 base_ir
->insert_before(factory
.instructions
);
200 assert(factory
.instructions
->is_empty());
201 factory
.mem_ctx
= NULL
;
204 template <typename T
>
208 return factory
.constant(x
);
212 * \brief Pack two uint16's into a single uint32.
214 * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
215 * where the least significant bits specify the first element of the pair.
219 pack_uvec2_to_uint(ir_rvalue
*uvec2_rval
)
221 assert(uvec2_rval
->type
== glsl_type::uvec2_type
);
223 /* uvec2 u = UVEC2_RVAL; */
224 ir_variable
*u
= factory
.make_temp(glsl_type::uvec2_type
,
225 "tmp_pack_uvec2_to_uint");
226 factory
.emit(assign(u
, uvec2_rval
));
228 /* return (u.y << 16) | (u.x & 0xffff); */
229 return bit_or(lshift(swizzle_y(u
), constant(16u)),
230 bit_and(swizzle_x(u
), constant(0xffffu
)));
234 * \brief Pack four uint8's into a single uint32.
236 * Interpret the given uvec4 as a uint32 4-tuple. Pack the 4-tuple into a
237 * uint32 where the least significant bits specify the first element of the
238 * 4-tuple. Return the uint32.
241 pack_uvec4_to_uint(ir_rvalue
*uvec4_rval
)
243 assert(uvec4_rval
->type
== glsl_type::uvec4_type
);
245 /* uvec4 u = UVEC4_RVAL; */
246 ir_variable
*u
= factory
.make_temp(glsl_type::uvec4_type
,
247 "tmp_pack_uvec4_to_uint");
248 factory
.emit(assign(u
, bit_and(uvec4_rval
, constant(0xffu
))));
250 /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
251 return bit_or(bit_or(lshift(swizzle_w(u
), constant(24u)),
252 lshift(swizzle_z(u
), constant(16u))),
253 bit_or(lshift(swizzle_y(u
), constant(8u)),
258 * \brief Unpack a uint32 into two uint16's.
260 * Interpret the given uint32 as a uint16 pair where the uint32's least
261 * significant bits specify the pair's first element. Return the uint16
265 unpack_uint_to_uvec2(ir_rvalue
*uint_rval
)
267 assert(uint_rval
->type
== glsl_type::uint_type
);
269 /* uint u = UINT_RVAL; */
270 ir_variable
*u
= factory
.make_temp(glsl_type::uint_type
,
271 "tmp_unpack_uint_to_uvec2_u");
272 factory
.emit(assign(u
, uint_rval
));
275 ir_variable
*u2
= factory
.make_temp(glsl_type::uvec2_type
,
276 "tmp_unpack_uint_to_uvec2_u2");
278 /* u2.x = u & 0xffffu; */
279 factory
.emit(assign(u2
, bit_and(u
, constant(0xffffu
)), WRITEMASK_X
));
281 /* u2.y = u >> 16u; */
282 factory
.emit(assign(u2
, rshift(u
, constant(16u)), WRITEMASK_Y
));
284 return deref(u2
).val
;
288 * \brief Unpack a uint32 into four uint8's.
290 * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
291 * significant bits specify the 4-tuple's first element. Return the uint8
292 * 4-tuple as a uvec4.
295 unpack_uint_to_uvec4(ir_rvalue
*uint_rval
)
297 assert(uint_rval
->type
== glsl_type::uint_type
);
299 /* uint u = UINT_RVAL; */
300 ir_variable
*u
= factory
.make_temp(glsl_type::uint_type
,
301 "tmp_unpack_uint_to_uvec4_u");
302 factory
.emit(assign(u
, uint_rval
));
305 ir_variable
*u4
= factory
.make_temp(glsl_type::uvec4_type
,
306 "tmp_unpack_uint_to_uvec4_u4");
308 /* u4.x = u & 0xffu; */
309 factory
.emit(assign(u4
, bit_and(u
, constant(0xffu
)), WRITEMASK_X
));
311 /* u4.y = (u >> 8u) & 0xffu; */
312 factory
.emit(assign(u4
, bit_and(rshift(u
, constant(8u)),
313 constant(0xffu
)), WRITEMASK_Y
));
315 /* u4.z = (u >> 16u) & 0xffu; */
316 factory
.emit(assign(u4
, bit_and(rshift(u
, constant(16u)),
317 constant(0xffu
)), WRITEMASK_Z
));
319 /* u4.w = (u >> 24u) */
320 factory
.emit(assign(u4
, rshift(u
, constant(24u)), WRITEMASK_W
));
322 return deref(u4
).val
;
326 * \brief Lower a packSnorm2x16 expression.
328 * \param vec2_rval is packSnorm2x16's input
329 * \return packSnorm2x16's output as a uint rvalue
332 lower_pack_snorm_2x16(ir_rvalue
*vec2_rval
)
334 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
336 * highp uint packSnorm2x16(vec2 v)
337 * --------------------------------
338 * First, converts each component of the normalized floating-point value
339 * v into 16-bit integer values. Then, the results are packed into the
340 * returned 32-bit unsigned integer.
342 * The conversion for component c of v to fixed point is done as
345 * packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
347 * The first component of the vector will be written to the least
348 * significant bits of the output; the last component will be written to
349 * the most significant bits.
351 * This function generates IR that approximates the following pseudo-GLSL:
353 * return pack_uvec2_to_uint(
355 * round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
357 * It is necessary to first convert the vec2 to ivec2 rather than directly
358 * converting vec2 to uvec2 because the latter conversion is undefined.
359 * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
360 * convert a negative floating point value to an uint".
362 assert(vec2_rval
->type
== glsl_type::vec2_type
);
364 ir_rvalue
*result
= pack_uvec2_to_uint(
365 i2u(f2i(round_even(mul(clamp(vec2_rval
,
368 constant(32767.0f
))))));
370 assert(result
->type
== glsl_type::uint_type
);
375 * \brief Lower a packSnorm4x8 expression.
377 * \param vec4_rval is packSnorm4x8's input
378 * \return packSnorm4x8's output as a uint rvalue
381 lower_pack_snorm_4x8(ir_rvalue
*vec4_rval
)
383 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
385 * highp uint packSnorm4x8(vec4 v)
386 * -------------------------------
387 * First, converts each component of the normalized floating-point value
388 * v into 8-bit integer values. Then, the results are packed into the
389 * returned 32-bit unsigned integer.
391 * The conversion for component c of v to fixed point is done as
394 * packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
396 * The first component of the vector will be written to the least
397 * significant bits of the output; the last component will be written to
398 * the most significant bits.
400 * This function generates IR that approximates the following pseudo-GLSL:
402 * return pack_uvec4_to_uint(
404 * round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
406 * It is necessary to first convert the vec4 to ivec4 rather than directly
407 * converting vec4 to uvec4 because the latter conversion is undefined.
408 * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
409 * convert a negative floating point value to an uint".
411 assert(vec4_rval
->type
== glsl_type::vec4_type
);
413 ir_rvalue
*result
= pack_uvec4_to_uint(
414 i2u(f2i(round_even(mul(clamp(vec4_rval
,
417 constant(127.0f
))))));
419 assert(result
->type
== glsl_type::uint_type
);
424 * \brief Lower an unpackSnorm2x16 expression.
426 * \param uint_rval is unpackSnorm2x16's input
427 * \return unpackSnorm2x16's output as a vec2 rvalue
430 lower_unpack_snorm_2x16(ir_rvalue
*uint_rval
)
432 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
434 * highp vec2 unpackSnorm2x16 (highp uint p)
435 * -----------------------------------------
436 * First, unpacks a single 32-bit unsigned integer p into a pair of
437 * 16-bit unsigned integers. Then, each component is converted to
438 * a normalized floating-point value to generate the returned
439 * two-component vector.
441 * The conversion for unpacked fixed-point value f to floating point is
444 * unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
446 * The first component of the returned vector will be extracted from the
447 * least significant bits of the input; the last component will be
448 * extracted from the most significant bits.
450 * This function generates IR that approximates the following pseudo-GLSL:
453 * ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
456 * The above IR may appear unnecessarily complex, but the intermediate
457 * conversion to ivec2 and the bit shifts are necessary to correctly unpack
460 * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
461 * packSnorm2x16 encodes -1.0 as the int16 0x8001. During unpacking, we
462 * place that int16 into an int32, which results in the *positive* integer
463 * 0x00008001. The int16's sign bit becomes, in the int32, the rather
464 * unimportant bit 16. We must now extend the int16's sign bit into bits
465 * 17-32, which is accomplished by left-shifting then right-shifting.
468 assert(uint_rval
->type
== glsl_type::uint_type
);
471 clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval
)),
478 assert(result
->type
== glsl_type::vec2_type
);
483 * \brief Lower an unpackSnorm4x8 expression.
485 * \param uint_rval is unpackSnorm4x8's input
486 * \return unpackSnorm4x8's output as a vec4 rvalue
489 lower_unpack_snorm_4x8(ir_rvalue
*uint_rval
)
491 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
493 * highp vec4 unpackSnorm4x8 (highp uint p)
494 * ----------------------------------------
495 * First, unpacks a single 32-bit unsigned integer p into four
496 * 8-bit unsigned integers. Then, each component is converted to
497 * a normalized floating-point value to generate the returned
498 * four-component vector.
500 * The conversion for unpacked fixed-point value f to floating point is
503 * unpackSnorm4x8: clamp(f / 127.0, -1, +1)
505 * The first component of the returned vector will be extracted from the
506 * least significant bits of the input; the last component will be
507 * extracted from the most significant bits.
509 * This function generates IR that approximates the following pseudo-GLSL:
512 * ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
515 * The above IR may appear unnecessarily complex, but the intermediate
516 * conversion to ivec4 and the bit shifts are necessary to correctly unpack
519 * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
520 * 0.0). packSnorm4x8 encodes -1.0 as the int8 0x81. During unpacking, we
521 * place that int8 into an int32, which results in the *positive* integer
522 * 0x00000081. The int8's sign bit becomes, in the int32, the rather
523 * unimportant bit 8. We must now extend the int8's sign bit into bits
524 * 9-32, which is accomplished by left-shifting then right-shifting.
527 assert(uint_rval
->type
== glsl_type::uint_type
);
530 clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval
)),
537 assert(result
->type
== glsl_type::vec4_type
);
542 * \brief Lower a packUnorm2x16 expression.
544 * \param vec2_rval is packUnorm2x16's input
545 * \return packUnorm2x16's output as a uint rvalue
548 lower_pack_unorm_2x16(ir_rvalue
*vec2_rval
)
550 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
552 * highp uint packUnorm2x16 (vec2 v)
553 * ---------------------------------
554 * First, converts each component of the normalized floating-point value
555 * v into 16-bit integer values. Then, the results are packed into the
556 * returned 32-bit unsigned integer.
558 * The conversion for component c of v to fixed point is done as
561 * packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
563 * The first component of the vector will be written to the least
564 * significant bits of the output; the last component will be written to
565 * the most significant bits.
567 * This function generates IR that approximates the following pseudo-GLSL:
569 * return pack_uvec2_to_uint(uvec2(
570 * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
572 * Here it is safe to directly convert the vec2 to uvec2 because the
573 * vec2 has been clamped to a non-negative range.
576 assert(vec2_rval
->type
== glsl_type::vec2_type
);
578 ir_rvalue
*result
= pack_uvec2_to_uint(
579 f2u(round_even(mul(saturate(vec2_rval
), constant(65535.0f
)))));
581 assert(result
->type
== glsl_type::uint_type
);
586 * \brief Lower a packUnorm4x8 expression.
588 * \param vec4_rval is packUnorm4x8's input
589 * \return packUnorm4x8's output as a uint rvalue
592 lower_pack_unorm_4x8(ir_rvalue
*vec4_rval
)
594 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
596 * highp uint packUnorm4x8 (vec4 v)
597 * --------------------------------
598 * First, converts each component of the normalized floating-point value
599 * v into 8-bit integer values. Then, the results are packed into the
600 * returned 32-bit unsigned integer.
602 * The conversion for component c of v to fixed point is done as
605 * packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
607 * The first component of the vector will be written to the least
608 * significant bits of the output; the last component will be written to
609 * the most significant bits.
611 * This function generates IR that approximates the following pseudo-GLSL:
613 * return pack_uvec4_to_uint(uvec4(
614 * round(clamp(VEC4_RVALUE, 0.0f, 1.0f) * 255.0f)));
616 * Here it is safe to directly convert the vec4 to uvec4 because the
617 * vec4 has been clamped to a non-negative range.
620 assert(vec4_rval
->type
== glsl_type::vec4_type
);
622 ir_rvalue
*result
= pack_uvec4_to_uint(
623 f2u(round_even(mul(saturate(vec4_rval
), constant(255.0f
)))));
625 assert(result
->type
== glsl_type::uint_type
);
630 * \brief Lower an unpackUnorm2x16 expression.
632 * \param uint_rval is unpackUnorm2x16's input
633 * \return unpackUnorm2x16's output as a vec2 rvalue
636 lower_unpack_unorm_2x16(ir_rvalue
*uint_rval
)
638 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
640 * highp vec2 unpackUnorm2x16 (highp uint p)
641 * -----------------------------------------
642 * First, unpacks a single 32-bit unsigned integer p into a pair of
643 * 16-bit unsigned integers. Then, each component is converted to
644 * a normalized floating-point value to generate the returned
645 * two-component vector.
647 * The conversion for unpacked fixed-point value f to floating point is
650 * unpackUnorm2x16: f / 65535.0
652 * The first component of the returned vector will be extracted from the
653 * least significant bits of the input; the last component will be
654 * extracted from the most significant bits.
656 * This function generates IR that approximates the following pseudo-GLSL:
658 * return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
661 assert(uint_rval
->type
== glsl_type::uint_type
);
663 ir_rvalue
*result
= div(u2f(unpack_uint_to_uvec2(uint_rval
)),
666 assert(result
->type
== glsl_type::vec2_type
);
671 * \brief Lower an unpackUnorm4x8 expression.
673 * \param uint_rval is unpackUnorm4x8's input
674 * \return unpackUnorm4x8's output as a vec4 rvalue
677 lower_unpack_unorm_4x8(ir_rvalue
*uint_rval
)
679 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
681 * highp vec4 unpackUnorm4x8 (highp uint p)
682 * ----------------------------------------
683 * First, unpacks a single 32-bit unsigned integer p into four
684 * 8-bit unsigned integers. Then, each component is converted to
685 * a normalized floating-point value to generate the returned
686 * four-component vector.
688 * The conversion for unpacked fixed-point value f to floating point is
691 * unpackUnorm4x8: f / 255.0
693 * The first component of the returned vector will be extracted from the
694 * least significant bits of the input; the last component will be
695 * extracted from the most significant bits.
697 * This function generates IR that approximates the following pseudo-GLSL:
699 * return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
702 assert(uint_rval
->type
== glsl_type::uint_type
);
704 ir_rvalue
*result
= div(u2f(unpack_uint_to_uvec4(uint_rval
)),
707 assert(result
->type
== glsl_type::vec4_type
);
712 * \brief Lower the component-wise calculation of packHalf2x16.
714 * \param f_rval is one component of packHalf2x16's input
715 * \param e_rval is the unshifted exponent bits of f_rval
716 * \param m_rval is the unshifted mantissa bits of f_rval
718 * \return a uint rvalue that encodes a float16 in its lower 16 bits
721 pack_half_1x16_nosign(ir_rvalue
*f_rval
,
725 assert(e_rval
->type
== glsl_type::uint_type
);
726 assert(m_rval
->type
== glsl_type::uint_type
);
729 ir_variable
*u16
= factory
.make_temp(glsl_type::uint_type
,
730 "tmp_pack_half_1x16_u16");
732 /* float f = FLOAT_RVAL; */
733 ir_variable
*f
= factory
.make_temp(glsl_type::float_type
,
734 "tmp_pack_half_1x16_f");
735 factory
.emit(assign(f
, f_rval
));
737 /* uint e = E_RVAL; */
738 ir_variable
*e
= factory
.make_temp(glsl_type::uint_type
,
739 "tmp_pack_half_1x16_e");
740 factory
.emit(assign(e
, e_rval
));
742 /* uint m = M_RVAL; */
743 ir_variable
*m
= factory
.make_temp(glsl_type::uint_type
,
744 "tmp_pack_half_1x16_m");
745 factory
.emit(assign(m
, m_rval
));
750 * For a float16, the bit layout is:
756 * Let f16 be a float16 value. The sign, exponent, and mantissa
757 * determine its value thus:
759 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
760 * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
761 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
762 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
763 * if e16 = 31 and m16 != 0, then NaN (5)
765 * where 0 <= m16 < 2^10.
767 * For a float32, the bit layout is:
773 * Let f32 be a float32 value. The sign, exponent, and mantissa
774 * determine its value thus:
776 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
777 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
778 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
779 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
780 * if e32 = 255 and m32 != 0, then NaN (14)
782 * where 0 <= m32 < 2^23.
784 * The minimum and maximum normal float16 values are
786 * min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14) (20)
787 * max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10) (21)
789 * The step at max_norm16 is
791 * max_step16 = 2^5 (22)
793 * Observe that the float16 boundary values in equations 20-21 lie in the
794 * range of normal float32 values.
799 * Not all float32 values can be exactly represented as a float16. We
800 * round all such intermediate float32 values to the nearest float16; if
801 * the float32 is exactly between two float16 values, we round to the one
802 * with an even mantissa. This rounding behavior has several benefits:
804 * - It has no sign bias.
806 * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
809 * - By reproducing the behavior of the GPU (at least on Intel hardware),
810 * compile-time evaluation of constant packHalf2x16 GLSL expressions will
811 * result in the same value as if the expression were executed on the
816 * Our task is to compute s16, e16, m16 given f32. Since this function
817 * ignores the sign bit, assume that s32 = s16 = 0. There are several
823 /* Case 1) f32 is NaN
825 * The resultant f16 will also be NaN.
828 /* if (e32 == 255 && m32 != 0) { */
829 if_tree(logic_and(equal(e
, constant(0xffu
<< 23u)),
830 logic_not(equal(m
, constant(0u)))),
832 assign(u16
, constant(0x7fffu
)),
834 /* Case 2) f32 lies in the range [0, min_norm16).
836 * The resultant float16 will be either zero, subnormal, or normal.
840 * f32 = min_norm16 (30)
844 * e32 = 113 and m32 = 0 (31)
846 * Therefore this case occurs if and only if
851 /* } else if (e32 < 113) { */
852 if_tree(less(e
, constant(113u << 23u)),
854 /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
855 assign(u16
, f2u(round_even(mul(expr(ir_unop_abs
, f
),
856 constant((float) (1 << 24)))))),
858 /* Case 3) f32 lies in the range
859 * [min_norm16, max_norm16 + max_step16).
861 * The resultant float16 will be either normal or infinite.
865 * f32 = max_norm16 + max_step16 (40)
866 * = 2^15 * (1 + 1023 / 2^10) + 2^5 (41)
870 * e32 = 143 and m32 = 0 (43)
872 * We already solved the boundary condition f32 = min_norm16 above
873 * in equation 31. Therefore this case occurs if and only if
875 * 113 <= e32 and e32 < 143
878 /* } else if (e32 < 143) { */
879 if_tree(less(e
, constant(143u << 23u)),
881 /* The addition below handles the case where the mantissa rounds
882 * up to 1024 and bumps the exponent.
884 * u16 = ((e - (112u << 23u)) >> 13u)
885 * + round_to_even((float(m) / (1u << 13u));
887 assign(u16
, add(rshift(sub(e
, constant(112u << 23u)),
890 div(u2f(m
), constant((float) (1 << 13))))))),
892 /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
894 * The resultant float16 will be infinite.
896 * The cases above caught all float32 values in the range
897 * [0, max_norm16 + max_step16), so this is the fall-through case.
902 assign(u16
, constant(31u << 10u))))));
906 return deref(u16
).val
;
910 * \brief Lower a packHalf2x16 expression.
912 * \param vec2_rval is packHalf2x16's input
913 * \return packHalf2x16's output as a uint rvalue
916 lower_pack_half_2x16(ir_rvalue
*vec2_rval
)
918 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
920 * highp uint packHalf2x16 (mediump vec2 v)
921 * ----------------------------------------
922 * Returns an unsigned integer obtained by converting the components of
923 * a two-component floating-point vector to the 16-bit floating-point
924 * representation found in the OpenGL ES Specification, and then packing
925 * these two 16-bit integers into a 32-bit unsigned integer.
927 * The first vector component specifies the 16 least- significant bits
928 * of the result; the second component specifies the 16 most-significant
932 assert(vec2_rval
->type
== glsl_type::vec2_type
);
934 /* vec2 f = VEC2_RVAL; */
935 ir_variable
*f
= factory
.make_temp(glsl_type::vec2_type
,
936 "tmp_pack_half_2x16_f");
937 factory
.emit(assign(f
, vec2_rval
));
939 /* uvec2 f32 = bitcast_f2u(f); */
940 ir_variable
*f32
= factory
.make_temp(glsl_type::uvec2_type
,
941 "tmp_pack_half_2x16_f32");
942 factory
.emit(assign(f32
, expr(ir_unop_bitcast_f2u
, f
)));
945 ir_variable
*f16
= factory
.make_temp(glsl_type::uvec2_type
,
946 "tmp_pack_half_2x16_f16");
948 /* Get f32's unshifted exponent bits.
950 * uvec2 e = f32 & 0x7f800000u;
952 ir_variable
*e
= factory
.make_temp(glsl_type::uvec2_type
,
953 "tmp_pack_half_2x16_e");
954 factory
.emit(assign(e
, bit_and(f32
, constant(0x7f800000u
))));
956 /* Get f32's unshifted mantissa bits.
958 * uvec2 m = f32 & 0x007fffffu;
960 ir_variable
*m
= factory
.make_temp(glsl_type::uvec2_type
,
961 "tmp_pack_half_2x16_m");
962 factory
.emit(assign(m
, bit_and(f32
, constant(0x007fffffu
))));
964 /* Set f16's exponent and mantissa bits.
966 * f16.x = pack_half_1x16_nosign(e.x, m.x);
967 * f16.y = pack_half_1x16_nosign(e.y, m.y);
969 factory
.emit(assign(f16
, pack_half_1x16_nosign(swizzle_x(f
),
973 factory
.emit(assign(f16
, pack_half_1x16_nosign(swizzle_y(f
),
978 /* Set f16's sign bits.
980 * f16 |= (f32 & (1u << 31u)) >> 16u;
983 assign(f16
, bit_or(f16
,
984 rshift(bit_and(f32
, constant(1u << 31u)),
988 /* return (f16.y << 16u) | f16.x; */
989 ir_rvalue
*result
= bit_or(lshift(swizzle_y(f16
),
993 assert(result
->type
== glsl_type::uint_type
);
998 * \brief Split packHalf2x16's vec2 operand into two floats.
1000 * \param vec2_rval is packHalf2x16's input
1001 * \return a uint rvalue
1003 * Some code generators, such as the i965 fragment shader, require that all
1004 * vector expressions be lowered to a sequence of scalar expressions.
1005 * However, packHalf2x16 cannot be scalarized by the same mechanism as
1006 * a true vector operation because its input and output have a differing
1007 * number of vector components.
1009 * This method scalarizes packHalf2x16 by transforming it from an unary
1010 * operation having vector input to a binary operation having scalar input.
1011 * That is, it transforms
1013 * packHalf2x16(VEC2_RVAL);
1017 * vec2 v = VEC2_RVAL;
1018 * return packHalf2x16_split(v.x, v.y);
1021 split_pack_half_2x16(ir_rvalue
*vec2_rval
)
1023 assert(vec2_rval
->type
== glsl_type::vec2_type
);
1025 ir_variable
*v
= factory
.make_temp(glsl_type::vec2_type
,
1026 "tmp_split_pack_half_2x16_v");
1027 factory
.emit(assign(v
, vec2_rval
));
1029 return expr(ir_binop_pack_half_2x16_split
, swizzle_x(v
), swizzle_y(v
));
1033 * \brief Lower the component-wise calculation of unpackHalf2x16.
1035 * Given a uint that encodes a float16 in its lower 16 bits, this function
1036 * returns a uint that encodes a float32 with the same value. The sign bit
1037 * of the float16 is ignored.
1039 * \param e_rval is the unshifted exponent bits of a float16
1040 * \param m_rval is the unshifted mantissa bits of a float16
1041 * \return a uint rvalue that encodes a float32
1044 unpack_half_1x16_nosign(ir_rvalue
*e_rval
, ir_rvalue
*m_rval
)
1046 assert(e_rval
->type
== glsl_type::uint_type
);
1047 assert(m_rval
->type
== glsl_type::uint_type
);
1050 ir_variable
*u32
= factory
.make_temp(glsl_type::uint_type
,
1051 "tmp_unpack_half_1x16_u32");
1053 /* uint e = E_RVAL; */
1054 ir_variable
*e
= factory
.make_temp(glsl_type::uint_type
,
1055 "tmp_unpack_half_1x16_e");
1056 factory
.emit(assign(e
, e_rval
));
1058 /* uint m = M_RVAL; */
1059 ir_variable
*m
= factory
.make_temp(glsl_type::uint_type
,
1060 "tmp_unpack_half_1x16_m");
1061 factory
.emit(assign(m
, m_rval
));
1066 * For a float16, the bit layout is:
1072 * Let f16 be a float16 value. The sign, exponent, and mantissa
1073 * determine its value thus:
1075 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
1076 * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
1077 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1078 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
1079 * if e16 = 31 and m16 != 0, then NaN (5)
1081 * where 0 <= m16 < 2^10.
1083 * For a float32, the bit layout is:
1089 * Let f32 be a float32 value. The sign, exponent, and mantissa
1090 * determine its value thus:
1092 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
1093 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
1094 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1095 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
1096 * if e32 = 255 and m32 != 0, then NaN (14)
1098 * where 0 <= m32 < 2^23.
1102 * Our task is to compute s32, e32, m32 given f16. Since this function
1103 * ignores the sign bit, assume that s32 = s16 = 0. There are several
1109 /* Case 1) f16 is zero or subnormal.
1111 * The simplest method of calculating f32 in this case is
1114 * = 2^(-14) * (m16 / 2^10) (21)
1115 * = m16 / 2^24 (22)
1118 /* if (e16 == 0) { */
1119 if_tree(equal(e
, constant(0u)),
1121 /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1122 assign(u32
, expr(ir_unop_bitcast_f2u
,
1123 div(u2f(m
), constant((float)(1 << 24))))),
1125 /* Case 2) f16 is normal.
1130 * 2^(e32 - 127) * (1 + m32 / 2^23) = (31)
1131 * 2^(e16 - 15) * (1 + m16 / 2^10)
1133 * can be decomposed into two
1135 * 2^(e32 - 127) = 2^(e16 - 15) (32)
1136 * 1 + m32 / 2^23 = 1 + m16 / 2^10 (33)
1140 * e32 = e16 + 112 (34)
1141 * m32 = m16 * 2^13 (35)
1144 /* } else if (e16 < 31) { */
1145 if_tree(less(e
, constant(31u << 10u)),
1147 /* u32 = ((e + (112 << 10)) | m) << 13;
1149 assign(u32
, lshift(bit_or(add(e
, constant(112u << 10u)), m
),
1153 /* Case 3) f16 is infinite. */
1154 if_tree(equal(m
, constant(0u)),
1156 assign(u32
, constant(255u << 23u)),
1158 /* Case 4) f16 is NaN. */
1161 assign(u32
, constant(0x7fffffffu
))))));
1165 return deref(u32
).val
;
1169 * \brief Lower an unpackHalf2x16 expression.
1171 * \param uint_rval is unpackHalf2x16's input
1172 * \return unpackHalf2x16's output as a vec2 rvalue
1175 lower_unpack_half_2x16(ir_rvalue
*uint_rval
)
1177 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1179 * mediump vec2 unpackHalf2x16 (highp uint v)
1180 * ------------------------------------------
1181 * Returns a two-component floating-point vector with components
1182 * obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1183 * values, interpreting those values as 16-bit floating-point numbers
1184 * according to the OpenGL ES Specification, and converting them to
1185 * 32-bit floating-point values.
1187 * The first component of the vector is obtained from the
1188 * 16 least-significant bits of v; the second component is obtained
1189 * from the 16 most-significant bits of v.
1191 assert(uint_rval
->type
== glsl_type::uint_type
);
1194 * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16);
1196 ir_variable
*f16
= factory
.make_temp(glsl_type::uvec2_type
,
1197 "tmp_unpack_half_2x16_f16");
1198 factory
.emit(assign(f16
, unpack_uint_to_uvec2(uint_rval
)));
1201 ir_variable
*f32
= factory
.make_temp(glsl_type::uvec2_type
,
1202 "tmp_unpack_half_2x16_f32");
1204 /* Get f16's unshifted exponent bits.
1206 * uvec2 e = f16 & 0x7c00u;
1208 ir_variable
*e
= factory
.make_temp(glsl_type::uvec2_type
,
1209 "tmp_unpack_half_2x16_e");
1210 factory
.emit(assign(e
, bit_and(f16
, constant(0x7c00u
))));
1212 /* Get f16's unshifted mantissa bits.
1214 * uvec2 m = f16 & 0x03ffu;
1216 ir_variable
*m
= factory
.make_temp(glsl_type::uvec2_type
,
1217 "tmp_unpack_half_2x16_m");
1218 factory
.emit(assign(m
, bit_and(f16
, constant(0x03ffu
))));
1220 /* Set f32's exponent and mantissa bits.
1222 * f32.x = unpack_half_1x16_nosign(e.x, m.x);
1223 * f32.y = unpack_half_1x16_nosign(e.y, m.y);
1225 factory
.emit(assign(f32
, unpack_half_1x16_nosign(swizzle_x(e
),
1228 factory
.emit(assign(f32
, unpack_half_1x16_nosign(swizzle_y(e
),
1232 /* Set f32's sign bit.
1234 * f32 |= (f16 & 0x8000u) << 16u;
1236 factory
.emit(assign(f32
, bit_or(f32
,
1241 /* return bitcast_u2f(f32); */
1242 ir_rvalue
*result
= expr(ir_unop_bitcast_u2f
, f32
);
1243 assert(result
->type
== glsl_type::vec2_type
);
1248 * \brief Split unpackHalf2x16 into two operations.
1250 * \param uint_rval is unpackHalf2x16's input
1251 * \return a vec2 rvalue
1253 * Some code generators, such as the i965 fragment shader, require that all
1254 * vector expressions be lowered to a sequence of scalar expressions.
1255 * However, unpackHalf2x16 cannot be scalarized by the same method as
1256 * a true vector operation because the number of components of its input
1257 * and output differ.
1259 * This method scalarizes unpackHalf2x16 by transforming it from a single
1260 * operation having vec2 output to a pair of operations each having float
1261 * output. That is, it transforms
1263 * unpackHalf2x16(UINT_RVAL)
1267 * uint u = UINT_RVAL;
1270 * v.x = unpackHalf2x16_split_x(u);
1271 * v.y = unpackHalf2x16_split_y(u);
1276 split_unpack_half_2x16(ir_rvalue
*uint_rval
)
1278 assert(uint_rval
->type
== glsl_type::uint_type
);
1280 /* uint u = uint_rval; */
1281 ir_variable
*u
= factory
.make_temp(glsl_type::uint_type
,
1282 "tmp_split_unpack_half_2x16_u");
1283 factory
.emit(assign(u
, uint_rval
));
1286 ir_variable
*v
= factory
.make_temp(glsl_type::vec2_type
,
1287 "tmp_split_unpack_half_2x16_v");
1289 /* v.x = unpack_half_2x16_split_x(u); */
1290 factory
.emit(assign(v
, expr(ir_unop_unpack_half_2x16_split_x
, u
),
1293 /* v.y = unpack_half_2x16_split_y(u); */
1294 factory
.emit(assign(v
, expr(ir_unop_unpack_half_2x16_split_y
, u
),
1297 return deref(v
).val
;
1301 } // namespace anonymous
1304 * \brief Lower the builtin packing functions.
1306 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
1309 lower_packing_builtins(exec_list
*instructions
, int op_mask
)
1311 lower_packing_builtins_visitor
v(op_mask
);
1312 visit_list_elements(&v
, instructions
, true);
1313 return v
.get_progress();