2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
25 #include "ir_builder.h"
26 #include "ir_optimization.h"
27 #include "ir_rvalue_visitor.h"
31 using namespace ir_builder
;
34 * A visitor that lowers built-in floating-point pack/unpack expressions
37 class lower_packing_builtins_visitor
: public ir_rvalue_visitor
{
40 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
42 explicit lower_packing_builtins_visitor(int op_mask
)
46 factory
.instructions
= &factory_instructions
;
49 virtual ~lower_packing_builtins_visitor()
51 assert(factory_instructions
.is_empty());
/* Accessor: returns the current value of the visitor's `progress` member. */
54 bool get_progress() { return progress
; }
56 void handle_rvalue(ir_rvalue
**rvalue
)
61 ir_expression
*expr
= (*rvalue
)->as_expression();
65 enum lower_packing_builtins_op lowering_op
=
66 choose_lowering_op(expr
->operation
);
68 if (lowering_op
== LOWER_PACK_UNPACK_NONE
)
71 setup_factory(ralloc_parent(expr
));
73 ir_rvalue
*op0
= expr
->operands
[0];
74 ralloc_steal(factory
.mem_ctx
, op0
);
76 switch (lowering_op
) {
77 case LOWER_PACK_SNORM_2x16
:
78 *rvalue
= lower_pack_snorm_2x16(op0
);
80 case LOWER_PACK_SNORM_4x8
:
81 *rvalue
= lower_pack_snorm_4x8(op0
);
83 case LOWER_PACK_UNORM_2x16
:
84 *rvalue
= lower_pack_unorm_2x16(op0
);
86 case LOWER_PACK_UNORM_4x8
:
87 *rvalue
= lower_pack_unorm_4x8(op0
);
89 case LOWER_PACK_HALF_2x16
:
90 *rvalue
= lower_pack_half_2x16(op0
);
92 case LOWER_UNPACK_SNORM_2x16
:
93 *rvalue
= lower_unpack_snorm_2x16(op0
);
95 case LOWER_UNPACK_SNORM_4x8
:
96 *rvalue
= lower_unpack_snorm_4x8(op0
);
98 case LOWER_UNPACK_UNORM_2x16
:
99 *rvalue
= lower_unpack_unorm_2x16(op0
);
101 case LOWER_UNPACK_UNORM_4x8
:
102 *rvalue
= lower_unpack_unorm_4x8(op0
);
104 case LOWER_UNPACK_HALF_2x16
:
105 *rvalue
= lower_unpack_half_2x16(op0
);
107 case LOWER_PACK_UNPACK_NONE
:
108 case LOWER_PACK_USE_BFI
:
109 case LOWER_PACK_USE_BFE
:
110 assert(!"not reached");
122 exec_list factory_instructions
;
125 * Determine the needed lowering operation by filtering \a expr_op
126 * through \ref op_mask.
128 enum lower_packing_builtins_op
129 choose_lowering_op(ir_expression_operation expr_op
)
131 /* C++ regards int and enum as fundamentally different types.
132 * So, we can't simply return from each case; we must cast the return
138 case ir_unop_pack_snorm_2x16
:
139 result
= op_mask
& LOWER_PACK_SNORM_2x16
;
141 case ir_unop_pack_snorm_4x8
:
142 result
= op_mask
& LOWER_PACK_SNORM_4x8
;
144 case ir_unop_pack_unorm_2x16
:
145 result
= op_mask
& LOWER_PACK_UNORM_2x16
;
147 case ir_unop_pack_unorm_4x8
:
148 result
= op_mask
& LOWER_PACK_UNORM_4x8
;
150 case ir_unop_pack_half_2x16
:
151 result
= op_mask
& LOWER_PACK_HALF_2x16
;
153 case ir_unop_unpack_snorm_2x16
:
154 result
= op_mask
& LOWER_UNPACK_SNORM_2x16
;
156 case ir_unop_unpack_snorm_4x8
:
157 result
= op_mask
& LOWER_UNPACK_SNORM_4x8
;
159 case ir_unop_unpack_unorm_2x16
:
160 result
= op_mask
& LOWER_UNPACK_UNORM_2x16
;
162 case ir_unop_unpack_unorm_4x8
:
163 result
= op_mask
& LOWER_UNPACK_UNORM_4x8
;
165 case ir_unop_unpack_half_2x16
:
166 result
= op_mask
& LOWER_UNPACK_HALF_2x16
;
169 result
= LOWER_PACK_UNPACK_NONE
;
173 return static_cast<enum lower_packing_builtins_op
>(result
);
177 setup_factory(void *mem_ctx
)
179 assert(factory
.mem_ctx
== NULL
);
180 assert(factory
.instructions
->is_empty());
182 factory
.mem_ctx
= mem_ctx
;
188 base_ir
->insert_before(factory
.instructions
);
189 assert(factory
.instructions
->is_empty());
190 factory
.mem_ctx
= NULL
;
193 template <typename T
>
197 return factory
.constant(x
);
201 * \brief Pack two uint16's into a single uint32.
203 * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
204 * where the least significant bits specify the first element of the pair.
208 pack_uvec2_to_uint(ir_rvalue
*uvec2_rval
)
210 assert(uvec2_rval
->type
== glsl_type::uvec2_type
);
212 /* uvec2 u = UVEC2_RVAL; */
213 ir_variable
*u
= factory
.make_temp(glsl_type::uvec2_type
,
214 "tmp_pack_uvec2_to_uint");
215 factory
.emit(assign(u
, uvec2_rval
));
217 if (op_mask
& LOWER_PACK_USE_BFI
) {
218 return bitfield_insert(bit_and(swizzle_x(u
), constant(0xffffu
)),
224 /* return (u.y << 16) | (u.x & 0xffff); */
225 return bit_or(lshift(swizzle_y(u
), constant(16u)),
226 bit_and(swizzle_x(u
), constant(0xffffu
)));
230 * \brief Pack four uint8's into a single uint32.
232 * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
233 * uint32 where the least significant bits specify the first element of the
234 * 4-tuple. Return the uint32.
237 pack_uvec4_to_uint(ir_rvalue
*uvec4_rval
)
239 assert(uvec4_rval
->type
== glsl_type::uvec4_type
);
241 ir_variable
*u
= factory
.make_temp(glsl_type::uvec4_type
,
242 "tmp_pack_uvec4_to_uint");
244 if (op_mask
& LOWER_PACK_USE_BFI
) {
245 /* uvec4 u = UVEC4_RVAL; */
246 factory
.emit(assign(u
, uvec4_rval
));
248 return bitfield_insert(bitfield_insert(
250 bit_and(swizzle_x(u
), constant(0xffu
)),
251 swizzle_y(u
), constant(8u), constant(8u)),
252 swizzle_z(u
), constant(16u), constant(8u)),
253 swizzle_w(u
), constant(24u), constant(8u));
256 /* uvec4 u = UVEC4_RVAL & 0xff */
257 factory
.emit(assign(u
, bit_and(uvec4_rval
, constant(0xffu
))));
259 /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
260 return bit_or(bit_or(lshift(swizzle_w(u
), constant(24u)),
261 lshift(swizzle_z(u
), constant(16u))),
262 bit_or(lshift(swizzle_y(u
), constant(8u)),
267 * \brief Unpack a uint32 into two uint16's.
269 * Interpret the given uint32 as a uint16 pair where the uint32's least
270 * significant bits specify the pair's first element. Return the uint16
274 unpack_uint_to_uvec2(ir_rvalue
*uint_rval
)
276 assert(uint_rval
->type
== glsl_type::uint_type
);
278 /* uint u = UINT_RVAL; */
279 ir_variable
*u
= factory
.make_temp(glsl_type::uint_type
,
280 "tmp_unpack_uint_to_uvec2_u");
281 factory
.emit(assign(u
, uint_rval
));
284 ir_variable
*u2
= factory
.make_temp(glsl_type::uvec2_type
,
285 "tmp_unpack_uint_to_uvec2_u2");
287 /* u2.x = u & 0xffffu; */
288 factory
.emit(assign(u2
, bit_and(u
, constant(0xffffu
)), WRITEMASK_X
));
290 /* u2.y = u >> 16u; */
291 factory
.emit(assign(u2
, rshift(u
, constant(16u)), WRITEMASK_Y
));
293 return deref(u2
).val
;
297 * \brief Unpack a uint32 into two int16's.
299 * Specifically each 16-bit value is sign-extended to the full width of an
303 unpack_uint_to_ivec2(ir_rvalue
*uint_rval
)
305 assert(uint_rval
->type
== glsl_type::uint_type
);
307 if (!(op_mask
& LOWER_PACK_USE_BFE
)) {
308 return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval
)),
313 ir_variable
*i
= factory
.make_temp(glsl_type::int_type
,
314 "tmp_unpack_uint_to_ivec2_i");
315 factory
.emit(assign(i
, u2i(uint_rval
)));
318 ir_variable
*i2
= factory
.make_temp(glsl_type::ivec2_type
,
319 "tmp_unpack_uint_to_ivec2_i2");
321 factory
.emit(assign(i2
, bitfield_extract(i
, constant(0), constant(16)),
323 factory
.emit(assign(i2
, bitfield_extract(i
, constant(16), constant(16)),
326 return deref(i2
).val
;
330 * \brief Unpack a uint32 into four uint8's.
332 * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
333 * significant bits specify the 4-tuple's first element. Return the uint8
334 * 4-tuple as a uvec4.
337 unpack_uint_to_uvec4(ir_rvalue
*uint_rval
)
339 assert(uint_rval
->type
== glsl_type::uint_type
);
341 /* uint u = UINT_RVAL; */
342 ir_variable
*u
= factory
.make_temp(glsl_type::uint_type
,
343 "tmp_unpack_uint_to_uvec4_u");
344 factory
.emit(assign(u
, uint_rval
));
347 ir_variable
*u4
= factory
.make_temp(glsl_type::uvec4_type
,
348 "tmp_unpack_uint_to_uvec4_u4");
350 /* u4.x = u & 0xffu; */
351 factory
.emit(assign(u4
, bit_and(u
, constant(0xffu
)), WRITEMASK_X
));
353 if (op_mask
& LOWER_PACK_USE_BFE
) {
354 /* u4.y = bitfield_extract(u, 8, 8); */
355 factory
.emit(assign(u4
, bitfield_extract(u
, constant(8u), constant(8u)),
358 /* u4.z = bitfield_extract(u, 16, 8); */
359 factory
.emit(assign(u4
, bitfield_extract(u
, constant(16u), constant(8u)),
362 /* u4.y = (u >> 8u) & 0xffu; */
363 factory
.emit(assign(u4
, bit_and(rshift(u
, constant(8u)),
364 constant(0xffu
)), WRITEMASK_Y
));
366 /* u4.z = (u >> 16u) & 0xffu; */
367 factory
.emit(assign(u4
, bit_and(rshift(u
, constant(16u)),
368 constant(0xffu
)), WRITEMASK_Z
));
371 /* u4.w = (u >> 24u) */
372 factory
.emit(assign(u4
, rshift(u
, constant(24u)), WRITEMASK_W
));
374 return deref(u4
).val
;
378 * \brief Unpack a uint32 into four int8's.
380 * Specifically each 8-bit value is sign-extended to the full width of an
384 unpack_uint_to_ivec4(ir_rvalue
*uint_rval
)
386 assert(uint_rval
->type
== glsl_type::uint_type
);
388 if (!(op_mask
& LOWER_PACK_USE_BFE
)) {
389 return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval
)),
394 ir_variable
*i
= factory
.make_temp(glsl_type::int_type
,
395 "tmp_unpack_uint_to_ivec4_i");
396 factory
.emit(assign(i
, u2i(uint_rval
)));
399 ir_variable
*i4
= factory
.make_temp(glsl_type::ivec4_type
,
400 "tmp_unpack_uint_to_ivec4_i4");
402 factory
.emit(assign(i4
, bitfield_extract(i
, constant(0), constant(8)),
404 factory
.emit(assign(i4
, bitfield_extract(i
, constant(8), constant(8)),
406 factory
.emit(assign(i4
, bitfield_extract(i
, constant(16), constant(8)),
408 factory
.emit(assign(i4
, bitfield_extract(i
, constant(24), constant(8)),
411 return deref(i4
).val
;
415 * \brief Lower a packSnorm2x16 expression.
417 * \param vec2_rval is packSnorm2x16's input
418 * \return packSnorm2x16's output as a uint rvalue
421 lower_pack_snorm_2x16(ir_rvalue
*vec2_rval
)
423 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
425 * highp uint packSnorm2x16(vec2 v)
426 * --------------------------------
427 * First, converts each component of the normalized floating-point value
428 * v into 16-bit integer values. Then, the results are packed into the
429 * returned 32-bit unsigned integer.
431 * The conversion for component c of v to fixed point is done as
434 * packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
436 * The first component of the vector will be written to the least
437 * significant bits of the output; the last component will be written to
438 * the most significant bits.
440 * This function generates IR that approximates the following pseudo-GLSL:
442 * return pack_uvec2_to_uint(
444 * round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
446 * It is necessary to first convert the vec2 to ivec2 rather than directly
447 * converting vec2 to uvec2 because the latter conversion is undefined.
448 * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
449 * convert a negative floating point value to an uint".
451 assert(vec2_rval
->type
== glsl_type::vec2_type
);
453 ir_rvalue
*result
= pack_uvec2_to_uint(
454 i2u(f2i(round_even(mul(clamp(vec2_rval
,
457 constant(32767.0f
))))));
459 assert(result
->type
== glsl_type::uint_type
);
464 * \brief Lower a packSnorm4x8 expression.
466 * \param vec4_rval is packSnorm4x8's input
467 * \return packSnorm4x8's output as a uint rvalue
470 lower_pack_snorm_4x8(ir_rvalue
*vec4_rval
)
472 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
474 * highp uint packSnorm4x8(vec4 v)
475 * -------------------------------
476 * First, converts each component of the normalized floating-point value
477 * v into 8-bit integer values. Then, the results are packed into the
478 * returned 32-bit unsigned integer.
480 * The conversion for component c of v to fixed point is done as
483 * packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
485 * The first component of the vector will be written to the least
486 * significant bits of the output; the last component will be written to
487 * the most significant bits.
489 * This function generates IR that approximates the following pseudo-GLSL:
491 * return pack_uvec4_to_uint(
493 * round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
495 * It is necessary to first convert the vec4 to ivec4 rather than directly
496 * converting vec4 to uvec4 because the latter conversion is undefined.
497 * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
498 * convert a negative floating point value to an uint".
500 assert(vec4_rval
->type
== glsl_type::vec4_type
);
502 ir_rvalue
*result
= pack_uvec4_to_uint(
503 i2u(f2i(round_even(mul(clamp(vec4_rval
,
506 constant(127.0f
))))));
508 assert(result
->type
== glsl_type::uint_type
);
513 * \brief Lower an unpackSnorm2x16 expression.
515 * \param uint_rval is unpackSnorm2x16's input
516 * \return unpackSnorm2x16's output as a vec2 rvalue
519 lower_unpack_snorm_2x16(ir_rvalue
*uint_rval
)
521 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
523 * highp vec2 unpackSnorm2x16 (highp uint p)
524 * -----------------------------------------
525 * First, unpacks a single 32-bit unsigned integer p into a pair of
526 * 16-bit unsigned integers. Then, each component is converted to
527 * a normalized floating-point value to generate the returned
528 * two-component vector.
530 * The conversion for unpacked fixed-point value f to floating point is
533 * unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
535 * The first component of the returned vector will be extracted from the
536 * least significant bits of the input; the last component will be
537 * extracted from the most significant bits.
539 * This function generates IR that approximates the following pseudo-GLSL:
542 * ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
545 * The above IR may appear unnecessarily complex, but the intermediate
546 * conversion to ivec2 and the bit shifts are necessary to correctly unpack
549 * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
550 * packSnorm2x16 encodes -1.0 as the int16 0x8001. During unpacking, we
551 * place that int16 into an int32, which results in the *positive* integer
552 * 0x00008001. The int16's sign bit becomes, in the int32, the rather
553 * unimportant bit 16. We must now extend the int16's sign bit into bits
554 * 17-32, which is accomplished by left-shifting then right-shifting.
557 assert(uint_rval
->type
== glsl_type::uint_type
);
560 clamp(div(i2f(unpack_uint_to_ivec2(uint_rval
)),
565 assert(result
->type
== glsl_type::vec2_type
);
570 * \brief Lower an unpackSnorm4x8 expression.
572 * \param uint_rval is unpackSnorm4x8's input
573 * \return unpackSnorm4x8's output as a vec4 rvalue
576 lower_unpack_snorm_4x8(ir_rvalue
*uint_rval
)
578 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
580 * highp vec4 unpackSnorm4x8 (highp uint p)
581 * ----------------------------------------
582 * First, unpacks a single 32-bit unsigned integer p into four
583 * 8-bit unsigned integers. Then, each component is converted to
584 * a normalized floating-point value to generate the returned
585 * four-component vector.
587 * The conversion for unpacked fixed-point value f to floating point is
590 * unpackSnorm4x8: clamp(f / 127.0, -1, +1)
592 * The first component of the returned vector will be extracted from the
593 * least significant bits of the input; the last component will be
594 * extracted from the most significant bits.
596 * This function generates IR that approximates the following pseudo-GLSL:
599 * ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
602 * The above IR may appear unnecessarily complex, but the intermediate
603 * conversion to ivec4 and the bit shifts are necessary to correctly unpack
606 * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
607 * 0.0). packSnorm4x8 encodes -1.0 as the int8 0x81. During unpacking, we
608 * place that int8 into an int32, which results in the *positive* integer
609 * 0x00000081. The int8's sign bit becomes, in the int32, the rather
610 * unimportant bit 8. We must now extend the int8's sign bit into bits
611 * 9-32, which is accomplished by left-shifting then right-shifting.
614 assert(uint_rval
->type
== glsl_type::uint_type
);
617 clamp(div(i2f(unpack_uint_to_ivec4(uint_rval
)),
622 assert(result
->type
== glsl_type::vec4_type
);
627 * \brief Lower a packUnorm2x16 expression.
629 * \param vec2_rval is packUnorm2x16's input
630 * \return packUnorm2x16's output as a uint rvalue
633 lower_pack_unorm_2x16(ir_rvalue
*vec2_rval
)
635 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
637 * highp uint packUnorm2x16 (vec2 v)
638 * ---------------------------------
639 * First, converts each component of the normalized floating-point value
640 * v into 16-bit integer values. Then, the results are packed into the
641 * returned 32-bit unsigned integer.
643 * The conversion for component c of v to fixed point is done as
646 * packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
648 * The first component of the vector will be written to the least
649 * significant bits of the output; the last component will be written to
650 * the most significant bits.
652 * This function generates IR that approximates the following pseudo-GLSL:
654 * return pack_uvec2_to_uint(uvec2(
655 * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
657 * Here it is safe to directly convert the vec2 to uvec2 because the vec2
658 * has been clamped to a non-negative range.
661 assert(vec2_rval
->type
== glsl_type::vec2_type
);
663 ir_rvalue
*result
= pack_uvec2_to_uint(
664 f2u(round_even(mul(saturate(vec2_rval
), constant(65535.0f
)))));
666 assert(result
->type
== glsl_type::uint_type
);
671 * \brief Lower a packUnorm4x8 expression.
673 * \param vec4_rval is packUnorm4x8's input
674 * \return packUnorm4x8's output as a uint rvalue
677 lower_pack_unorm_4x8(ir_rvalue
*vec4_rval
)
679 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
681 * highp uint packUnorm4x8 (vec4 v)
682 * --------------------------------
683 * First, converts each component of the normalized floating-point value
684 * v into 8-bit integer values. Then, the results are packed into the
685 * returned 32-bit unsigned integer.
687 * The conversion for component c of v to fixed point is done as
690 * packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
692 * The first component of the vector will be written to the least
693 * significant bits of the output; the last component will be written to
694 * the most significant bits.
696 * This function generates IR that approximates the following pseudo-GLSL:
698 * return pack_uvec4_to_uint(uvec4(
699 * round(clamp(VEC4_RVALUE, 0.0f, 1.0f) * 255.0f)));
701 * Here it is safe to directly convert the vec4 to uvec4 because the vec4
702 * has been clamped to a non-negative range.
705 assert(vec4_rval
->type
== glsl_type::vec4_type
);
707 ir_rvalue
*result
= pack_uvec4_to_uint(
708 f2u(round_even(mul(saturate(vec4_rval
), constant(255.0f
)))));
710 assert(result
->type
== glsl_type::uint_type
);
715 * \brief Lower an unpackUnorm2x16 expression.
717 * \param uint_rval is unpackUnorm2x16's input
718 * \return unpackUnorm2x16's output as a vec2 rvalue
721 lower_unpack_unorm_2x16(ir_rvalue
*uint_rval
)
723 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
725 * highp vec2 unpackUnorm2x16 (highp uint p)
726 * -----------------------------------------
727 * First, unpacks a single 32-bit unsigned integer p into a pair of
728 * 16-bit unsigned integers. Then, each component is converted to
729 * a normalized floating-point value to generate the returned
730 * two-component vector.
732 * The conversion for unpacked fixed-point value f to floating point is
735 * unpackUnorm2x16: f / 65535.0
737 * The first component of the returned vector will be extracted from the
738 * least significant bits of the input; the last component will be
739 * extracted from the most significant bits.
741 * This function generates IR that approximates the following pseudo-GLSL:
743 * return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
746 assert(uint_rval
->type
== glsl_type::uint_type
);
748 ir_rvalue
*result
= div(u2f(unpack_uint_to_uvec2(uint_rval
)),
751 assert(result
->type
== glsl_type::vec2_type
);
756 * \brief Lower an unpackUnorm4x8 expression.
758 * \param uint_rval is unpackUnorm4x8's input
759 * \return unpackUnorm4x8's output as a vec4 rvalue
762 lower_unpack_unorm_4x8(ir_rvalue
*uint_rval
)
764 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
766 * highp vec4 unpackUnorm4x8 (highp uint p)
767 * ----------------------------------------
768 * First, unpacks a single 32-bit unsigned integer p into four
769 * 8-bit unsigned integers. Then, each component is converted to
770 * a normalized floating-point value to generate the returned
771 * four-component vector.
773 * The conversion for unpacked fixed-point value f to floating point is
776 * unpackUnorm4x8: f / 255.0
778 * The first component of the returned vector will be extracted from the
779 * least significant bits of the input; the last component will be
780 * extracted from the most significant bits.
782 * This function generates IR that approximates the following pseudo-GLSL:
784 * return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
787 assert(uint_rval
->type
== glsl_type::uint_type
);
789 ir_rvalue
*result
= div(u2f(unpack_uint_to_uvec4(uint_rval
)),
792 assert(result
->type
== glsl_type::vec4_type
);
797 * \brief Lower the component-wise calculation of packHalf2x16.
799 * \param f_rval is one component of packHalf2x16's input
800 * \param e_rval is the unshifted exponent bits of f_rval
801 * \param m_rval is the unshifted mantissa bits of f_rval
803 * \return a uint rvalue that encodes a float16 in its lower 16 bits
806 pack_half_1x16_nosign(ir_rvalue
*f_rval
,
810 assert(e_rval
->type
== glsl_type::uint_type
);
811 assert(m_rval
->type
== glsl_type::uint_type
);
814 ir_variable
*u16
= factory
.make_temp(glsl_type::uint_type
,
815 "tmp_pack_half_1x16_u16");
817 /* float f = FLOAT_RVAL; */
818 ir_variable
*f
= factory
.make_temp(glsl_type::float_type
,
819 "tmp_pack_half_1x16_f");
820 factory
.emit(assign(f
, f_rval
));
822 /* uint e = E_RVAL; */
823 ir_variable
*e
= factory
.make_temp(glsl_type::uint_type
,
824 "tmp_pack_half_1x16_e");
825 factory
.emit(assign(e
, e_rval
));
827 /* uint m = M_RVAL; */
828 ir_variable
*m
= factory
.make_temp(glsl_type::uint_type
,
829 "tmp_pack_half_1x16_m");
830 factory
.emit(assign(m
, m_rval
));
835 * For a float16, the bit layout is:
841 * Let f16 be a float16 value. The sign, exponent, and mantissa
842 * determine its value thus:
844 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
845 * if e16 = 0 and m16 != 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
846 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
847 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
848 * if e16 = 31 and m16 != 0, then NaN (5)
850 * where 0 <= m16 < 2^10.
852 * For a float32, the bit layout is:
858 * Let f32 be a float32 value. The sign, exponent, and mantissa
859 * determine its value thus:
861 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
862 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
863 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
864 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
865 * if e32 = 255 and m32 != 0, then NaN (14)
867 * where 0 <= m32 < 2^23.
869 * The minimum and maximum normal float16 values are
871 * min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14) (20)
872 * max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10) (21)
874 * The step at max_norm16 is
876 * max_step16 = 2^5 (22)
878 * Observe that the float16 boundary values in equations 20-21 lie in the
879 * range of normal float32 values.
884 * Not all float32 values can be exactly represented as a float16. We
885 * round all such intermediate float32 values to the nearest float16; if
886 * the float32 is exactly between two float16 values, we round to the one
887 * with an even mantissa. This rounding behavior has several benefits:
889 * - It has no sign bias.
891 * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
894 * - By reproducing the behavior of the GPU (at least on Intel hardware),
895 * compile-time evaluation of constant packHalf2x16 GLSL expressions will
896 * result in the same value as if the expression were executed on the
901 * Our task is to compute s16, e16, m16 given f32. Since this function
902 * ignores the sign bit, assume that s32 = s16 = 0. There are several
908 /* Case 1) f32 is NaN
910 * The resultant f16 will also be NaN.
913 /* if (e32 == 255 && m32 != 0) { */
914 if_tree(logic_and(equal(e
, constant(0xffu
<< 23u)),
915 logic_not(equal(m
, constant(0u)))),
917 assign(u16
, constant(0x7fffu
)),
919 /* Case 2) f32 lies in the range [0, min_norm16).
921 * The resultant float16 will be either zero, subnormal, or normal.
925 * f32 = min_norm16 (30)
929 * e32 = 113 and m32 = 0 (31)
931 * Therefore this case occurs if and only if
936 /* } else if (e32 < 113) { */
937 if_tree(less(e
, constant(113u << 23u)),
939 /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
940 assign(u16
, f2u(round_even(mul(expr(ir_unop_abs
, f
),
941 constant((float) (1 << 24)))))),
943 /* Case 3) f32 lies in the range
944 * [min_norm16, max_norm16 + max_step16).
946 * The resultant float16 will be either normal or infinite.
950 * f32 = max_norm16 + max_step16 (40)
951 * = 2^15 * (1 + 1023 / 2^10) + 2^5 (41)
955 * e32 = 143 and m32 = 0 (43)
957 * We already solved the boundary condition f32 = min_norm16 above
958 * in equation 31. Therefore this case occurs if and only if
960 * 113 <= e32 and e32 < 143
963 /* } else if (e32 < 143) { */
964 if_tree(less(e
, constant(143u << 23u)),
966 /* The addition below handles the case where the mantissa rounds
967 * up to 1024 and bumps the exponent.
969 * u16 = ((e - (112u << 23u)) >> 13u)
970 * + round_to_even(float(m) / (1u << 13u));
972 assign(u16
, add(rshift(sub(e
, constant(112u << 23u)),
975 div(u2f(m
), constant((float) (1 << 13))))))),
977 /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
979 * The resultant float16 will be infinite.
981 * The cases above caught all float32 values in the range
982 * [0, max_norm16 + max_step16), so this is the fall-through case.
987 assign(u16
, constant(31u << 10u))))));
991 return deref(u16
).val
;
995 * \brief Lower a packHalf2x16 expression.
997 * \param vec2_rval is packHalf2x16's input
998 * \return packHalf2x16's output as a uint rvalue
1001 lower_pack_half_2x16(ir_rvalue
*vec2_rval
)
1003 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1005 * highp uint packHalf2x16 (mediump vec2 v)
1006 * ----------------------------------------
1007 * Returns an unsigned integer obtained by converting the components of
1008 * a two-component floating-point vector to the 16-bit floating-point
1009 * representation found in the OpenGL ES Specification, and then packing
1010 * these two 16-bit integers into a 32-bit unsigned integer.
1012 * The first vector component specifies the 16 least-significant bits
1013 * of the result; the second component specifies the 16 most-significant
1017 assert(vec2_rval
->type
== glsl_type::vec2_type
);
1019 /* vec2 f = VEC2_RVAL; */
1020 ir_variable
*f
= factory
.make_temp(glsl_type::vec2_type
,
1021 "tmp_pack_half_2x16_f");
1022 factory
.emit(assign(f
, vec2_rval
));
1024 /* uvec2 f32 = bitcast_f2u(f); */
1025 ir_variable
*f32
= factory
.make_temp(glsl_type::uvec2_type
,
1026 "tmp_pack_half_2x16_f32");
1027 factory
.emit(assign(f32
, expr(ir_unop_bitcast_f2u
, f
)));
1030 ir_variable
*f16
= factory
.make_temp(glsl_type::uvec2_type
,
1031 "tmp_pack_half_2x16_f16");
1033 /* Get f32's unshifted exponent bits.
1035 * uvec2 e = f32 & 0x7f800000u;
1037 ir_variable
*e
= factory
.make_temp(glsl_type::uvec2_type
,
1038 "tmp_pack_half_2x16_e");
1039 factory
.emit(assign(e
, bit_and(f32
, constant(0x7f800000u
))));
1041 /* Get f32's unshifted mantissa bits.
1043 * uvec2 m = f32 & 0x007fffffu;
1045 ir_variable
*m
= factory
.make_temp(glsl_type::uvec2_type
,
1046 "tmp_pack_half_2x16_m");
1047 factory
.emit(assign(m
, bit_and(f32
, constant(0x007fffffu
))));
1049 /* Set f16's exponent and mantissa bits.
1051 * f16.x = pack_half_1x16_nosign(f.x, e.x, m.x);
1052 * f16.y = pack_half_1x16_nosign(f.y, e.y, m.y);
1054 factory
.emit(assign(f16
, pack_half_1x16_nosign(swizzle_x(f
),
1058 factory
.emit(assign(f16
, pack_half_1x16_nosign(swizzle_y(f
),
1063 /* Set f16's sign bits.
1065 * f16 |= (f32 & (1u << 31u)) >> 16u;
1068 assign(f16
, bit_or(f16
,
1069 rshift(bit_and(f32
, constant(1u << 31u)),
1073 /* return (f16.y << 16u) | f16.x; */
1074 ir_rvalue
*result
= bit_or(lshift(swizzle_y(f16
),
1078 assert(result
->type
== glsl_type::uint_type
);
1083 * \brief Lower the component-wise calculation of unpackHalf2x16.
1085 * Given a uint that encodes a float16 in its lower 16 bits, this function
1086 * returns a uint that encodes a float32 with the same value. The sign bit
1087 * of the float16 is ignored.
1089 * \param e_rval is the unshifted exponent bits of a float16
1090 * \param m_rval is the unshifted mantissa bits of a float16
1091 * \return a uint rvalue that encodes a float32
1094 unpack_half_1x16_nosign(ir_rvalue
*e_rval
, ir_rvalue
*m_rval
)
1096 assert(e_rval
->type
== glsl_type::uint_type
);
1097 assert(m_rval
->type
== glsl_type::uint_type
);
1100 ir_variable
*u32
= factory
.make_temp(glsl_type::uint_type
,
1101 "tmp_unpack_half_1x16_u32");
1103 /* uint e = E_RVAL; */
1104 ir_variable
*e
= factory
.make_temp(glsl_type::uint_type
,
1105 "tmp_unpack_half_1x16_e");
1106 factory
.emit(assign(e
, e_rval
));
1108 /* uint m = M_RVAL; */
1109 ir_variable
*m
= factory
.make_temp(glsl_type::uint_type
,
1110 "tmp_unpack_half_1x16_m");
1111 factory
.emit(assign(m
, m_rval
));
1116 * For a float16, the bit layout is:
1122 * Let f16 be a float16 value. The sign, exponent, and mantissa
1123 * determine its value thus:
1125 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
1126 * if e16 = 0 and m16 != 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
1127 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1128 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
1129 * if e16 = 31 and m16 != 0, then NaN (5)
1131 * where 0 <= m16 < 2^10.
1133 * For a float32, the bit layout is:
1139 * Let f32 be a float32 value. The sign, exponent, and mantissa
1140 * determine its value thus:
1142 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
1143 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
1144 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1145 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
1146 * if e32 = 255 and m32 != 0, then NaN (14)
1148 * where 0 <= m32 < 2^23.
1152 * Our task is to compute s32, e32, m32 given f16. Since this function
1153 * ignores the sign bit, assume that s32 = s16 = 0. There are several
1159 /* Case 1) f16 is zero or subnormal.
1161 * The simplest method of calculating f32 in this case is
1164 * = 2^(-14) * (m16 / 2^10) (21)
1165 * = m16 / 2^24 (22)
1168 /* if (e16 == 0) { */
1169 if_tree(equal(e
, constant(0u)),
1171 /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1172 assign(u32
, expr(ir_unop_bitcast_f2u
,
1173 div(u2f(m
), constant((float)(1 << 24))))),
1175 /* Case 2) f16 is normal.
1180 * 2^(e32 - 127) * (1 + m32 / 2^23) = (31)
1181 * 2^(e16 - 15) * (1 + m16 / 2^10)
1183 * can be decomposed into two
1185 * 2^(e32 - 127) = 2^(e16 - 15) (32)
1186 * 1 + m32 / 2^23 = 1 + m16 / 2^10 (33)
1190 * e32 = e16 + 112 (34)
1191 * m32 = m16 * 2^13 (35)
1194 /* } else if (e16 < 31) { */
1195 if_tree(less(e
, constant(31u << 10u)),
1197 /* u32 = ((e + (112 << 10)) | m) << 13;
1199 assign(u32
, lshift(bit_or(add(e
, constant(112u << 10u)), m
),
1203 /* Case 3) f16 is infinite. */
1204 if_tree(equal(m
, constant(0u)),
1206 assign(u32
, constant(255u << 23u)),
1208 /* Case 4) f16 is NaN. */
1211 assign(u32
, constant(0x7fffffffu
))))));
1215 return deref(u32
).val
;
1219 * \brief Lower an unpackHalf2x16 expression.
1221 * \param uint_rval is unpackHalf2x16's input
1222 * \return unpackHalf2x16's output as a vec2 rvalue
1225 lower_unpack_half_2x16(ir_rvalue
*uint_rval
)
1227 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1229 * mediump vec2 unpackHalf2x16 (highp uint v)
1230 * ------------------------------------------
1231 * Returns a two-component floating-point vector with components
1232 * obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1233 * values, interpreting those values as 16-bit floating-point numbers
1234 * according to the OpenGL ES Specification, and converting them to
1235 * 32-bit floating-point values.
1237 * The first component of the vector is obtained from the
1238 * 16 least-significant bits of v; the second component is obtained
1239 * from the 16 most-significant bits of v.
1241 assert(uint_rval
->type
== glsl_type::uint_type
);
1244 * uvec2 f16 = uvec2(u & 0xffff, u >> 16);
1246 ir_variable
*f16
= factory
.make_temp(glsl_type::uvec2_type
,
1247 "tmp_unpack_half_2x16_f16");
1248 factory
.emit(assign(f16
, unpack_uint_to_uvec2(uint_rval
)));
1251 ir_variable
*f32
= factory
.make_temp(glsl_type::uvec2_type
,
1252 "tmp_unpack_half_2x16_f32");
1254 /* Get f16's unshifted exponent bits.
1256 * uvec2 e = f16 & 0x7c00u;
1258 ir_variable
*e
= factory
.make_temp(glsl_type::uvec2_type
,
1259 "tmp_unpack_half_2x16_e");
1260 factory
.emit(assign(e
, bit_and(f16
, constant(0x7c00u
))));
1262 /* Get f16's unshifted mantissa bits.
1264 * uvec2 m = f16 & 0x03ffu;
1266 ir_variable
*m
= factory
.make_temp(glsl_type::uvec2_type
,
1267 "tmp_unpack_half_2x16_m");
1268 factory
.emit(assign(m
, bit_and(f16
, constant(0x03ffu
))));
1270 /* Set f32's exponent and mantissa bits.
1272 * f32.x = unpack_half_1x16_nosign(e.x, m.x);
1273 * f32.y = unpack_half_1x16_nosign(e.y, m.y);
1275 factory
.emit(assign(f32
, unpack_half_1x16_nosign(swizzle_x(e
),
1278 factory
.emit(assign(f32
, unpack_half_1x16_nosign(swizzle_y(e
),
1282 /* Set f32's sign bit.
1284 * f32 |= (f16 & 0x8000u) << 16u;
1286 factory
.emit(assign(f32
, bit_or(f32
,
1291 /* return bitcast_u2f(f32); */
1292 ir_rvalue
*result
= expr(ir_unop_bitcast_u2f
, f32
);
1293 assert(result
->type
== glsl_type::vec2_type
);
1298 } // namespace anonymous
1301 * \brief Lower the builtin packing functions.
1303 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
1306 lower_packing_builtins(exec_list
*instructions
, int op_mask
)
1308 lower_packing_builtins_visitor
v(op_mask
);
1309 visit_list_elements(&v
, instructions
, true);
1310 return v
.get_progress();