2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
25 #include "ir_builder.h"
26 #include "ir_optimization.h"
27 #include "ir_rvalue_visitor.h"
31 using namespace ir_builder
;
34 * A visitor that lowers built-in floating-point pack/unpack expressions
37 class lower_packing_builtins_visitor
: public ir_rvalue_visitor
{
40 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
/* Construct a visitor for the lowerings selected by the op_mask bitmask. */
42 explicit lower_packing_builtins_visitor(int op_mask
)
46 /* Mutually exclusive options. */
/* For packHalf2x16 and for unpackHalf2x16 alike, the plain lowering bit and
 * the corresponding _TO_SPLIT bit must not both be set in op_mask.
 */
47 assert(!((op_mask
& LOWER_PACK_HALF_2x16
) &&
48 (op_mask
& LOWER_PACK_HALF_2x16_TO_SPLIT
)));
50 assert(!((op_mask
& LOWER_UNPACK_HALF_2x16
) &&
51 (op_mask
& LOWER_UNPACK_HALF_2x16_TO_SPLIT
)));
/* Point the IR builder at this visitor's own instruction list. */
53 factory
.instructions
= &factory_instructions
;
/* Destructor: asserts that factory_instructions holds no pending
 * instructions when the visitor is destroyed.
 */
56 virtual ~lower_packing_builtins_visitor()
58 assert(factory_instructions
.is_empty());
/* Accessor for the `progress` member.
 * NOTE(review): presumably true once at least one expression has been
 * lowered; the place where it is set is not visible here -- confirm.
 */
61 bool get_progress() { return progress
; }
/* Rvalue hook: when *rvalue is an expression whose operation has a lowering
 * selected by choose_lowering_op(), overwrite *rvalue with the rvalue built
 * by the matching lower_* / split_* helper.
 *
 * \param rvalue  in/out pointer to the rvalue being visited
 */
63 void handle_rvalue(ir_rvalue
**rvalue
)
68 ir_expression
*expr
= (*rvalue
)->as_expression();
/* Ask which lowering, if any, applies to this expression's opcode. */
72 enum lower_packing_builtins_op lowering_op
=
73 choose_lowering_op(expr
->operation
);
/* NOTE(review): the statement guarded by this test is not visible here;
 * presumably the method bails out when no lowering applies -- confirm.
 */
75 if (lowering_op
== LOWER_PACK_UNPACK_NONE
)
/* Use the expression's ralloc parent as the factory's memory context. */
78 setup_factory(ralloc_parent(expr
));
/* The single operand is handed to the factory's memory context with
 * ralloc_steal() and then passed to whichever helper is selected below.
 */
80 ir_rvalue
*op0
= expr
->operands
[0];
81 ralloc_steal(factory
.mem_ctx
, op0
);
/* Dispatch on the chosen lowering; each helper's result replaces *rvalue. */
83 switch (lowering_op
) {
84 case LOWER_PACK_SNORM_2x16
:
85 *rvalue
= lower_pack_snorm_2x16(op0
);
87 case LOWER_PACK_SNORM_4x8
:
88 *rvalue
= lower_pack_snorm_4x8(op0
);
90 case LOWER_PACK_UNORM_2x16
:
91 *rvalue
= lower_pack_unorm_2x16(op0
);
93 case LOWER_PACK_UNORM_4x8
:
94 *rvalue
= lower_pack_unorm_4x8(op0
);
96 case LOWER_PACK_HALF_2x16
:
97 *rvalue
= lower_pack_half_2x16(op0
);
99 case LOWER_PACK_HALF_2x16_TO_SPLIT
:
100 *rvalue
= split_pack_half_2x16(op0
);
102 case LOWER_UNPACK_SNORM_2x16
:
103 *rvalue
= lower_unpack_snorm_2x16(op0
);
105 case LOWER_UNPACK_SNORM_4x8
:
106 *rvalue
= lower_unpack_snorm_4x8(op0
);
108 case LOWER_UNPACK_UNORM_2x16
:
109 *rvalue
= lower_unpack_unorm_2x16(op0
);
111 case LOWER_UNPACK_UNORM_4x8
:
112 *rvalue
= lower_unpack_unorm_4x8(op0
);
114 case LOWER_UNPACK_HALF_2x16
:
115 *rvalue
= lower_unpack_half_2x16(op0
);
117 case LOWER_UNPACK_HALF_2x16_TO_SPLIT
:
118 *rvalue
= split_unpack_half_2x16(op0
);
/* NONE was already tested above, and USE_BFI / USE_BFE are never expected
 * as the selected lowering here, so reaching these labels is a bug.
 */
120 case LOWER_PACK_UNPACK_NONE
:
121 case LOWER_PACK_USE_BFI
:
122 case LOWER_PACK_USE_BFE
:
123 assert(!"not reached");
135 exec_list factory_instructions
;
138 * Determine the needed lowering operation by filtering \a expr_op
139 * through \ref op_mask.
/* Map an expression opcode to the lowering selected for it in op_mask.
 *
 * Each pack/unpack opcode is masked against its own LOWER_* bit; the two
 * half-float opcodes are masked against both the plain bit and the
 * _TO_SPLIT bit. The intermediate value is held in `result` and converted
 * back to the enum type with static_cast on return.
 *
 * NOTE(review): the switch header and the label guarding the final
 * `result = LOWER_PACK_UNPACK_NONE` assignment are not visible here;
 * presumably that assignment is the default case -- confirm.
 *
 * \param expr_op  opcode of the expression being visited
 * \return the selected lowering as an enum lower_packing_builtins_op
 */
141 enum lower_packing_builtins_op
142 choose_lowering_op(ir_expression_operation expr_op
)
144 /* C++ regards int and enum as fundamentally different types.
145 * So, we can't simply return from each case; we must cast the return
151 case ir_unop_pack_snorm_2x16
:
152 result
= op_mask
& LOWER_PACK_SNORM_2x16
;
154 case ir_unop_pack_snorm_4x8
:
155 result
= op_mask
& LOWER_PACK_SNORM_4x8
;
157 case ir_unop_pack_unorm_2x16
:
158 result
= op_mask
& LOWER_PACK_UNORM_2x16
;
160 case ir_unop_pack_unorm_4x8
:
161 result
= op_mask
& LOWER_PACK_UNORM_4x8
;
163 case ir_unop_pack_half_2x16
:
164 result
= op_mask
& (LOWER_PACK_HALF_2x16
| LOWER_PACK_HALF_2x16_TO_SPLIT
);
166 case ir_unop_unpack_snorm_2x16
:
167 result
= op_mask
& LOWER_UNPACK_SNORM_2x16
;
169 case ir_unop_unpack_snorm_4x8
:
170 result
= op_mask
& LOWER_UNPACK_SNORM_4x8
;
172 case ir_unop_unpack_unorm_2x16
:
173 result
= op_mask
& LOWER_UNPACK_UNORM_2x16
;
175 case ir_unop_unpack_unorm_4x8
:
176 result
= op_mask
& LOWER_UNPACK_UNORM_4x8
;
178 case ir_unop_unpack_half_2x16
:
179 result
= op_mask
& (LOWER_UNPACK_HALF_2x16
| LOWER_UNPACK_HALF_2x16_TO_SPLIT
);
182 result
= LOWER_PACK_UNPACK_NONE
;
186 return static_cast<enum lower_packing_builtins_op
>(result
);
/* Prepare the IR builder for one lowering.
 *
 * Requires that the factory currently has no memory context and no pending
 * instructions, then records mem_ctx as the factory's memory context.
 *
 * \param mem_ctx  memory context to install in the factory
 */
190 setup_factory(void *mem_ctx
)
192 assert(factory
.mem_ctx
== NULL
);
193 assert(factory
.instructions
->is_empty());
195 factory
.mem_ctx
= mem_ctx
;
201 base_ir
->insert_before(factory
.instructions
);
202 assert(factory
.instructions
->is_empty());
203 factory
.mem_ctx
= NULL
;
206 template <typename T
>
210 return factory
.constant(x
);
214 * \brief Pack two uint16's into a single uint32.
216 * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
217 * where the least significant bits specify the first element of the pair.
/* Emit IR that copies the uvec2 operand into a temporary, and return an
 * rvalue that combines the temporary's two components into one uint.
 *
 * \param uvec2_rval  operand; asserted to have type uvec2
 */
221 pack_uvec2_to_uint(ir_rvalue
*uvec2_rval
)
223 assert(uvec2_rval
->type
== glsl_type::uvec2_type
);
225 /* uvec2 u = UVEC2_RVAL; */
226 ir_variable
*u
= factory
.make_temp(glsl_type::uvec2_type
,
227 "tmp_pack_uvec2_to_uint");
228 factory
.emit(assign(u
, uvec2_rval
));
/* With LOWER_PACK_USE_BFI set, the result is built with bitfield_insert,
 * starting from u.x masked to its low 16 bits.
 * NOTE(review): the remaining bitfield_insert operands are not visible
 * here -- confirm against the complete file.
 */
230 if (op_mask
& LOWER_PACK_USE_BFI
) {
231 return bitfield_insert(bit_and(swizzle_x(u
), constant(0xffffu
)),
/* Otherwise shift u.y into the upper half and OR in the masked u.x. */
237 /* return (u.y << 16) | (u.x & 0xffff); */
238 return bit_or(lshift(swizzle_y(u
), constant(16u)),
239 bit_and(swizzle_x(u
), constant(0xffffu
)));
243 * \brief Pack four uint8's into a single uint32.
245 * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
246 * uint32 where the least significant bits specify the first element of the
247 * 4-tuple. Return the uint32.
/* Emit IR that stores the uvec4 operand in a temporary, and return an
 * rvalue that combines the temporary's four components into one uint.
 *
 * \param uvec4_rval  operand; asserted to have type uvec4
 */
250 pack_uvec4_to_uint(ir_rvalue
*uvec4_rval
)
252 assert(uvec4_rval
->type
== glsl_type::uvec4_type
);
254 ir_variable
*u
= factory
.make_temp(glsl_type::uvec4_type
,
255 "tmp_pack_uvec4_to_uint");
/* BFI path: the operand is copied unmasked; only u.x is masked to 8 bits,
 * and u.y, u.z, u.w are inserted as 8-bit fields at offsets 8, 16 and 24.
 */
257 if (op_mask
& LOWER_PACK_USE_BFI
) {
258 /* uvec4 u = UVEC4_RVAL; */
259 factory
.emit(assign(u
, uvec4_rval
));
261 return bitfield_insert(bitfield_insert(
263 bit_and(swizzle_x(u
), constant(0xffu
)),
264 swizzle_y(u
), constant(8), constant(8)),
265 swizzle_z(u
), constant(16), constant(8)),
266 swizzle_w(u
), constant(24), constant(8));
/* Non-BFI path: every component is masked to 8 bits while being copied,
 * so the shifted components can simply be ORed together afterwards.
 */
269 /* uvec4 u = UVEC4_RVAL & 0xff */
270 factory
.emit(assign(u
, bit_and(uvec4_rval
, constant(0xffu
))));
272 /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
273 return bit_or(bit_or(lshift(swizzle_w(u
), constant(24u)),
274 lshift(swizzle_z(u
), constant(16u))),
275 bit_or(lshift(swizzle_y(u
), constant(8u)),
280 * \brief Unpack a uint32 into two uint16's.
282 * Interpret the given uint32 as a uint16 pair where the uint32's least
283 * significant bits specify the pair's first element. Return the uint16
/* Emit IR that splits a uint into a uvec2 temporary and return a
 * dereference of that temporary.
 *
 * \param uint_rval  operand; asserted to have type uint
 */
287 unpack_uint_to_uvec2(ir_rvalue
*uint_rval
)
289 assert(uint_rval
->type
== glsl_type::uint_type
);
291 /* uint u = UINT_RVAL; */
292 ir_variable
*u
= factory
.make_temp(glsl_type::uint_type
,
293 "tmp_unpack_uint_to_uvec2_u");
294 factory
.emit(assign(u
, uint_rval
));
/* Destination temporary; its components are written one at a time below
 * using write masks.
 */
297 ir_variable
*u2
= factory
.make_temp(glsl_type::uvec2_type
,
298 "tmp_unpack_uint_to_uvec2_u2");
300 /* u2.x = u & 0xffffu; */
301 factory
.emit(assign(u2
, bit_and(u
, constant(0xffffu
)), WRITEMASK_X
));
303 /* u2.y = u >> 16u; */
304 factory
.emit(assign(u2
, rshift(u
, constant(16u)), WRITEMASK_Y
));
306 return deref(u2
).val
;
310 * \brief Unpack a uint32 into two int16's.
312 * Specifically each 16-bit value is sign-extended to the full width of an
/* Emit IR that splits a uint into two signed 16-bit fields held in an
 * ivec2, choosing the strategy from LOWER_PACK_USE_BFE in op_mask.
 *
 * \param uint_rval  operand; asserted to have type uint
 */
316 unpack_uint_to_ivec2(ir_rvalue
*uint_rval
)
318 assert(uint_rval
->type
== glsl_type::uint_type
);
/* Without BFE: unpack as unsigned via unpack_uint_to_uvec2(), convert to
 * int, then left-shift and right-shift.
 * NOTE(review): the shift amounts are not visible here -- confirm.
 */
320 if (!(op_mask
& LOWER_PACK_USE_BFE
)) {
321 return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval
)),
/* With BFE: convert the operand to int once, then extract each field. */
326 ir_variable
*i
= factory
.make_temp(glsl_type::int_type
,
327 "tmp_unpack_uint_to_ivec2_i");
328 factory
.emit(assign(i
, u2i(uint_rval
)));
331 ir_variable
*i2
= factory
.make_temp(glsl_type::ivec2_type
,
332 "tmp_unpack_uint_to_ivec2_i2");
/* 16-bit fields at offsets 0 and 16.
 * NOTE(review): the write-mask arguments of these assignments are not
 * visible here -- presumably X and Y respectively; confirm.
 */
334 factory
.emit(assign(i2
, bitfield_extract(i
, constant(0), constant(16)),
336 factory
.emit(assign(i2
, bitfield_extract(i
, constant(16), constant(16)),
339 return deref(i2
).val
;
343 * \brief Unpack a uint32 into four uint8's.
345 * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
346 * significant bits specify the 4-tuple's first element. Return the uint8
347 * 4-tuple as a uvec4.
/* Emit IR that splits a uint into four 8-bit fields held in a uvec4
 * temporary and return a dereference of that temporary.
 *
 * \param uint_rval  operand; asserted to have type uint
 */
350 unpack_uint_to_uvec4(ir_rvalue
*uint_rval
)
352 assert(uint_rval
->type
== glsl_type::uint_type
);
354 /* uint u = UINT_RVAL; */
355 ir_variable
*u
= factory
.make_temp(glsl_type::uint_type
,
356 "tmp_unpack_uint_to_uvec4_u");
357 factory
.emit(assign(u
, uint_rval
));
/* Destination temporary, filled one component at a time below. */
360 ir_variable
*u4
= factory
.make_temp(glsl_type::uvec4_type
,
361 "tmp_unpack_uint_to_uvec4_u4");
363 /* u4.x = u & 0xffu; */
364 factory
.emit(assign(u4
, bit_and(u
, constant(0xffu
)), WRITEMASK_X
));
/* The two middle bytes use bitfield_extract when LOWER_PACK_USE_BFE is
 * set.
 * NOTE(review): the write-mask arguments of the two BFE assignments are
 * not visible here -- presumably Y and Z; confirm.
 */
366 if (op_mask
& LOWER_PACK_USE_BFE
) {
367 /* u4.y = bitfield_extract(u, 8, 8); */
368 factory
.emit(assign(u4
, bitfield_extract(u
, constant(8), constant(8)),
371 /* u4.z = bitfield_extract(u, 16, 8); */
372 factory
.emit(assign(u4
, bitfield_extract(u
, constant(16), constant(8)),
/* Shift-and-mask form of the same two middle bytes. */
375 /* u4.y = (u >> 8u) & 0xffu; */
376 factory
.emit(assign(u4
, bit_and(rshift(u
, constant(8u)),
377 constant(0xffu
)), WRITEMASK_Y
));
379 /* u4.z = (u >> 16u) & 0xffu; */
380 factory
.emit(assign(u4
, bit_and(rshift(u
, constant(16u)),
381 constant(0xffu
)), WRITEMASK_Z
));
/* The top byte is taken with a shift alone; no mask is applied. */
384 /* u4.w = (u >> 24u) */
385 factory
.emit(assign(u4
, rshift(u
, constant(24u)), WRITEMASK_W
));
387 return deref(u4
).val
;
391 * \brief Unpack a uint32 into four int8's.
393 * Specifically each 8-bit value is sign-extended to the full width of an
/* Emit IR that splits a uint into four signed 8-bit fields held in an
 * ivec4, choosing the strategy from LOWER_PACK_USE_BFE in op_mask.
 *
 * \param uint_rval  operand; asserted to have type uint
 */
397 unpack_uint_to_ivec4(ir_rvalue
*uint_rval
)
399 assert(uint_rval
->type
== glsl_type::uint_type
);
/* Without BFE: unpack as unsigned via unpack_uint_to_uvec4(), convert to
 * int, then left-shift and right-shift.
 * NOTE(review): the shift amounts are not visible here -- confirm.
 */
401 if (!(op_mask
& LOWER_PACK_USE_BFE
)) {
402 return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval
)),
/* With BFE: convert the operand to int once, then extract each field. */
407 ir_variable
*i
= factory
.make_temp(glsl_type::int_type
,
408 "tmp_unpack_uint_to_ivec4_i");
409 factory
.emit(assign(i
, u2i(uint_rval
)));
412 ir_variable
*i4
= factory
.make_temp(glsl_type::ivec4_type
,
413 "tmp_unpack_uint_to_ivec4_i4");
/* 8-bit fields at offsets 0, 8, 16 and 24.
 * NOTE(review): the write-mask arguments of these assignments are not
 * visible here -- presumably X, Y, Z and W in order; confirm.
 */
415 factory
.emit(assign(i4
, bitfield_extract(i
, constant(0), constant(8)),
417 factory
.emit(assign(i4
, bitfield_extract(i
, constant(8), constant(8)),
419 factory
.emit(assign(i4
, bitfield_extract(i
, constant(16), constant(8)),
421 factory
.emit(assign(i4
, bitfield_extract(i
, constant(24), constant(8)),
424 return deref(i4
).val
;
428 * \brief Lower a packSnorm2x16 expression.
430 * \param vec2_rval is packSnorm2x16's input
431 * \return packSnorm2x16's output as a uint rvalue
434 lower_pack_snorm_2x16(ir_rvalue
*vec2_rval
)
436 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
438 * highp uint packSnorm2x16(vec2 v)
439 * --------------------------------
440 * First, converts each component of the normalized floating-point value
441 * v into 16-bit integer values. Then, the results are packed into the
442 * returned 32-bit unsigned integer.
444 * The conversion for component c of v to fixed point is done as
447 * packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
449 * The first component of the vector will be written to the least
450 * significant bits of the output; the last component will be written to
451 * the most significant bits.
453 * This function generates IR that approximates the following pseudo-GLSL:
455 * return pack_uvec2_to_uint(
457 * round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
459 * It is necessary to first convert the vec2 to ivec2 rather than directly
460 * converting vec2 to uvec2 because the latter conversion is undefined.
461 * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
462 * convert a negative floating point value to an uint".
464 assert(vec2_rval
->type
== glsl_type::vec2_type
);
466 ir_rvalue
*result
= pack_uvec2_to_uint(
467 i2u(f2i(round_even(mul(clamp(vec2_rval
,
470 constant(32767.0f
))))));
472 assert(result
->type
== glsl_type::uint_type
);
477 * \brief Lower a packSnorm4x8 expression.
479 * \param vec4_rval is packSnorm4x8's input
480 * \return packSnorm4x8's output as a uint rvalue
483 lower_pack_snorm_4x8(ir_rvalue
*vec4_rval
)
485 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
487 * highp uint packSnorm4x8(vec4 v)
488 * -------------------------------
489 * First, converts each component of the normalized floating-point value
490 * v into 8-bit integer values. Then, the results are packed into the
491 * returned 32-bit unsigned integer.
493 * The conversion for component c of v to fixed point is done as
496 * packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
498 * The first component of the vector will be written to the least
499 * significant bits of the output; the last component will be written to
500 * the most significant bits.
502 * This function generates IR that approximates the following pseudo-GLSL:
504 * return pack_uvec4_to_uint(
506 * round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
508 * It is necessary to first convert the vec4 to ivec4 rather than directly
509 * converting vec4 to uvec4 because the latter conversion is undefined.
510 * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
511 * convert a negative floating point value to an uint".
513 assert(vec4_rval
->type
== glsl_type::vec4_type
);
515 ir_rvalue
*result
= pack_uvec4_to_uint(
516 i2u(f2i(round_even(mul(clamp(vec4_rval
,
519 constant(127.0f
))))));
521 assert(result
->type
== glsl_type::uint_type
);
526 * \brief Lower an unpackSnorm2x16 expression.
528 * \param uint_rval is unpackSnorm2x16's input
529 * \return unpackSnorm2x16's output as a vec2 rvalue
532 lower_unpack_snorm_2x16(ir_rvalue
*uint_rval
)
534 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
536 * highp vec2 unpackSnorm2x16 (highp uint p)
537 * -----------------------------------------
538 * First, unpacks a single 32-bit unsigned integer p into a pair of
539 * 16-bit unsigned integers. Then, each component is converted to
540 * a normalized floating-point value to generate the returned
541 * two-component vector.
543 * The conversion for unpacked fixed-point value f to floating point is
546 * unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
548 * The first component of the returned vector will be extracted from the
549 * least significant bits of the input; the last component will be
550 * extracted from the most significant bits.
552 * This function generates IR that approximates the following pseudo-GLSL:
555 * ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
558 * The above IR may appear unnecessarily complex, but the intermediate
559 * conversion to ivec2 and the bit shifts are necessary to correctly unpack
562 * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
563 * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
564 * place that int16 into an int32, which results in the *positive* integer
565 * 0x0000ffff. The int16's sign bit becomes, in the int32, the rather
566 * unimportant bit 16. We must now extend the int16's sign bit into bits
567 * 17-32, which is accomplished by left-shifting then right-shifting.
570 assert(uint_rval
->type
== glsl_type::uint_type
);
573 clamp(div(i2f(unpack_uint_to_ivec2(uint_rval
)),
578 assert(result
->type
== glsl_type::vec2_type
);
583 * \brief Lower an unpackSnorm4x8 expression.
585 * \param uint_rval is unpackSnorm4x8's input
586 * \return unpackSnorm4x8's output as a vec4 rvalue
589 lower_unpack_snorm_4x8(ir_rvalue
*uint_rval
)
591 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
593 * highp vec4 unpackSnorm4x8 (highp uint p)
594 * ----------------------------------------
595 * First, unpacks a single 32-bit unsigned integer p into four
596 * 8-bit unsigned integers. Then, each component is converted to
597 * a normalized floating-point value to generate the returned
598 * four-component vector.
600 * The conversion for unpacked fixed-point value f to floating point is
603 * unpackSnorm4x8: clamp(f / 127.0, -1, +1)
605 * The first component of the returned vector will be extracted from the
606 * least significant bits of the input; the last component will be
607 * extracted from the most significant bits.
609 * This function generates IR that approximates the following pseudo-GLSL:
612 * ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
615 * The above IR may appear unnecessarily complex, but the intermediate
616 * conversion to ivec4 and the bit shifts are necessary to correctly unpack
619 * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
620 * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
621 * place that int8 into an int32, which results in the *positive* integer
622 * 0x000000ff. The int8's sign bit becomes, in the int32, the rather
623 * unimportant bit 8. We must now extend the int8's sign bit into bits
624 * 9-32, which is accomplished by left-shifting then right-shifting.
627 assert(uint_rval
->type
== glsl_type::uint_type
);
630 clamp(div(i2f(unpack_uint_to_ivec4(uint_rval
)),
635 assert(result
->type
== glsl_type::vec4_type
);
640 * \brief Lower a packUnorm2x16 expression.
642 * \param vec2_rval is packUnorm2x16's input
643 * \return packUnorm2x16's output as a uint rvalue
646 lower_pack_unorm_2x16(ir_rvalue
*vec2_rval
)
648 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
650 * highp uint packUnorm2x16 (vec2 v)
651 * ---------------------------------
652 * First, converts each component of the normalized floating-point value
653 * v into 16-bit integer values. Then, the results are packed into the
654 * returned 32-bit unsigned integer.
656 * The conversion for component c of v to fixed point is done as
659 * packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
661 * The first component of the vector will be written to the least
662 * significant bits of the output; the last component will be written to
663 * the most significant bits.
665 * This function generates IR that approximates the following pseudo-GLSL:
667 * return pack_uvec2_to_uint(uvec2(
668 * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
670 * Here it is safe to directly convert the vec2 to uvec2 because the vec2
671 * has been clamped to a non-negative range.
674 assert(vec2_rval
->type
== glsl_type::vec2_type
);
676 ir_rvalue
*result
= pack_uvec2_to_uint(
677 f2u(round_even(mul(saturate(vec2_rval
), constant(65535.0f
)))));
679 assert(result
->type
== glsl_type::uint_type
);
684 * \brief Lower a packUnorm4x8 expression.
686 * \param vec4_rval is packUnorm4x8's input
687 * \return packUnorm4x8's output as a uint rvalue
690 lower_pack_unorm_4x8(ir_rvalue
*vec4_rval
)
692 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
694 * highp uint packUnorm4x8 (vec4 v)
695 * --------------------------------
696 * First, converts each component of the normalized floating-point value
697 * v into 8-bit integer values. Then, the results are packed into the
698 * returned 32-bit unsigned integer.
700 * The conversion for component c of v to fixed point is done as
703 * packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
705 * The first component of the vector will be written to the least
706 * significant bits of the output; the last component will be written to
707 * the most significant bits.
709 * This function generates IR that approximates the following pseudo-GLSL:
711 * return pack_uvec4_to_uint(uvec4(
712 * round(clamp(VEC4_RVALUE, 0.0f, 1.0f) * 255.0f)));
714 * Here it is safe to directly convert the vec4 to uvec4 because the vec4
715 * has been clamped to a non-negative range.
718 assert(vec4_rval
->type
== glsl_type::vec4_type
);
720 ir_rvalue
*result
= pack_uvec4_to_uint(
721 f2u(round_even(mul(saturate(vec4_rval
), constant(255.0f
)))));
723 assert(result
->type
== glsl_type::uint_type
);
728 * \brief Lower an unpackUnorm2x16 expression.
730 * \param uint_rval is unpackUnorm2x16's input
731 * \return unpackUnorm2x16's output as a vec2 rvalue
734 lower_unpack_unorm_2x16(ir_rvalue
*uint_rval
)
736 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
738 * highp vec2 unpackUnorm2x16 (highp uint p)
739 * -----------------------------------------
740 * First, unpacks a single 32-bit unsigned integer p into a pair of
741 * 16-bit unsigned integers. Then, each component is converted to
742 * a normalized floating-point value to generate the returned
743 * two-component vector.
745 * The conversion for unpacked fixed-point value f to floating point is
748 * unpackUnorm2x16: f / 65535.0
750 * The first component of the returned vector will be extracted from the
751 * least significant bits of the input; the last component will be
752 * extracted from the most significant bits.
754 * This function generates IR that approximates the following pseudo-GLSL:
756 * return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
759 assert(uint_rval
->type
== glsl_type::uint_type
);
761 ir_rvalue
*result
= div(u2f(unpack_uint_to_uvec2(uint_rval
)),
764 assert(result
->type
== glsl_type::vec2_type
);
769 * \brief Lower an unpackUnorm4x8 expression.
771 * \param uint_rval is unpackUnorm4x8's input
772 * \return unpackUnorm4x8's output as a vec4 rvalue
775 lower_unpack_unorm_4x8(ir_rvalue
*uint_rval
)
777 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
779 * highp vec4 unpackUnorm4x8 (highp uint p)
780 * ----------------------------------------
781 * First, unpacks a single 32-bit unsigned integer p into four
782 * 8-bit unsigned integers. Then, each component is converted to
783 * a normalized floating-point value to generate the returned
784 * four-component vector.
786 * The conversion for unpacked fixed-point value f to floating point is
789 * unpackUnorm4x8: f / 255.0
791 * The first component of the returned vector will be extracted from the
792 * least significant bits of the input; the last component will be
793 * extracted from the most significant bits.
795 * This function generates IR that approximates the following pseudo-GLSL:
797 * return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
800 assert(uint_rval
->type
== glsl_type::uint_type
);
802 ir_rvalue
*result
= div(u2f(unpack_uint_to_uvec4(uint_rval
)),
805 assert(result
->type
== glsl_type::vec4_type
);
810 * \brief Lower the component-wise calculation of packHalf2x16.
812 * \param f_rval is one component of packHalf2x16's input
813 * \param e_rval is the unshifted exponent bits of f_rval
814 * \param m_rval is the unshifted mantissa bits of f_rval
816 * \return a uint rvalue that encodes a float16 in its lower 16 bits
819 pack_half_1x16_nosign(ir_rvalue
*f_rval
,
823 assert(e_rval
->type
== glsl_type::uint_type
);
824 assert(m_rval
->type
== glsl_type::uint_type
);
827 ir_variable
*u16
= factory
.make_temp(glsl_type::uint_type
,
828 "tmp_pack_half_1x16_u16");
830 /* float f = FLOAT_RVAL; */
831 ir_variable
*f
= factory
.make_temp(glsl_type::float_type
,
832 "tmp_pack_half_1x16_f");
833 factory
.emit(assign(f
, f_rval
));
835 /* uint e = E_RVAL; */
836 ir_variable
*e
= factory
.make_temp(glsl_type::uint_type
,
837 "tmp_pack_half_1x16_e");
838 factory
.emit(assign(e
, e_rval
));
840 /* uint m = M_RVAL; */
841 ir_variable
*m
= factory
.make_temp(glsl_type::uint_type
,
842 "tmp_pack_half_1x16_m");
843 factory
.emit(assign(m
, m_rval
));
848 * For a float16, the bit layout is:
854 * Let f16 be a float16 value. The sign, exponent, and mantissa
855 * determine its value thus:
857 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
858 * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
859 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
860 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
861 * if e16 = 31 and m16 != 0, then NaN (5)
863 * where 0 <= m16 < 2^10.
865 * For a float32, the bit layout is:
871 * Let f32 be a float32 value. The sign, exponent, and mantissa
872 * determine its value thus:
874 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
875 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
876 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
877 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
878 * if e32 = 255 and m32 != 0, then NaN (14)
880 * where 0 <= m32 < 2^23.
882 * The minimum and maximum normal float16 values are
884 * min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14) (20)
885 * max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10) (21)
887 * The step at max_norm16 is
889 * max_step16 = 2^5 (22)
891 * Observe that the float16 boundary values in equations 20-21 lie in the
892 * range of normal float32 values.
897 * Not all float32 values can be exactly represented as a float16. We
898 * round all such intermediate float32 values to the nearest float16; if
899 * the float32 is exactly between two float16 values, we round to the one
900 * with an even mantissa. This rounding behavior has several benefits:
902 * - It has no sign bias.
904 * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
907 * - By reproducing the behavior of the GPU (at least on Intel hardware),
908 * compile-time evaluation of constant packHalf2x16 GLSL expressions will
909 * result in the same value as if the expression were executed on the
914 * Our task is to compute s16, e16, m16 given f32. Since this function
915 * ignores the sign bit, assume that s32 = s16 = 0. There are several
921 /* Case 1) f32 is NaN
923 * The resultant f16 will also be NaN.
926 /* if (e32 == 255 && m32 != 0) { */
927 if_tree(logic_and(equal(e
, constant(0xffu
<< 23u)),
928 logic_not(equal(m
, constant(0u)))),
930 assign(u16
, constant(0x7fffu
)),
932 /* Case 2) f32 lies in the range [0, min_norm16).
934 * The resultant float16 will be either zero, subnormal, or normal.
938 * f32 = min_norm16 (30)
942 * e32 = 113 and m32 = 0 (31)
944 * Therefore this case occurs if and only if
949 /* } else if (e32 < 113) { */
950 if_tree(less(e
, constant(113u << 23u)),
952 /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
953 assign(u16
, f2u(round_even(mul(expr(ir_unop_abs
, f
),
954 constant((float) (1 << 24)))))),
956 /* Case 3) f32 lies in the range
957 * [min_norm16, max_norm16 + max_step16).
959 * The resultant float16 will be either normal or infinite.
963 * f32 = max_norm16 + max_step16 (40)
964 * = 2^15 * (1 + 1023 / 2^10) + 2^5 (41)
968 * e32 = 143 and m32 = 0 (43)
970 * We already solved the boundary condition f32 = min_norm16 above
971 * in equation 31. Therefore this case occurs if and only if
973 * 113 <= e32 and e32 < 143
976 /* } else if (e32 < 143) { */
977 if_tree(less(e
, constant(143u << 23u)),
979 /* The addition below handles the case where the mantissa rounds
980 * up to 1024 and bumps the exponent.
982 * u16 = ((e - (112u << 23u)) >> 13u)
983 * + round_to_even((float(m) / (1u << 13u));
985 assign(u16
, add(rshift(sub(e
, constant(112u << 23u)),
988 div(u2f(m
), constant((float) (1 << 13))))))),
990 /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
992 * The resultant float16 will be infinite.
994 * The cases above caught all float32 values in the range
995 * [0, max_norm16 + max_step16), so this is the fall-through case.
1000 assign(u16
, constant(31u << 10u))))));
1004 return deref(u16
).val
;
1008 * \brief Lower a packHalf2x16 expression.
1010 * \param vec2_rval is packHalf2x16's input
1011 * \return packHalf2x16's output as a uint rvalue
1014 lower_pack_half_2x16(ir_rvalue
*vec2_rval
)
1016 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1018 * highp uint packHalf2x16 (mediump vec2 v)
1019 * ----------------------------------------
1020 * Returns an unsigned integer obtained by converting the components of
1021 * a two-component floating-point vector to the 16-bit floating-point
1022 * representation found in the OpenGL ES Specification, and then packing
1023 * these two 16-bit integers into a 32-bit unsigned integer.
1025 * The first vector component specifies the 16 least- significant bits
1026 * of the result; the second component specifies the 16 most-significant
1030 assert(vec2_rval
->type
== glsl_type::vec2_type
);
1032 /* vec2 f = VEC2_RVAL; */
1033 ir_variable
*f
= factory
.make_temp(glsl_type::vec2_type
,
1034 "tmp_pack_half_2x16_f");
1035 factory
.emit(assign(f
, vec2_rval
));
1037 /* uvec2 f32 = bitcast_f2u(f); */
1038 ir_variable
*f32
= factory
.make_temp(glsl_type::uvec2_type
,
1039 "tmp_pack_half_2x16_f32");
1040 factory
.emit(assign(f32
, expr(ir_unop_bitcast_f2u
, f
)));
1043 ir_variable
*f16
= factory
.make_temp(glsl_type::uvec2_type
,
1044 "tmp_pack_half_2x16_f16");
1046 /* Get f32's unshifted exponent bits.
1048 * uvec2 e = f32 & 0x7f800000u;
1050 ir_variable
*e
= factory
.make_temp(glsl_type::uvec2_type
,
1051 "tmp_pack_half_2x16_e");
1052 factory
.emit(assign(e
, bit_and(f32
, constant(0x7f800000u
))));
1054 /* Get f32's unshifted mantissa bits.
1056 * uvec2 m = f32 & 0x007fffffu;
1058 ir_variable
*m
= factory
.make_temp(glsl_type::uvec2_type
,
1059 "tmp_pack_half_2x16_m");
1060 factory
.emit(assign(m
, bit_and(f32
, constant(0x007fffffu
))));
1062 /* Set f16's exponent and mantissa bits.
1064 * f16.x = pack_half_1x16_nosign(e.x, m.x);
1065 * f16.y = pack_half_1x16_nosign(e.y, m.y);
1067 factory
.emit(assign(f16
, pack_half_1x16_nosign(swizzle_x(f
),
1071 factory
.emit(assign(f16
, pack_half_1x16_nosign(swizzle_y(f
),
1076 /* Set f16's sign bits.
1078 * f16 |= (f32 & (1u << 31u)) >> 16u;
1081 assign(f16
, bit_or(f16
,
1082 rshift(bit_and(f32
, constant(1u << 31u)),
1086 /* return (f16.y << 16u) | f16.x; */
1087 ir_rvalue
*result
= bit_or(lshift(swizzle_y(f16
),
1091 assert(result
->type
== glsl_type::uint_type
);
1096 * \brief Split packHalf2x16's vec2 operand into two floats.
1098 * \param vec2_rval is packHalf2x16's input
1099 * \return a uint rvalue
1101 * Some code generators, such as the i965 fragment shader, require that all
1102 * vector expressions be lowered to a sequence of scalar expressions.
1103 * However, packHalf2x16 cannot be scalarized by the same mechanism as
1104 * a true vector operation because its input and output have a differing
1105 * number of vector components.
1107 * This method scalarizes packHalf2x16 by transforming it from an unary
1108 * operation having vector input to a binary operation having scalar input.
1109 * That is, it transforms
1111 * packHalf2x16(VEC2_RVAL);
1115 * vec2 v = VEC2_RVAL;
1116 * return packHalf2x16_split(v.x, v.y);
/* Emit IR that copies the vec2 operand into a temporary, and return an
 * ir_binop_pack_half_2x16_split expression over the temporary's x and y
 * components.
 *
 * \param vec2_rval  operand; asserted to have type vec2
 */
1119 split_pack_half_2x16(ir_rvalue
*vec2_rval
)
1121 assert(vec2_rval
->type
== glsl_type::vec2_type
);
/* The operand is stored in a temporary and both swizzles read from that
 * temporary rather than from the operand itself.
 */
1123 ir_variable
*v
= factory
.make_temp(glsl_type::vec2_type
,
1124 "tmp_split_pack_half_2x16_v");
1125 factory
.emit(assign(v
, vec2_rval
));
1127 return expr(ir_binop_pack_half_2x16_split
, swizzle_x(v
), swizzle_y(v
));
1131 * \brief Lower the component-wise calculation of unpackHalf2x16.
1133 * Given a uint that encodes a float16 in its lower 16 bits, this function
1134 * returns a uint that encodes a float32 with the same value. The sign bit
1135 * of the float16 is ignored.
1137 * \param e_rval is the unshifted exponent bits of a float16
1138 * \param m_rval is the unshifted mantissa bits of a float16
1139 * \return a uint rvalue that encodes a float32
/* Build IR that converts the exponent and mantissa of a float16 into the
 * bit pattern of the equivalent float32, ignoring the sign.
 *
 * e_rval: float16 exponent bits, left in place (not shifted down);
 *         asserted to be of glsl_type::uint_type.
 * m_rval: float16 mantissa bits; asserted to be of glsl_type::uint_type.
 *
 * Both operands are copied into temporaries, then a chain of if_tree()
 * branches assigns the temporary u32:
 *   e == 0           ->  bitcast_f2u(float(m) / float(1 << 24))
 *   e <  (31 << 10)  ->  ((e + (112 << 10)) | m) << 13
 *   m == 0           ->  255 << 23    (infinity)
 *   otherwise        ->  0x7fffffff   (NaN)
 *
 * Returns a dereference of u32.
 */
1142 unpack_half_1x16_nosign(ir_rvalue
*e_rval
, ir_rvalue
*m_rval
)
1144 assert(e_rval
->type
== glsl_type::uint_type
);
1145 assert(m_rval
->type
== glsl_type::uint_type
);
/* uint u32; assigned in every branch of the if_tree chain below. */
1148 ir_variable
*u32
= factory
.make_temp(glsl_type::uint_type
,
1149 "tmp_unpack_half_1x16_u32");
1151 /* uint e = E_RVAL; */
1152 ir_variable
*e
= factory
.make_temp(glsl_type::uint_type
,
1153 "tmp_unpack_half_1x16_e");
1154 factory
.emit(assign(e
, e_rval
));
1156 /* uint m = M_RVAL; */
1157 ir_variable
*m
= factory
.make_temp(glsl_type::uint_type
,
1158 "tmp_unpack_half_1x16_m");
1159 factory
.emit(assign(m
, m_rval
));
1164 * For a float16, the bit layout is:
1170 * Let f16 be a float16 value. The sign, exponent, and mantissa
1171 * determine its value thus:
1173 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
1174 * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
1175 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1176 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
1177 * if e16 = 31 and m16 != 0, then NaN (5)
1179 * where 0 <= m16 < 2^10.
1181 * For a float32, the bit layout is:
1187 * Let f32 be a float32 value. The sign, exponent, and mantissa
1188 * determine its value thus:
1190 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
1191 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
1192 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1193 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
1194 * if e32 = 255 and m32 != 0, then NaN (14)
1196 * where 0 <= m32 < 2^23.
1200 * Our task is to compute s32, e32, m32 given f16. Since this function
1201 * ignores the sign bit, assume that s32 = s16 = 0. There are several
1207 /* Case 1) f16 is zero or subnormal.
1209 * The simplest method of calculating f32 in this case is
1212 * = 2^(-14) * (m16 / 2^10) (21)
1213 * = m16 / 2^24 (22)
1216 /* if (e16 == 0) { */
1217 if_tree(equal(e
, constant(0u)),
1219 /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1220 assign(u32
, expr(ir_unop_bitcast_f2u
,
1221 div(u2f(m
), constant((float)(1 << 24))))),
1223 /* Case 2) f16 is normal.
1228 * 2^(e32 - 127) * (1 + m32 / 2^23) = (31)
1229 * 2^(e16 - 15) * (1 + m16 / 2^10)
1231 * can be decomposed into two
1233 * 2^(e32 - 127) = 2^(e16 - 15) (32)
1234 * 1 + m32 / 2^23 = 1 + m16 / 2^10 (33)
1238 * e32 = e16 + 112 (34)
1239 * m32 = m16 * 2^13 (35)
1242 /* } else if (e16 < 31) { */
1243 if_tree(less(e
, constant(31u << 10u)),
1245 /* u32 = ((e + (112 << 10)) | m) << 13;
1247 assign(u32
, lshift(bit_or(add(e
, constant(112u << 10u)), m
),
1251 /* Case 3) f16 is infinite. */
1252 if_tree(equal(m
, constant(0u)),
1254 assign(u32
, constant(255u << 23u)),
1256 /* Case 4) f16 is NaN. */
1259 assign(u32
, constant(0x7fffffffu
))))));
1263 return deref(u32
).val
;
1267 * \brief Lower an unpackHalf2x16 expression.
1269 * \param uint_rval is unpackHalf2x16's input
1270 * \return unpackHalf2x16's output as a vec2 rvalue
/* Lower unpackHalf2x16 to integer operations followed by a final bitcast.
 *
 * uint_rval: operand of unpackHalf2x16; asserted to be of
 *            glsl_type::uint_type.
 *
 * Steps visible in this function: the operand is turned into a uvec2 by
 * unpack_uint_to_uvec2(); the exponent (0x7c00) and mantissa (0x03ff) bits
 * of that uvec2 are masked into temporaries; unpack_half_1x16_nosign() is
 * invoked once for the x components and once for the y components to fill
 * the uvec2 temporary f32; the float16 sign bit is ORed back into f32; and
 * f32 is reinterpreted with ir_unop_bitcast_u2f.
 *
 * The resulting expression is asserted to be of glsl_type::vec2_type.
 */
1273 lower_unpack_half_2x16(ir_rvalue
*uint_rval
)
1275 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1277 * mediump vec2 unpackHalf2x16 (highp uint v)
1278 * ------------------------------------------
1279 * Returns a two-component floating-point vector with components
1280 * obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1281 * values, interpreting those values as 16-bit floating-point numbers
1282 * according to the OpenGL ES Specification, and converting them to
1283 * 32-bit floating-point values.
1285 * The first component of the vector is obtained from the
1286 * 16 least-significant bits of v; the second component is obtained
1287 * from the 16 most-significant bits of v.
1289 assert(uint_rval
->type
== glsl_type::uint_type
);
1292 * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16);
1294 ir_variable
*f16
= factory
.make_temp(glsl_type::uvec2_type
,
1295 "tmp_unpack_half_2x16_f16");
1296 factory
.emit(assign(f16
, unpack_uint_to_uvec2(uint_rval
)));
1299 ir_variable
*f32
= factory
.make_temp(glsl_type::uvec2_type
,
1300 "tmp_unpack_half_2x16_f32");
1302 /* Get f16's unshifted exponent bits.
1304 * uvec2 e = f16 & 0x7c00u;
1306 ir_variable
*e
= factory
.make_temp(glsl_type::uvec2_type
,
1307 "tmp_unpack_half_2x16_e");
1308 factory
.emit(assign(e
, bit_and(f16
, constant(0x7c00u
))));
1310 /* Get f16's unshifted mantissa bits.
1312 * uvec2 m = f16 & 0x03ffu;
1314 ir_variable
*m
= factory
.make_temp(glsl_type::uvec2_type
,
1315 "tmp_unpack_half_2x16_m");
1316 factory
.emit(assign(m
, bit_and(f16
, constant(0x03ffu
))));
1318 /* Set f32's exponent and mantissa bits.
1320 * f32.x = unpack_half_1x16_nosign(e.x, m.x);
1321 * f32.y = unpack_half_1x16_nosign(e.y, m.y);
1323 factory
.emit(assign(f32
, unpack_half_1x16_nosign(swizzle_x(e
),
1326 factory
.emit(assign(f32
, unpack_half_1x16_nosign(swizzle_y(e
),
1330 /* Set f32's sign bit.
1332 * f32 |= (f16 & 0x8000u) << 16u;
1334 factory
.emit(assign(f32
, bit_or(f32
,
1339 /* return bitcast_u2f(f32); */
1340 ir_rvalue
*result
= expr(ir_unop_bitcast_u2f
, f32
);
1341 assert(result
->type
== glsl_type::vec2_type
);
1346 * \brief Split unpackHalf2x16 into two operations.
1348 * \param uint_rval is unpackHalf2x16's input
1349 * \return a vec2 rvalue
1351 * Some code generators, such as the i965 fragment shader, require that all
1352 * vector expressions be lowered to a sequence of scalar expressions.
1353 * However, unpackHalf2x16 cannot be scalarized by the same method as
1354 * a true vector operation because the number of components of its input
1355 * and output differ.
1357 * This method scalarizes unpackHalf2x16 by transforming it from a single
1358 * operation having vec2 output to a pair of operations each having float
1359 * output. That is, it transforms
1361 * unpackHalf2x16(UINT_RVAL)
1365 * uint u = UINT_RVAL;
1368 * v.x = unpackHalf2x16_split_x(u);
1369 * v.y = unpackHalf2x16_split_y(u);
/* Scalarize unpackHalf2x16 into a pair of single-component operations.
 *
 * uint_rval: operand of unpackHalf2x16; asserted to be of
 *            glsl_type::uint_type.
 *
 * The operand is copied into the uint temporary u. A vec2 temporary v is
 * then assigned from ir_unop_unpack_half_2x16_split_x(u) and from
 * ir_unop_unpack_half_2x16_split_y(u), as the per-statement comments below
 * describe.
 *
 * Returns a dereference of v.
 */
1374 split_unpack_half_2x16(ir_rvalue
*uint_rval
)
1376 assert(uint_rval
->type
== glsl_type::uint_type
);
1378 /* uint u = uint_rval; */
1379 ir_variable
*u
= factory
.make_temp(glsl_type::uint_type
,
1380 "tmp_split_unpack_half_2x16_u");
1381 factory
.emit(assign(u
, uint_rval
));
/* vec2 v; filled by the two assignments below and returned at the end. */
1384 ir_variable
*v
= factory
.make_temp(glsl_type::vec2_type
,
1385 "tmp_split_unpack_half_2x16_v");
1387 /* v.x = unpack_half_2x16_split_x(u); */
1388 factory
.emit(assign(v
, expr(ir_unop_unpack_half_2x16_split_x
, u
),
1391 /* v.y = unpack_half_2x16_split_y(u); */
1392 factory
.emit(assign(v
, expr(ir_unop_unpack_half_2x16_split_y
, u
),
1395 return deref(v
).val
;
1399 } // namespace anonymous
1402 * \brief Lower the builtin packing functions.
1404 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
/* Entry point of the pass: run the packing-builtin lowering visitor over an
 * instruction list.
 *
 * instructions: list handed to visit_list_elements().
 * op_mask:      forwarded unchanged to the lower_packing_builtins_visitor
 *               constructor, which treats it as a bitmask of
 *               `enum lower_packing_builtins_op`.
 *
 * Returns the visitor's get_progress() value after the traversal.
 */
1407 lower_packing_builtins(exec_list
*instructions
, int op_mask
)
1409 lower_packing_builtins_visitor
v(op_mask
);
1410 visit_list_elements(&v
, instructions
, true);
1411 return v
.get_progress();