nir: move to compiler/
[mesa.git] / src / glsl / lower_packing_builtins.cpp
1 /*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "ir.h"
25 #include "ir_builder.h"
26 #include "ir_optimization.h"
27 #include "ir_rvalue_visitor.h"
28
29 namespace {
30
31 using namespace ir_builder;
32
33 /**
34 * A visitor that lowers built-in floating-point pack/unpack expressions
35 * such as packSnorm2x16.
36 */
37 class lower_packing_builtins_visitor : public ir_rvalue_visitor {
38 public:
39 /**
40 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
41 */
42 explicit lower_packing_builtins_visitor(int op_mask)
43 : op_mask(op_mask),
44 progress(false)
45 {
46 /* Mutually exclusive options. */
47 assert(!((op_mask & LOWER_PACK_HALF_2x16) &&
48 (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT)));
49
50 assert(!((op_mask & LOWER_UNPACK_HALF_2x16) &&
51 (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT)));
52
53 factory.instructions = &factory_instructions;
54 }
55
56 virtual ~lower_packing_builtins_visitor()
57 {
58 assert(factory_instructions.is_empty());
59 }
60
61 bool get_progress() { return progress; }
62
63 void handle_rvalue(ir_rvalue **rvalue)
64 {
65 if (!*rvalue)
66 return;
67
68 ir_expression *expr = (*rvalue)->as_expression();
69 if (!expr)
70 return;
71
72 enum lower_packing_builtins_op lowering_op =
73 choose_lowering_op(expr->operation);
74
75 if (lowering_op == LOWER_PACK_UNPACK_NONE)
76 return;
77
78 setup_factory(ralloc_parent(expr));
79
80 ir_rvalue *op0 = expr->operands[0];
81 ralloc_steal(factory.mem_ctx, op0);
82
83 switch (lowering_op) {
84 case LOWER_PACK_SNORM_2x16:
85 *rvalue = lower_pack_snorm_2x16(op0);
86 break;
87 case LOWER_PACK_SNORM_4x8:
88 *rvalue = lower_pack_snorm_4x8(op0);
89 break;
90 case LOWER_PACK_UNORM_2x16:
91 *rvalue = lower_pack_unorm_2x16(op0);
92 break;
93 case LOWER_PACK_UNORM_4x8:
94 *rvalue = lower_pack_unorm_4x8(op0);
95 break;
96 case LOWER_PACK_HALF_2x16:
97 *rvalue = lower_pack_half_2x16(op0);
98 break;
99 case LOWER_PACK_HALF_2x16_TO_SPLIT:
100 *rvalue = split_pack_half_2x16(op0);
101 break;
102 case LOWER_UNPACK_SNORM_2x16:
103 *rvalue = lower_unpack_snorm_2x16(op0);
104 break;
105 case LOWER_UNPACK_SNORM_4x8:
106 *rvalue = lower_unpack_snorm_4x8(op0);
107 break;
108 case LOWER_UNPACK_UNORM_2x16:
109 *rvalue = lower_unpack_unorm_2x16(op0);
110 break;
111 case LOWER_UNPACK_UNORM_4x8:
112 *rvalue = lower_unpack_unorm_4x8(op0);
113 break;
114 case LOWER_UNPACK_HALF_2x16:
115 *rvalue = lower_unpack_half_2x16(op0);
116 break;
117 case LOWER_UNPACK_HALF_2x16_TO_SPLIT:
118 *rvalue = split_unpack_half_2x16(op0);
119 break;
120 case LOWER_PACK_UNPACK_NONE:
121 case LOWER_PACK_USE_BFI:
122 case LOWER_PACK_USE_BFE:
123 assert(!"not reached");
124 break;
125 }
126
127 teardown_factory();
128 progress = true;
129 }
130
131 private:
132 const int op_mask;
133 bool progress;
134 ir_factory factory;
135 exec_list factory_instructions;
136
137 /**
138 * Determine the needed lowering operation by filtering \a expr_op
139 * through \ref op_mask.
140 */
141 enum lower_packing_builtins_op
142 choose_lowering_op(ir_expression_operation expr_op)
143 {
144 /* C++ regards int and enum as fundamentally different types.
145 * So, we can't simply return from each case; we must cast the return
146 * value.
147 */
148 int result;
149
150 switch (expr_op) {
151 case ir_unop_pack_snorm_2x16:
152 result = op_mask & LOWER_PACK_SNORM_2x16;
153 break;
154 case ir_unop_pack_snorm_4x8:
155 result = op_mask & LOWER_PACK_SNORM_4x8;
156 break;
157 case ir_unop_pack_unorm_2x16:
158 result = op_mask & LOWER_PACK_UNORM_2x16;
159 break;
160 case ir_unop_pack_unorm_4x8:
161 result = op_mask & LOWER_PACK_UNORM_4x8;
162 break;
163 case ir_unop_pack_half_2x16:
164 result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT);
165 break;
166 case ir_unop_unpack_snorm_2x16:
167 result = op_mask & LOWER_UNPACK_SNORM_2x16;
168 break;
169 case ir_unop_unpack_snorm_4x8:
170 result = op_mask & LOWER_UNPACK_SNORM_4x8;
171 break;
172 case ir_unop_unpack_unorm_2x16:
173 result = op_mask & LOWER_UNPACK_UNORM_2x16;
174 break;
175 case ir_unop_unpack_unorm_4x8:
176 result = op_mask & LOWER_UNPACK_UNORM_4x8;
177 break;
178 case ir_unop_unpack_half_2x16:
179 result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT);
180 break;
181 default:
182 result = LOWER_PACK_UNPACK_NONE;
183 break;
184 }
185
186 return static_cast<enum lower_packing_builtins_op>(result);
187 }
188
189 void
190 setup_factory(void *mem_ctx)
191 {
192 assert(factory.mem_ctx == NULL);
193 assert(factory.instructions->is_empty());
194
195 factory.mem_ctx = mem_ctx;
196 }
197
198 void
199 teardown_factory()
200 {
201 base_ir->insert_before(factory.instructions);
202 assert(factory.instructions->is_empty());
203 factory.mem_ctx = NULL;
204 }
205
206 template <typename T>
207 ir_constant*
208 constant(T x)
209 {
210 return factory.constant(x);
211 }
212
213 /**
214 * \brief Pack two uint16's into a single uint32.
215 *
216 * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
217 * where the least significant bits specify the first element of the pair.
218 * Return the uint32.
219 */
220 ir_rvalue*
221 pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
222 {
223 assert(uvec2_rval->type == glsl_type::uvec2_type);
224
225 /* uvec2 u = UVEC2_RVAL; */
226 ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
227 "tmp_pack_uvec2_to_uint");
228 factory.emit(assign(u, uvec2_rval));
229
230 if (op_mask & LOWER_PACK_USE_BFI) {
231 return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
232 swizzle_y(u),
233 constant(16u),
234 constant(16u));
235 }
236
237 /* return (u.y << 16) | (u.x & 0xffff); */
238 return bit_or(lshift(swizzle_y(u), constant(16u)),
239 bit_and(swizzle_x(u), constant(0xffffu)));
240 }
241
242 /**
243 * \brief Pack four uint8's into a single uint32.
244 *
245 * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
246 * uint32 where the least significant bits specify the first element of the
247 * 4-tuple. Return the uint32.
248 */
249 ir_rvalue*
250 pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
251 {
252 assert(uvec4_rval->type == glsl_type::uvec4_type);
253
254 ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
255 "tmp_pack_uvec4_to_uint");
256
257 if (op_mask & LOWER_PACK_USE_BFI) {
258 /* uvec4 u = UVEC4_RVAL; */
259 factory.emit(assign(u, uvec4_rval));
260
261 return bitfield_insert(bitfield_insert(
262 bitfield_insert(
263 bit_and(swizzle_x(u), constant(0xffu)),
264 swizzle_y(u), constant(8u), constant(8u)),
265 swizzle_z(u), constant(16u), constant(8u)),
266 swizzle_w(u), constant(24u), constant(8u));
267 }
268
269 /* uvec4 u = UVEC4_RVAL & 0xff */
270 factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
271
272 /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
273 return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
274 lshift(swizzle_z(u), constant(16u))),
275 bit_or(lshift(swizzle_y(u), constant(8u)),
276 swizzle_x(u)));
277 }
278
279 /**
280 * \brief Unpack a uint32 into two uint16's.
281 *
282 * Interpret the given uint32 as a uint16 pair where the uint32's least
283 * significant bits specify the pair's first element. Return the uint16
284 * pair as a uvec2.
285 */
286 ir_rvalue*
287 unpack_uint_to_uvec2(ir_rvalue *uint_rval)
288 {
289 assert(uint_rval->type == glsl_type::uint_type);
290
291 /* uint u = UINT_RVAL; */
292 ir_variable *u = factory.make_temp(glsl_type::uint_type,
293 "tmp_unpack_uint_to_uvec2_u");
294 factory.emit(assign(u, uint_rval));
295
296 /* uvec2 u2; */
297 ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
298 "tmp_unpack_uint_to_uvec2_u2");
299
300 /* u2.x = u & 0xffffu; */
301 factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
302
303 /* u2.y = u >> 16u; */
304 factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
305
306 return deref(u2).val;
307 }
308
309 /**
310 * \brief Unpack a uint32 into two int16's.
311 *
312 * Specifically each 16-bit value is sign-extended to the full width of an
313 * int32 on return.
314 */
315 ir_rvalue *
316 unpack_uint_to_ivec2(ir_rvalue *uint_rval)
317 {
318 assert(uint_rval->type == glsl_type::uint_type);
319
320 if (!(op_mask & LOWER_PACK_USE_BFE)) {
321 return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
322 constant(16u)),
323 constant(16u));
324 }
325
326 ir_variable *i = factory.make_temp(glsl_type::int_type,
327 "tmp_unpack_uint_to_ivec2_i");
328 factory.emit(assign(i, u2i(uint_rval)));
329
330 /* ivec2 i2; */
331 ir_variable *i2 = factory.make_temp(glsl_type::ivec2_type,
332 "tmp_unpack_uint_to_ivec2_i2");
333
334 factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
335 WRITEMASK_X));
336 factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
337 WRITEMASK_Y));
338
339 return deref(i2).val;
340 }
341
342 /**
343 * \brief Unpack a uint32 into four uint8's.
344 *
345 * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
346 * significant bits specify the 4-tuple's first element. Return the uint8
347 * 4-tuple as a uvec4.
348 */
349 ir_rvalue*
350 unpack_uint_to_uvec4(ir_rvalue *uint_rval)
351 {
352 assert(uint_rval->type == glsl_type::uint_type);
353
354 /* uint u = UINT_RVAL; */
355 ir_variable *u = factory.make_temp(glsl_type::uint_type,
356 "tmp_unpack_uint_to_uvec4_u");
357 factory.emit(assign(u, uint_rval));
358
359 /* uvec4 u4; */
360 ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
361 "tmp_unpack_uint_to_uvec4_u4");
362
363 /* u4.x = u & 0xffu; */
364 factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
365
366 if (op_mask & LOWER_PACK_USE_BFE) {
367 /* u4.y = bitfield_extract(u, 8, 8); */
368 factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
369 WRITEMASK_Y));
370
371 /* u4.z = bitfield_extract(u, 16, 8); */
372 factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
373 WRITEMASK_Z));
374 } else {
375 /* u4.y = (u >> 8u) & 0xffu; */
376 factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
377 constant(0xffu)), WRITEMASK_Y));
378
379 /* u4.z = (u >> 16u) & 0xffu; */
380 factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
381 constant(0xffu)), WRITEMASK_Z));
382 }
383
384 /* u4.w = (u >> 24u) */
385 factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
386
387 return deref(u4).val;
388 }
389
390 /**
391 * \brief Unpack a uint32 into four int8's.
392 *
393 * Specifically each 8-bit value is sign-extended to the full width of an
394 * int32 on return.
395 */
396 ir_rvalue *
397 unpack_uint_to_ivec4(ir_rvalue *uint_rval)
398 {
399 assert(uint_rval->type == glsl_type::uint_type);
400
401 if (!(op_mask & LOWER_PACK_USE_BFE)) {
402 return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
403 constant(24u)),
404 constant(24u));
405 }
406
407 ir_variable *i = factory.make_temp(glsl_type::int_type,
408 "tmp_unpack_uint_to_ivec4_i");
409 factory.emit(assign(i, u2i(uint_rval)));
410
411 /* ivec4 i4; */
412 ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type,
413 "tmp_unpack_uint_to_ivec4_i4");
414
415 factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
416 WRITEMASK_X));
417 factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
418 WRITEMASK_Y));
419 factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
420 WRITEMASK_Z));
421 factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
422 WRITEMASK_W));
423
424 return deref(i4).val;
425 }
426
427 /**
428 * \brief Lower a packSnorm2x16 expression.
429 *
430 * \param vec2_rval is packSnorm2x16's input
431 * \return packSnorm2x16's output as a uint rvalue
432 */
433 ir_rvalue*
434 lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
435 {
436 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
437 *
438 * highp uint packSnorm2x16(vec2 v)
439 * --------------------------------
440 * First, converts each component of the normalized floating-point value
441 * v into 16-bit integer values. Then, the results are packed into the
442 * returned 32-bit unsigned integer.
443 *
444 * The conversion for component c of v to fixed point is done as
445 * follows:
446 *
447 * packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
448 *
449 * The first component of the vector will be written to the least
450 * significant bits of the output; the last component will be written to
451 * the most significant bits.
452 *
453 * This function generates IR that approximates the following pseudo-GLSL:
454 *
455 * return pack_uvec2_to_uint(
456 * uvec2(ivec2(
457 * round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
458 *
459 * It is necessary to first convert the vec2 to ivec2 rather than directly
460 * converting vec2 to uvec2 because the latter conversion is undefined.
461 * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
462 * convert a negative floating point value to an uint".
463 */
464 assert(vec2_rval->type == glsl_type::vec2_type);
465
466 ir_rvalue *result = pack_uvec2_to_uint(
467 i2u(f2i(round_even(mul(clamp(vec2_rval,
468 constant(-1.0f),
469 constant(1.0f)),
470 constant(32767.0f))))));
471
472 assert(result->type == glsl_type::uint_type);
473 return result;
474 }
475
476 /**
477 * \brief Lower a packSnorm4x8 expression.
478 *
479 * \param vec4_rval is packSnorm4x8's input
480 * \return packSnorm4x8's output as a uint rvalue
481 */
482 ir_rvalue*
483 lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
484 {
485 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
486 *
487 * highp uint packSnorm4x8(vec4 v)
488 * -------------------------------
489 * First, converts each component of the normalized floating-point value
490 * v into 8-bit integer values. Then, the results are packed into the
491 * returned 32-bit unsigned integer.
492 *
493 * The conversion for component c of v to fixed point is done as
494 * follows:
495 *
496 * packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
497 *
498 * The first component of the vector will be written to the least
499 * significant bits of the output; the last component will be written to
500 * the most significant bits.
501 *
502 * This function generates IR that approximates the following pseudo-GLSL:
503 *
504 * return pack_uvec4_to_uint(
505 * uvec4(ivec4(
506 * round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
507 *
508 * It is necessary to first convert the vec4 to ivec4 rather than directly
509 * converting vec4 to uvec4 because the latter conversion is undefined.
510 * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
511 * convert a negative floating point value to an uint".
512 */
513 assert(vec4_rval->type == glsl_type::vec4_type);
514
515 ir_rvalue *result = pack_uvec4_to_uint(
516 i2u(f2i(round_even(mul(clamp(vec4_rval,
517 constant(-1.0f),
518 constant(1.0f)),
519 constant(127.0f))))));
520
521 assert(result->type == glsl_type::uint_type);
522 return result;
523 }
524
525 /**
526 * \brief Lower an unpackSnorm2x16 expression.
527 *
528 * \param uint_rval is unpackSnorm2x16's input
529 * \return unpackSnorm2x16's output as a vec2 rvalue
530 */
531 ir_rvalue*
532 lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
533 {
534 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
535 *
536 * highp vec2 unpackSnorm2x16 (highp uint p)
537 * -----------------------------------------
538 * First, unpacks a single 32-bit unsigned integer p into a pair of
539 * 16-bit unsigned integers. Then, each component is converted to
540 * a normalized floating-point value to generate the returned
541 * two-component vector.
542 *
543 * The conversion for unpacked fixed-point value f to floating point is
544 * done as follows:
545 *
546 * unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
547 *
548 * The first component of the returned vector will be extracted from the
549 * least significant bits of the input; the last component will be
550 * extracted from the most significant bits.
551 *
552 * This function generates IR that approximates the following pseudo-GLSL:
553 *
554 * return clamp(
555 * ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
556 * -1.0f, 1.0f);
557 *
558 * The above IR may appear unnecessarily complex, but the intermediate
559 * conversion to ivec2 and the bit shifts are necessary to correctly unpack
560 * negative floats.
561 *
562 * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
563 * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
564 * place that int16 into an int32, which results in the *positive* integer
565 * 0x0000ffff. The int16's sign bit becomes, in the int32, the rather
566 * unimportant bit 16. We must now extend the int16's sign bit into bits
567 * 17-32, which is accomplished by left-shifting then right-shifting.
568 */
569
570 assert(uint_rval->type == glsl_type::uint_type);
571
572 ir_rvalue *result =
573 clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
574 constant(32767.0f)),
575 constant(-1.0f),
576 constant(1.0f));
577
578 assert(result->type == glsl_type::vec2_type);
579 return result;
580 }
581
582 /**
583 * \brief Lower an unpackSnorm4x8 expression.
584 *
585 * \param uint_rval is unpackSnorm4x8's input
586 * \return unpackSnorm4x8's output as a vec4 rvalue
587 */
588 ir_rvalue*
589 lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
590 {
591 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
592 *
593 * highp vec4 unpackSnorm4x8 (highp uint p)
594 * ----------------------------------------
595 * First, unpacks a single 32-bit unsigned integer p into four
596 * 8-bit unsigned integers. Then, each component is converted to
597 * a normalized floating-point value to generate the returned
598 * four-component vector.
599 *
600 * The conversion for unpacked fixed-point value f to floating point is
601 * done as follows:
602 *
603 * unpackSnorm4x8: clamp(f / 127.0, -1, +1)
604 *
605 * The first component of the returned vector will be extracted from the
606 * least significant bits of the input; the last component will be
607 * extracted from the most significant bits.
608 *
609 * This function generates IR that approximates the following pseudo-GLSL:
610 *
611 * return clamp(
612 * ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
613 * -1.0f, 1.0f);
614 *
615 * The above IR may appear unnecessarily complex, but the intermediate
616 * conversion to ivec4 and the bit shifts are necessary to correctly unpack
617 * negative floats.
618 *
619 * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
620 * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
621 * place that int8 into an int32, which results in the *positive* integer
622 * 0x000000ff. The int8's sign bit becomes, in the int32, the rather
623 * unimportant bit 8. We must now extend the int8's sign bit into bits
624 * 9-32, which is accomplished by left-shifting then right-shifting.
625 */
626
627 assert(uint_rval->type == glsl_type::uint_type);
628
629 ir_rvalue *result =
630 clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
631 constant(127.0f)),
632 constant(-1.0f),
633 constant(1.0f));
634
635 assert(result->type == glsl_type::vec4_type);
636 return result;
637 }
638
639 /**
640 * \brief Lower a packUnorm2x16 expression.
641 *
642 * \param vec2_rval is packUnorm2x16's input
643 * \return packUnorm2x16's output as a uint rvalue
644 */
645 ir_rvalue*
646 lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
647 {
648 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
649 *
650 * highp uint packUnorm2x16 (vec2 v)
651 * ---------------------------------
652 * First, converts each component of the normalized floating-point value
653 * v into 16-bit integer values. Then, the results are packed into the
654 * returned 32-bit unsigned integer.
655 *
656 * The conversion for component c of v to fixed point is done as
657 * follows:
658 *
659 * packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
660 *
661 * The first component of the vector will be written to the least
662 * significant bits of the output; the last component will be written to
663 * the most significant bits.
664 *
665 * This function generates IR that approximates the following pseudo-GLSL:
666 *
667 * return pack_uvec2_to_uint(uvec2(
668 * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
669 *
670 * Here it is safe to directly convert the vec2 to uvec2 because the vec2
671 * has been clamped to a non-negative range.
672 */
673
674 assert(vec2_rval->type == glsl_type::vec2_type);
675
676 ir_rvalue *result = pack_uvec2_to_uint(
677 f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
678
679 assert(result->type == glsl_type::uint_type);
680 return result;
681 }
682
683 /**
684 * \brief Lower a packUnorm4x8 expression.
685 *
686 * \param vec4_rval is packUnorm4x8's input
687 * \return packUnorm4x8's output as a uint rvalue
688 */
689 ir_rvalue*
690 lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
691 {
692 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
693 *
694 * highp uint packUnorm4x8 (vec4 v)
695 * --------------------------------
696 * First, converts each component of the normalized floating-point value
697 * v into 8-bit integer values. Then, the results are packed into the
698 * returned 32-bit unsigned integer.
699 *
700 * The conversion for component c of v to fixed point is done as
701 * follows:
702 *
703 * packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
704 *
705 * The first component of the vector will be written to the least
706 * significant bits of the output; the last component will be written to
707 * the most significant bits.
708 *
709 * This function generates IR that approximates the following pseudo-GLSL:
710 *
711 * return pack_uvec4_to_uint(uvec4(
712 * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
713 *
714 * Here it is safe to directly convert the vec4 to uvec4 because the vec4
715 * has been clamped to a non-negative range.
716 */
717
718 assert(vec4_rval->type == glsl_type::vec4_type);
719
720 ir_rvalue *result = pack_uvec4_to_uint(
721 f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
722
723 assert(result->type == glsl_type::uint_type);
724 return result;
725 }
726
727 /**
728 * \brief Lower an unpackUnorm2x16 expression.
729 *
730 * \param uint_rval is unpackUnorm2x16's input
731 * \return unpackUnorm2x16's output as a vec2 rvalue
732 */
733 ir_rvalue*
734 lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
735 {
736 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
737 *
738 * highp vec2 unpackUnorm2x16 (highp uint p)
739 * -----------------------------------------
740 * First, unpacks a single 32-bit unsigned integer p into a pair of
741 * 16-bit unsigned integers. Then, each component is converted to
742 * a normalized floating-point value to generate the returned
743 * two-component vector.
744 *
745 * The conversion for unpacked fixed-point value f to floating point is
746 * done as follows:
747 *
748 * unpackUnorm2x16: f / 65535.0
749 *
750 * The first component of the returned vector will be extracted from the
751 * least significant bits of the input; the last component will be
752 * extracted from the most significant bits.
753 *
754 * This function generates IR that approximates the following pseudo-GLSL:
755 *
756 * return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
757 */
758
759 assert(uint_rval->type == glsl_type::uint_type);
760
761 ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
762 constant(65535.0f));
763
764 assert(result->type == glsl_type::vec2_type);
765 return result;
766 }
767
768 /**
769 * \brief Lower an unpackUnorm4x8 expression.
770 *
771 * \param uint_rval is unpackUnorm4x8's input
772 * \return unpackUnorm4x8's output as a vec4 rvalue
773 */
774 ir_rvalue*
775 lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
776 {
777 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
778 *
779 * highp vec4 unpackUnorm4x8 (highp uint p)
780 * ----------------------------------------
781 * First, unpacks a single 32-bit unsigned integer p into four
782 * 8-bit unsigned integers. Then, each component is converted to
783 * a normalized floating-point value to generate the returned
784 * two-component vector.
785 *
786 * The conversion for unpacked fixed-point value f to floating point is
787 * done as follows:
788 *
789 * unpackUnorm4x8: f / 255.0
790 *
791 * The first component of the returned vector will be extracted from the
792 * least significant bits of the input; the last component will be
793 * extracted from the most significant bits.
794 *
795 * This function generates IR that approximates the following pseudo-GLSL:
796 *
797 * return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
798 */
799
800 assert(uint_rval->type == glsl_type::uint_type);
801
802 ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
803 constant(255.0f));
804
805 assert(result->type == glsl_type::vec4_type);
806 return result;
807 }
808
809 /**
810 * \brief Lower the component-wise calculation of packHalf2x16.
811 *
812 * \param f_rval is one component of packHalf2x16's input
813 * \param e_rval is the unshifted exponent bits of f_rval
814 * \param m_rval is the unshifted mantissa bits of f_rval
815 *
816 * \return a uint rvalue that encodes a float16 in its lower 16 bits
817 */
818 ir_rvalue*
819 pack_half_1x16_nosign(ir_rvalue *f_rval,
820 ir_rvalue *e_rval,
821 ir_rvalue *m_rval)
822 {
823 assert(e_rval->type == glsl_type::uint_type);
824 assert(m_rval->type == glsl_type::uint_type);
825
826 /* uint u16; */
827 ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
828 "tmp_pack_half_1x16_u16");
829
830 /* float f = FLOAT_RVAL; */
831 ir_variable *f = factory.make_temp(glsl_type::float_type,
832 "tmp_pack_half_1x16_f");
833 factory.emit(assign(f, f_rval));
834
835 /* uint e = E_RVAL; */
836 ir_variable *e = factory.make_temp(glsl_type::uint_type,
837 "tmp_pack_half_1x16_e");
838 factory.emit(assign(e, e_rval));
839
840 /* uint m = M_RVAL; */
841 ir_variable *m = factory.make_temp(glsl_type::uint_type,
842 "tmp_pack_half_1x16_m");
843 factory.emit(assign(m, m_rval));
844
845 /* Preliminaries
846 * -------------
847 *
848 * For a float16, the bit layout is:
849 *
850 * sign: 15
851 * exponent: 10:14
852 * mantissa: 0:9
853 *
854 * Let f16 be a float16 value. The sign, exponent, and mantissa
855 * determine its value thus:
856 *
857 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
858 * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
859 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
860 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
861 * if e16 = 31 and m16 != 0, then NaN (5)
862 *
863 * where 0 <= m16 < 2^10.
864 *
865 * For a float32, the bit layout is:
866 *
867 * sign: 31
868 * exponent: 23:30
869 * mantissa: 0:22
870 *
871 * Let f32 be a float32 value. The sign, exponent, and mantissa
872 * determine its value thus:
873 *
874 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
875 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
876 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
877 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
878 * if e32 = 255 and m32 != 0, then NaN (14)
879 *
880 * where 0 <= m32 < 2^23.
881 *
882 * The minimum and maximum normal float16 values are
883 *
884 * min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14) (20)
885 * max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10) (21)
886 *
887 * The step at max_norm16 is
888 *
889 * max_step16 = 2^5 (22)
890 *
891 * Observe that the float16 boundary values in equations 20-21 lie in the
892 * range of normal float32 values.
893 *
894 *
895 * Rounding Behavior
896 * -----------------
897 * Not all float32 values can be exactly represented as a float16. We
898 * round all such intermediate float32 values to the nearest float16; if
899 * the float32 is exactly between to float16 values, we round to the one
900 * with an even mantissa. This rounding behavior has several benefits:
901 *
902 * - It has no sign bias.
903 *
904 * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
905 * GPU ISA.
906 *
907 * - By reproducing the behavior of the GPU (at least on Intel hardware),
908 * compile-time evaluation of constant packHalf2x16 GLSL expressions will
909 * result in the same value as if the expression were executed on the
910 * GPU.
911 *
912 * Calculation
913 * -----------
914 * Our task is to compute s16, e16, m16 given f32. Since this function
915 * ignores the sign bit, assume that s32 = s16 = 0. There are several
916 * cases consider.
917 */
918
919 factory.emit(
920
921 /* Case 1) f32 is NaN
922 *
923 * The resultant f16 will also be NaN.
924 */
925
926 /* if (e32 == 255 && m32 != 0) { */
927 if_tree(logic_and(equal(e, constant(0xffu << 23u)),
928 logic_not(equal(m, constant(0u)))),
929
930 assign(u16, constant(0x7fffu)),
931
932 /* Case 2) f32 lies in the range [0, min_norm16).
933 *
934 * The resultant float16 will be either zero, subnormal, or normal.
935 *
936 * Solving
937 *
938 * f32 = min_norm16 (30)
939 *
940 * gives
941 *
942 * e32 = 113 and m32 = 0 (31)
943 *
944 * Therefore this case occurs if and only if
945 *
946 * e32 < 113 (32)
947 */
948
949 /* } else if (e32 < 113) { */
950 if_tree(less(e, constant(113u << 23u)),
951
952 /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
953 assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
954 constant((float) (1 << 24)))))),
955
956 /* Case 3) f32 lies in the range
957 * [min_norm16, max_norm16 + max_step16).
958 *
959 * The resultant float16 will be either normal or infinite.
960 *
961 * Solving
962 *
963 * f32 = max_norm16 + max_step16 (40)
964 * = 2^15 * (1 + 1023 / 2^10) + 2^5 (41)
965 * = 2^16 (42)
966 * gives
967 *
968 * e32 = 143 and m32 = 0 (43)
969 *
970 * We already solved the boundary condition f32 = min_norm16 above
971 * in equation 31. Therefore this case occurs if and only if
972 *
973 * 113 <= e32 and e32 < 143
974 */
975
976 /* } else if (e32 < 143) { */
977 if_tree(less(e, constant(143u << 23u)),
978
979 /* The addition below handles the case where the mantissa rounds
980 * up to 1024 and bumps the exponent.
981 *
982 * u16 = ((e - (112u << 23u)) >> 13u)
983 * + round_to_even((float(m) / (1u << 13u));
984 */
985 assign(u16, add(rshift(sub(e, constant(112u << 23u)),
986 constant(13u)),
987 f2u(round_even(
988 div(u2f(m), constant((float) (1 << 13))))))),
989
990 /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
991 *
992 * The resultant float16 will be infinite.
993 *
994 * The cases above caught all float32 values in the range
995 * [0, max_norm16 + max_step16), so this is the fall-through case.
996 */
997
998 /* } else { */
999
1000 assign(u16, constant(31u << 10u))))));
1001
1002 /* } */
1003
1004 return deref(u16).val;
1005 }
1006
1007 /**
1008 * \brief Lower a packHalf2x16 expression.
1009 *
1010 * \param vec2_rval is packHalf2x16's input
1011 * \return packHalf2x16's output as a uint rvalue
1012 */
1013 ir_rvalue*
1014 lower_pack_half_2x16(ir_rvalue *vec2_rval)
1015 {
1016 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1017 *
1018 * highp uint packHalf2x16 (mediump vec2 v)
1019 * ----------------------------------------
1020 * Returns an unsigned integer obtained by converting the components of
1021 * a two-component floating-point vector to the 16-bit floating-point
1022 * representation found in the OpenGL ES Specification, and then packing
1023 * these two 16-bit integers into a 32-bit unsigned integer.
1024 *
1025 * The first vector component specifies the 16 least- significant bits
1026 * of the result; the second component specifies the 16 most-significant
1027 * bits.
1028 */
1029
1030 assert(vec2_rval->type == glsl_type::vec2_type);
1031
1032 /* vec2 f = VEC2_RVAL; */
1033 ir_variable *f = factory.make_temp(glsl_type::vec2_type,
1034 "tmp_pack_half_2x16_f");
1035 factory.emit(assign(f, vec2_rval));
1036
1037 /* uvec2 f32 = bitcast_f2u(f); */
1038 ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1039 "tmp_pack_half_2x16_f32");
1040 factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
1041
1042 /* uvec2 f16; */
1043 ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1044 "tmp_pack_half_2x16_f16");
1045
1046 /* Get f32's unshifted exponent bits.
1047 *
1048 * uvec2 e = f32 & 0x7f800000u;
1049 */
1050 ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1051 "tmp_pack_half_2x16_e");
1052 factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
1053
1054 /* Get f32's unshifted mantissa bits.
1055 *
1056 * uvec2 m = f32 & 0x007fffffu;
1057 */
1058 ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1059 "tmp_pack_half_2x16_m");
1060 factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
1061
1062 /* Set f16's exponent and mantissa bits.
1063 *
1064 * f16.x = pack_half_1x16_nosign(e.x, m.x);
1065 * f16.y = pack_half_1y16_nosign(e.y, m.y);
1066 */
1067 factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
1068 swizzle_x(e),
1069 swizzle_x(m)),
1070 WRITEMASK_X));
1071 factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
1072 swizzle_y(e),
1073 swizzle_y(m)),
1074 WRITEMASK_Y));
1075
1076 /* Set f16's sign bits.
1077 *
1078 * f16 |= (f32 & (1u << 31u) >> 16u;
1079 */
1080 factory.emit(
1081 assign(f16, bit_or(f16,
1082 rshift(bit_and(f32, constant(1u << 31u)),
1083 constant(16u)))));
1084
1085
1086 /* return (f16.y << 16u) | f16.x; */
1087 ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
1088 constant(16u)),
1089 swizzle_x(f16));
1090
1091 assert(result->type == glsl_type::uint_type);
1092 return result;
1093 }
1094
1095 /**
1096 * \brief Split packHalf2x16's vec2 operand into two floats.
1097 *
1098 * \param vec2_rval is packHalf2x16's input
1099 * \return a uint rvalue
1100 *
1101 * Some code generators, such as the i965 fragment shader, require that all
1102 * vector expressions be lowered to a sequence of scalar expressions.
1103 * However, packHalf2x16 cannot be scalarized by the same mechanism as
1104 * a true vector operation because its input and output have a differing
1105 * number of vector components.
1106 *
1107 * This method scalarizes packHalf2x16 by transforming it from a unary
1108 * operation having vector input to a binary operation having scalar input.
1109 * That is, it transforms
1110 *
1111 * packHalf2x16(VEC2_RVAL);
1112 *
1113 * into
1114 *
1115 * vec2 v = VEC2_RVAL;
1116 * return packHalf2x16_split(v.x, v.y);
1117 */
1118 ir_rvalue*
1119 split_pack_half_2x16(ir_rvalue *vec2_rval)
1120 {
1121 assert(vec2_rval->type == glsl_type::vec2_type);
1122
1123 ir_variable *v = factory.make_temp(glsl_type::vec2_type,
1124 "tmp_split_pack_half_2x16_v");
1125 factory.emit(assign(v, vec2_rval));
1126
1127 return expr(ir_binop_pack_half_2x16_split, swizzle_x(v), swizzle_y(v));
1128 }
1129
1130 /**
1131 * \brief Lower the component-wise calculation of unpackHalf2x16.
1132 *
1133 * Given a uint that encodes a float16 in its lower 16 bits, this function
1134 * returns a uint that encodes a float32 with the same value. The sign bit
1135 * of the float16 is ignored.
1136 *
1137 * \param e_rval is the unshifted exponent bits of a float16
1138 * \param m_rval is the unshifted mantissa bits of a float16
1139 * \return a uint rvalue that encodes a float32
1140 */
1141 ir_rvalue*
1142 unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
1143 {
1144 assert(e_rval->type == glsl_type::uint_type);
1145 assert(m_rval->type == glsl_type::uint_type);
1146
1147 /* uint u32; */
1148 ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
1149 "tmp_unpack_half_1x16_u32");
1150
1151 /* uint e = E_RVAL; */
1152 ir_variable *e = factory.make_temp(glsl_type::uint_type,
1153 "tmp_unpack_half_1x16_e");
1154 factory.emit(assign(e, e_rval));
1155
1156 /* uint m = M_RVAL; */
1157 ir_variable *m = factory.make_temp(glsl_type::uint_type,
1158 "tmp_unpack_half_1x16_m");
1159 factory.emit(assign(m, m_rval));
1160
1161 /* Preliminaries
1162 * -------------
1163 *
1164 * For a float16, the bit layout is:
1165 *
1166 * sign: 15
1167 * exponent: 10:14
1168 * mantissa: 0:9
1169 *
1170 * Let f16 be a float16 value. The sign, exponent, and mantissa
1171 * determine its value thus:
1172 *
1173 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
1174 * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
1175 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1176 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
1177 * if e16 = 31 and m16 != 0, then NaN (5)
1178 *
1179 * where 0 <= m16 < 2^10.
1180 *
1181 * For a float32, the bit layout is:
1182 *
1183 * sign: 31
1184 * exponent: 23:30
1185 * mantissa: 0:22
1186 *
1187 * Let f32 be a float32 value. The sign, exponent, and mantissa
1188 * determine its value thus:
1189 *
1190 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
1191 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
1192 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1193 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
1194 * if e32 = 255 and m32 != 0, then NaN (14)
1195 *
1196 * where 0 <= m32 < 2^23.
1197 *
1198 * Calculation
1199 * -----------
1200 * Our task is to compute s32, e32, m32 given f16. Since this function
1201 * ignores the sign bit, assume that s32 = s16 = 0. There are several
1202 * cases consider.
1203 */
1204
1205 factory.emit(
1206
1207 /* Case 1) f16 is zero or subnormal.
1208 *
1209 * The simplest method of calcuating f32 in this case is
1210 *
1211 * f32 = f16 (20)
1212 * = 2^(-14) * (m16 / 2^10) (21)
1213 * = m16 / 2^(-24) (22)
1214 */
1215
1216 /* if (e16 == 0) { */
1217 if_tree(equal(e, constant(0u)),
1218
1219 /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1220 assign(u32, expr(ir_unop_bitcast_f2u,
1221 div(u2f(m), constant((float)(1 << 24))))),
1222
1223 /* Case 2) f16 is normal.
1224 *
1225 * The equation
1226 *
1227 * f32 = f16 (30)
1228 * 2^(e32 - 127) * (1 + m32 / 2^23) = (31)
1229 * 2^(e16 - 15) * (1 + m16 / 2^10)
1230 *
1231 * can be decomposed into two
1232 *
1233 * 2^(e32 - 127) = 2^(e16 - 15) (32)
1234 * 1 + m32 / 2^23 = 1 + m16 / 2^10 (33)
1235 *
1236 * which solve to
1237 *
1238 * e32 = e16 + 112 (34)
1239 * m32 = m16 * 2^13 (35)
1240 */
1241
1242 /* } else if (e16 < 31)) { */
1243 if_tree(less(e, constant(31u << 10u)),
1244
1245 /* u32 = ((e + (112 << 10)) | m) << 13;
1246 */
1247 assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
1248 constant(13u))),
1249
1250
1251 /* Case 3) f16 is infinite. */
1252 if_tree(equal(m, constant(0u)),
1253
1254 assign(u32, constant(255u << 23u)),
1255
1256 /* Case 4) f16 is NaN. */
1257 /* } else { */
1258
1259 assign(u32, constant(0x7fffffffu))))));
1260
1261 /* } */
1262
1263 return deref(u32).val;
1264 }
1265
1266 /**
1267 * \brief Lower an unpackHalf2x16 expression.
1268 *
1269 * \param uint_rval is unpackHalf2x16's input
1270 * \return unpackHalf2x16's output as a vec2 rvalue
1271 */
1272 ir_rvalue*
1273 lower_unpack_half_2x16(ir_rvalue *uint_rval)
1274 {
1275 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1276 *
1277 * mediump vec2 unpackHalf2x16 (highp uint v)
1278 * ------------------------------------------
1279 * Returns a two-component floating-point vector with components
1280 * obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1281 * values, interpreting those values as 16-bit floating-point numbers
1282 * according to the OpenGL ES Specification, and converting them to
1283 * 32-bit floating-point values.
1284 *
1285 * The first component of the vector is obtained from the
1286 * 16 least-significant bits of v; the second component is obtained
1287 * from the 16 most-significant bits of v.
1288 */
1289 assert(uint_rval->type == glsl_type::uint_type);
1290
1291 /* uint u = RVALUE;
1292 * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16);
1293 */
1294 ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1295 "tmp_unpack_half_2x16_f16");
1296 factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
1297
1298 /* uvec2 f32; */
1299 ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1300 "tmp_unpack_half_2x16_f32");
1301
1302 /* Get f16's unshifted exponent bits.
1303 *
1304 * uvec2 e = f16 & 0x7c00u;
1305 */
1306 ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1307 "tmp_unpack_half_2x16_e");
1308 factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
1309
1310 /* Get f16's unshifted mantissa bits.
1311 *
1312 * uvec2 m = f16 & 0x03ffu;
1313 */
1314 ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1315 "tmp_unpack_half_2x16_m");
1316 factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
1317
1318 /* Set f32's exponent and mantissa bits.
1319 *
1320 * f32.x = unpack_half_1x16_nosign(e.x, m.x);
1321 * f32.y = unpack_half_1x16_nosign(e.y, m.y);
1322 */
1323 factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
1324 swizzle_x(m)),
1325 WRITEMASK_X));
1326 factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
1327 swizzle_y(m)),
1328 WRITEMASK_Y));
1329
1330 /* Set f32's sign bit.
1331 *
1332 * f32 |= (f16 & 0x8000u) << 16u;
1333 */
1334 factory.emit(assign(f32, bit_or(f32,
1335 lshift(bit_and(f16,
1336 constant(0x8000u)),
1337 constant(16u)))));
1338
1339 /* return bitcast_u2f(f32); */
1340 ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
1341 assert(result->type == glsl_type::vec2_type);
1342 return result;
1343 }
1344
1345 /**
1346 * \brief Split unpackHalf2x16 into two operations.
1347 *
1348 * \param uint_rval is unpackHalf2x16's input
1349 * \return a vec2 rvalue
1350 *
1351 * Some code generators, such as the i965 fragment shader, require that all
1352 * vector expressions be lowered to a sequence of scalar expressions.
1353 * However, unpackHalf2x16 cannot be scalarized by the same method as
1354 * a true vector operation because the number of components of its input
1355 * and output differ.
1356 *
1357 * This method scalarizes unpackHalf2x16 by transforming it from a single
1358 * operation having vec2 output to a pair of operations each having float
1359 * output. That is, it transforms
1360 *
1361 * unpackHalf2x16(UINT_RVAL)
1362 *
1363 * into
1364 *
1365 * uint u = UINT_RVAL;
1366 * vec2 v;
1367 *
1368 * v.x = unpackHalf2x16_split_x(u);
1369 * v.y = unpackHalf2x16_split_y(u);
1370 *
1371 * return v;
1372 */
1373 ir_rvalue*
1374 split_unpack_half_2x16(ir_rvalue *uint_rval)
1375 {
1376 assert(uint_rval->type == glsl_type::uint_type);
1377
1378 /* uint u = uint_rval; */
1379 ir_variable *u = factory.make_temp(glsl_type::uint_type,
1380 "tmp_split_unpack_half_2x16_u");
1381 factory.emit(assign(u, uint_rval));
1382
1383 /* vec2 v; */
1384 ir_variable *v = factory.make_temp(glsl_type::vec2_type,
1385 "tmp_split_unpack_half_2x16_v");
1386
1387 /* v.x = unpack_half_2x16_split_x(u); */
1388 factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u),
1389 WRITEMASK_X));
1390
1391 /* v.y = unpack_half_2x16_split_y(u); */
1392 factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u),
1393 WRITEMASK_Y));
1394
1395 return deref(v).val;
1396 }
1397 };
1398
1399 } // namespace anonymous
1400
1401 /**
1402 * \brief Lower the builtin packing functions.
1403 *
1404 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
1405 */
1406 bool
1407 lower_packing_builtins(exec_list *instructions, int op_mask)
1408 {
1409 lower_packing_builtins_visitor v(op_mask);
1410 visit_list_elements(&v, instructions, true);
1411 return v.get_progress();
1412 }