src/compiler/glsl/lower_packing_builtins.cpp

   1 /*
   2  * Copyright © 2012 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 #include "ir.h"
  25 #include "ir_builder.h"
  26 #include "ir_optimization.h"
  27 #include "ir_rvalue_visitor.h"
  28
  29 namespace {
  30
  31 using namespace ir_builder;
  32
  33 /**
  34  * A visitor that lowers built-in floating-point pack/unpack expressions
 * such as packSnorm2x16.
  36  */
  37 class lower_packing_builtins_visitor : public ir_rvalue_visitor {
  38 public:
  39    /**
  40     * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
  41     */
  42    explicit lower_packing_builtins_visitor(int op_mask)
  43       : op_mask(op_mask),
  44         progress(false)
  45    {
  46       factory.instructions = &factory_instructions;
  47    }
  48
  49    virtual ~lower_packing_builtins_visitor()
  50    {
  51       assert(factory_instructions.is_empty());
  52    }
  53
  54    bool get_progress() { return progress; }
  55
  56    void handle_rvalue(ir_rvalue **rvalue)
  57    {
  58       if (!*rvalue)
  59          return;
  60
  61       ir_expression *expr = (*rvalue)->as_expression();
  62       if (!expr)
  63          return;
  64
  65       enum lower_packing_builtins_op lowering_op =
  66          choose_lowering_op(expr->operation);
  67
  68       if (lowering_op == LOWER_PACK_UNPACK_NONE)
  69          return;
  70
  71       setup_factory(ralloc_parent(expr));
  72
  73       ir_rvalue *op0 = expr->operands[0];
  74       ralloc_steal(factory.mem_ctx, op0);
  75
  76       switch (lowering_op) {
  77       case LOWER_PACK_SNORM_2x16:
  78          *rvalue = lower_pack_snorm_2x16(op0);
  79          break;
  80       case LOWER_PACK_SNORM_4x8:
  81          *rvalue = lower_pack_snorm_4x8(op0);
  82          break;
  83       case LOWER_PACK_UNORM_2x16:
  84          *rvalue = lower_pack_unorm_2x16(op0);
  85          break;
  86       case LOWER_PACK_UNORM_4x8:
  87          *rvalue = lower_pack_unorm_4x8(op0);
  88          break;
  89       case LOWER_PACK_HALF_2x16:
  90          *rvalue = lower_pack_half_2x16(op0);
  91          break;
  92       case LOWER_UNPACK_SNORM_2x16:
  93          *rvalue = lower_unpack_snorm_2x16(op0);
  94          break;
  95       case LOWER_UNPACK_SNORM_4x8:
  96          *rvalue = lower_unpack_snorm_4x8(op0);
  97          break;
  98       case LOWER_UNPACK_UNORM_2x16:
  99          *rvalue = lower_unpack_unorm_2x16(op0);
 100          break;
 101       case LOWER_UNPACK_UNORM_4x8:
 102          *rvalue = lower_unpack_unorm_4x8(op0);
 103          break;
 104       case LOWER_UNPACK_HALF_2x16:
 105          *rvalue = lower_unpack_half_2x16(op0);
 106          break;
 107       case LOWER_PACK_UNPACK_NONE:
 108       case LOWER_PACK_USE_BFI:
 109       case LOWER_PACK_USE_BFE:
 110          assert(!"not reached");
 111          break;
 112       }
 113
 114       teardown_factory();
 115       progress = true;
 116    }
 117
 118 private:
 119    const int op_mask;
 120    bool progress;
 121    ir_factory factory;
 122    exec_list factory_instructions;
 123
 124    /**
 125     * Determine the needed lowering operation by filtering \a expr_op
 126     * through \ref op_mask.
 127     */
 128    enum lower_packing_builtins_op
 129    choose_lowering_op(ir_expression_operation expr_op)
 130    {
 131       /* C++ regards int and enum as fundamentally different types.
 132        * So, we can't simply return from each case; we must cast the return
 133        * value.
 134        */
 135       int result;
 136
 137       switch (expr_op) {
 138       case ir_unop_pack_snorm_2x16:
 139          result = op_mask & LOWER_PACK_SNORM_2x16;
 140          break;
 141       case ir_unop_pack_snorm_4x8:
 142          result = op_mask & LOWER_PACK_SNORM_4x8;
 143          break;
 144       case ir_unop_pack_unorm_2x16:
 145          result = op_mask & LOWER_PACK_UNORM_2x16;
 146          break;
 147       case ir_unop_pack_unorm_4x8:
 148          result = op_mask & LOWER_PACK_UNORM_4x8;
 149          break;
 150       case ir_unop_pack_half_2x16:
 151          result = op_mask & LOWER_PACK_HALF_2x16;
 152          break;
 153       case ir_unop_unpack_snorm_2x16:
 154          result = op_mask & LOWER_UNPACK_SNORM_2x16;
 155          break;
 156       case ir_unop_unpack_snorm_4x8:
 157          result = op_mask & LOWER_UNPACK_SNORM_4x8;
 158          break;
 159       case ir_unop_unpack_unorm_2x16:
 160          result = op_mask & LOWER_UNPACK_UNORM_2x16;
 161          break;
 162       case ir_unop_unpack_unorm_4x8:
 163          result = op_mask & LOWER_UNPACK_UNORM_4x8;
 164          break;
 165       case ir_unop_unpack_half_2x16:
 166          result = op_mask & LOWER_UNPACK_HALF_2x16;
 167          break;
 168       default:
 169          result = LOWER_PACK_UNPACK_NONE;
 170          break;
 171       }
 172
 173       return static_cast<enum lower_packing_builtins_op>(result);
 174    }
 175
 176    void
 177    setup_factory(void *mem_ctx)
 178    {
 179       assert(factory.mem_ctx == NULL);
 180       assert(factory.instructions->is_empty());
 181
 182       factory.mem_ctx = mem_ctx;
 183    }
 184
 185    void
 186    teardown_factory()
 187    {
 188       base_ir->insert_before(factory.instructions);
 189       assert(factory.instructions->is_empty());
 190       factory.mem_ctx = NULL;
 191    }
 192
 193    template <typename T>
 194    ir_constant*
 195    constant(T x)
 196    {
 197       return factory.constant(x);
 198    }
 199
 200    /**
 201     * \brief Pack two uint16's into a single uint32.
 202     *
 203     * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
 204     * where the least significant bits specify the first element of the pair.
 205     * Return the uint32.
 206     */
 207    ir_rvalue*
 208    pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
 209    {
 210       assert(uvec2_rval->type == glsl_type::uvec2_type);
 211
 212       /* uvec2 u = UVEC2_RVAL; */
 213       ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
 214                                          "tmp_pack_uvec2_to_uint");
 215       factory.emit(assign(u, uvec2_rval));
 216
 217       if (op_mask & LOWER_PACK_USE_BFI) {
 218          return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
 219                                 swizzle_y(u),
 220                                 constant(16u),
 221                                 constant(16u));
 222       }
 223
 224       /* return (u.y << 16) | (u.x & 0xffff); */
 225       return bit_or(lshift(swizzle_y(u), constant(16u)),
 226                     bit_and(swizzle_x(u), constant(0xffffu)));
 227    }
 228
 229    /**
 230     * \brief Pack four uint8's into a single uint32.
 231     *
    * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
 233     * uint32 where the least significant bits specify the first element of the
 234     * 4-tuple. Return the uint32.
 235     */
 236    ir_rvalue*
 237    pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
 238    {
 239       assert(uvec4_rval->type == glsl_type::uvec4_type);
 240
 241       ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
 242                                          "tmp_pack_uvec4_to_uint");
 243
 244       if (op_mask & LOWER_PACK_USE_BFI) {
 245          /* uvec4 u = UVEC4_RVAL; */
 246          factory.emit(assign(u, uvec4_rval));
 247
 248          return bitfield_insert(bitfield_insert(
 249                                    bitfield_insert(
 250                                       bit_and(swizzle_x(u), constant(0xffu)),
 251                                       swizzle_y(u), constant(8u), constant(8u)),
 252                                    swizzle_z(u), constant(16u), constant(8u)),
 253                                 swizzle_w(u), constant(24u), constant(8u));
 254       }
 255
 256       /* uvec4 u = UVEC4_RVAL & 0xff */
 257       factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
 258
 259       /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
 260       return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
 261                            lshift(swizzle_z(u), constant(16u))),
 262                     bit_or(lshift(swizzle_y(u), constant(8u)),
 263                            swizzle_x(u)));
 264    }
 265
 266    /**
 267     * \brief Unpack a uint32 into two uint16's.
 268     *
 269     * Interpret the given uint32 as a uint16 pair where the uint32's least
 270     * significant bits specify the pair's first element. Return the uint16
 271     * pair as a uvec2.
 272     */
 273    ir_rvalue*
 274    unpack_uint_to_uvec2(ir_rvalue *uint_rval)
 275    {
 276       assert(uint_rval->type == glsl_type::uint_type);
 277
 278       /* uint u = UINT_RVAL; */
 279       ir_variable *u = factory.make_temp(glsl_type::uint_type,
 280                                           "tmp_unpack_uint_to_uvec2_u");
 281       factory.emit(assign(u, uint_rval));
 282
 283       /* uvec2 u2; */
 284       ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
 285                                            "tmp_unpack_uint_to_uvec2_u2");
 286
 287       /* u2.x = u & 0xffffu; */
 288       factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
 289
 290       /* u2.y = u >> 16u; */
 291       factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
 292
 293       return deref(u2).val;
 294    }
 295
 296    /**
 297     * \brief Unpack a uint32 into two int16's.
 298     *
 299     * Specifically each 16-bit value is sign-extended to the full width of an
 300     * int32 on return.
 301     */
 302    ir_rvalue *
 303    unpack_uint_to_ivec2(ir_rvalue *uint_rval)
 304    {
 305       assert(uint_rval->type == glsl_type::uint_type);
 306
 307       if (!(op_mask & LOWER_PACK_USE_BFE)) {
 308          return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
 309                               constant(16u)),
 310                        constant(16u));
 311       }
 312
 313       ir_variable *i = factory.make_temp(glsl_type::int_type,
 314                                          "tmp_unpack_uint_to_ivec2_i");
 315       factory.emit(assign(i, u2i(uint_rval)));
 316
 317       /* ivec2 i2; */
 318       ir_variable *i2 = factory.make_temp(glsl_type::ivec2_type,
 319                                           "tmp_unpack_uint_to_ivec2_i2");
 320
 321       factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
 322                           WRITEMASK_X));
 323       factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
 324                           WRITEMASK_Y));
 325
 326       return deref(i2).val;
 327    }
 328
 329    /**
 330     * \brief Unpack a uint32 into four uint8's.
 331     *
 332     * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
 333     * significant bits specify the 4-tuple's first element. Return the uint8
 334     * 4-tuple as a uvec4.
 335     */
 336    ir_rvalue*
 337    unpack_uint_to_uvec4(ir_rvalue *uint_rval)
 338    {
 339       assert(uint_rval->type == glsl_type::uint_type);
 340
 341       /* uint u = UINT_RVAL; */
 342       ir_variable *u = factory.make_temp(glsl_type::uint_type,
 343                                           "tmp_unpack_uint_to_uvec4_u");
 344       factory.emit(assign(u, uint_rval));
 345
 346       /* uvec4 u4; */
 347       ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
 348                                            "tmp_unpack_uint_to_uvec4_u4");
 349
 350       /* u4.x = u & 0xffu; */
 351       factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
 352
 353       if (op_mask & LOWER_PACK_USE_BFE) {
 354          /* u4.y = bitfield_extract(u, 8, 8); */
 355          factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
 356                              WRITEMASK_Y));
 357
 358          /* u4.z = bitfield_extract(u, 16, 8); */
 359          factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
 360                              WRITEMASK_Z));
 361       } else {
 362          /* u4.y = (u >> 8u) & 0xffu; */
 363          factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
 364                                          constant(0xffu)), WRITEMASK_Y));
 365
 366          /* u4.z = (u >> 16u) & 0xffu; */
 367          factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
 368                                          constant(0xffu)), WRITEMASK_Z));
 369       }
 370
 371       /* u4.w = (u >> 24u) */
 372       factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
 373
 374       return deref(u4).val;
 375    }
 376
 377    /**
 378     * \brief Unpack a uint32 into four int8's.
 379     *
 380     * Specifically each 8-bit value is sign-extended to the full width of an
 381     * int32 on return.
 382     */
 383    ir_rvalue *
 384    unpack_uint_to_ivec4(ir_rvalue *uint_rval)
 385    {
 386       assert(uint_rval->type == glsl_type::uint_type);
 387
 388       if (!(op_mask & LOWER_PACK_USE_BFE)) {
 389          return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
 390                               constant(24u)),
 391                        constant(24u));
 392       }
 393
 394       ir_variable *i = factory.make_temp(glsl_type::int_type,
 395                                          "tmp_unpack_uint_to_ivec4_i");
 396       factory.emit(assign(i, u2i(uint_rval)));
 397
 398       /* ivec4 i4; */
 399       ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type,
 400                                           "tmp_unpack_uint_to_ivec4_i4");
 401
 402       factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
 403                           WRITEMASK_X));
 404       factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
 405                           WRITEMASK_Y));
 406       factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
 407                           WRITEMASK_Z));
 408       factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
 409                           WRITEMASK_W));
 410
 411       return deref(i4).val;
 412    }
 413
 414    /**
 415     * \brief Lower a packSnorm2x16 expression.
 416     *
 417     * \param vec2_rval is packSnorm2x16's input
 418     * \return packSnorm2x16's output as a uint rvalue
 419     */
 420    ir_rvalue*
 421    lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
 422    {
 423       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
 424        *
 425        *    highp uint packSnorm2x16(vec2 v)
 426        *    --------------------------------
 427        *    First, converts each component of the normalized floating-point value
 428        *    v into 16-bit integer values. Then, the results are packed into the
 429        *    returned 32-bit unsigned integer.
 430        *
 431        *    The conversion for component c of v to fixed point is done as
 432        *    follows:
 433        *
 434        *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
 435        *
 436        *    The first component of the vector will be written to the least
 437        *    significant bits of the output; the last component will be written to
 438        *    the most significant bits.
 439        *
 440        * This function generates IR that approximates the following pseudo-GLSL:
 441        *
 442        *     return pack_uvec2_to_uint(
 443        *         uvec2(ivec2(
 444        *           round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
 445        *
 446        * It is necessary to first convert the vec2 to ivec2 rather than directly
 447        * converting vec2 to uvec2 because the latter conversion is undefined.
 448        * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
 449        * convert a negative floating point value to an uint".
 450        */
 451       assert(vec2_rval->type == glsl_type::vec2_type);
 452
 453       ir_rvalue *result = pack_uvec2_to_uint(
 454             i2u(f2i(round_even(mul(clamp(vec2_rval,
 455                                          constant(-1.0f),
 456                                          constant(1.0f)),
 457                                    constant(32767.0f))))));
 458
 459       assert(result->type == glsl_type::uint_type);
 460       return result;
 461    }
 462
 463    /**
 464     * \brief Lower a packSnorm4x8 expression.
 465     *
 466     * \param vec4_rval is packSnorm4x8's input
 467     * \return packSnorm4x8's output as a uint rvalue
 468     */
 469    ir_rvalue*
 470    lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
 471    {
 472       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 473        *
 474        *    highp uint packSnorm4x8(vec4 v)
 475        *    -------------------------------
 476        *    First, converts each component of the normalized floating-point value
 477        *    v into 8-bit integer values. Then, the results are packed into the
 478        *    returned 32-bit unsigned integer.
 479        *
 480        *    The conversion for component c of v to fixed point is done as
 481        *    follows:
 482        *
 483        *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
 484        *
 485        *    The first component of the vector will be written to the least
 486        *    significant bits of the output; the last component will be written to
 487        *    the most significant bits.
 488        *
 489        * This function generates IR that approximates the following pseudo-GLSL:
 490        *
 491        *     return pack_uvec4_to_uint(
 492        *         uvec4(ivec4(
 493        *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
 494        *
 495        * It is necessary to first convert the vec4 to ivec4 rather than directly
 496        * converting vec4 to uvec4 because the latter conversion is undefined.
 497        * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
 498        * convert a negative floating point value to an uint".
 499        */
 500       assert(vec4_rval->type == glsl_type::vec4_type);
 501
 502       ir_rvalue *result = pack_uvec4_to_uint(
 503             i2u(f2i(round_even(mul(clamp(vec4_rval,
 504                                          constant(-1.0f),
 505                                          constant(1.0f)),
 506                                    constant(127.0f))))));
 507
 508       assert(result->type == glsl_type::uint_type);
 509       return result;
 510    }
 511
 512    /**
 513     * \brief Lower an unpackSnorm2x16 expression.
 514     *
 515     * \param uint_rval is unpackSnorm2x16's input
 516     * \return unpackSnorm2x16's output as a vec2 rvalue
 517     */
 518    ir_rvalue*
 519    lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
 520    {
 521       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
 522        *
 523        *    highp vec2 unpackSnorm2x16 (highp uint p)
 524        *    -----------------------------------------
 525        *    First, unpacks a single 32-bit unsigned integer p into a pair of
 526        *    16-bit unsigned integers. Then, each component is converted to
 527        *    a normalized floating-point value to generate the returned
 528        *    two-component vector.
 529        *
 530        *    The conversion for unpacked fixed-point value f to floating point is
 531        *    done as follows:
 532        *
 533        *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
 534        *
 535        *    The first component of the returned vector will be extracted from the
 536        *    least significant bits of the input; the last component will be
 537        *    extracted from the most significant bits.
 538        *
 539        * This function generates IR that approximates the following pseudo-GLSL:
 540        *
 541        *    return clamp(
 542        *       ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
 543        *       -1.0f, 1.0f);
 544        *
 545        * The above IR may appear unnecessarily complex, but the intermediate
 546        * conversion to ivec2 and the bit shifts are necessary to correctly unpack
 547        * negative floats.
 548        *
 549        * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
 550        * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
 551        * place that int16 into an int32, which results in the *positive* integer
 552        * 0x0000ffff.  The int16's sign bit becomes, in the int32, the rather
 553        * unimportant bit 16. We must now extend the int16's sign bit into bits
 554        * 17-32, which is accomplished by left-shifting then right-shifting.
 555        */
 556
 557       assert(uint_rval->type == glsl_type::uint_type);
 558
 559       ir_rvalue *result =
 560         clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
 561                   constant(32767.0f)),
 562               constant(-1.0f),
 563               constant(1.0f));
 564
 565       assert(result->type == glsl_type::vec2_type);
 566       return result;
 567    }
 568
 569    /**
 570     * \brief Lower an unpackSnorm4x8 expression.
 571     *
 572     * \param uint_rval is unpackSnorm4x8's input
 573     * \return unpackSnorm4x8's output as a vec4 rvalue
 574     */
 575    ir_rvalue*
 576    lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
 577    {
 578       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 579        *
 580        *    highp vec4 unpackSnorm4x8 (highp uint p)
 581        *    ----------------------------------------
 582        *    First, unpacks a single 32-bit unsigned integer p into four
 583        *    8-bit unsigned integers. Then, each component is converted to
 584        *    a normalized floating-point value to generate the returned
 585        *    four-component vector.
 586        *
 587        *    The conversion for unpacked fixed-point value f to floating point is
 588        *    done as follows:
 589        *
 590        *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
 591        *
 592        *    The first component of the returned vector will be extracted from the
 593        *    least significant bits of the input; the last component will be
 594        *    extracted from the most significant bits.
 595        *
 596        * This function generates IR that approximates the following pseudo-GLSL:
 597        *
 598        *    return clamp(
 599        *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
 600        *       -1.0f, 1.0f);
 601        *
 602        * The above IR may appear unnecessarily complex, but the intermediate
 603        * conversion to ivec4 and the bit shifts are necessary to correctly unpack
 604        * negative floats.
 605        *
 606        * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
 607        * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
 608        * place that int8 into an int32, which results in the *positive* integer
 609        * 0x000000ff.  The int8's sign bit becomes, in the int32, the rather
 610        * unimportant bit 8. We must now extend the int8's sign bit into bits
 611        * 9-32, which is accomplished by left-shifting then right-shifting.
 612        */
 613
 614       assert(uint_rval->type == glsl_type::uint_type);
 615
 616       ir_rvalue *result =
 617         clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
 618                   constant(127.0f)),
 619               constant(-1.0f),
 620               constant(1.0f));
 621
 622       assert(result->type == glsl_type::vec4_type);
 623       return result;
 624    }
 625
 626    /**
 627     * \brief Lower a packUnorm2x16 expression.
 628     *
 629     * \param vec2_rval is packUnorm2x16's input
 630     * \return packUnorm2x16's output as a uint rvalue
 631     */
 632    ir_rvalue*
 633    lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
 634    {
 635       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
 636        *
 637        *    highp uint packUnorm2x16 (vec2 v)
 638        *    ---------------------------------
 639        *    First, converts each component of the normalized floating-point value
 640        *    v into 16-bit integer values. Then, the results are packed into the
 641        *    returned 32-bit unsigned integer.
 642        *
 643        *    The conversion for component c of v to fixed point is done as
 644        *    follows:
 645        *
 646        *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
 647        *
 648        *    The first component of the vector will be written to the least
 649        *    significant bits of the output; the last component will be written to
 650        *    the most significant bits.
 651        *
 652        * This function generates IR that approximates the following pseudo-GLSL:
 653        *
 654        *     return pack_uvec2_to_uint(uvec2(
 655        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
 656        *
 657        * Here it is safe to directly convert the vec2 to uvec2 because the vec2
 658        * has been clamped to a non-negative range.
 659        */
 660
 661       assert(vec2_rval->type == glsl_type::vec2_type);
 662
 663       ir_rvalue *result = pack_uvec2_to_uint(
 664          f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
 665
 666       assert(result->type == glsl_type::uint_type);
 667       return result;
 668    }
 669
 670    /**
 671     * \brief Lower a packUnorm4x8 expression.
 672     *
 673     * \param vec4_rval is packUnorm4x8's input
 674     * \return packUnorm4x8's output as a uint rvalue
 675     */
 676    ir_rvalue*
 677    lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
 678    {
 679       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 680        *
 681        *    highp uint packUnorm4x8 (vec4 v)
 682        *    --------------------------------
 683        *    First, converts each component of the normalized floating-point value
 684        *    v into 8-bit integer values. Then, the results are packed into the
 685        *    returned 32-bit unsigned integer.
 686        *
 687        *    The conversion for component c of v to fixed point is done as
 688        *    follows:
 689        *
 690        *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
 691        *
 692        *    The first component of the vector will be written to the least
 693        *    significant bits of the output; the last component will be written to
 694        *    the most significant bits.
 695        *
 696        * This function generates IR that approximates the following pseudo-GLSL:
 697        *
 698        *     return pack_uvec4_to_uint(uvec4(
 699        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
 700        *
 701        * Here it is safe to directly convert the vec4 to uvec4 because the vec4
 702        * has been clamped to a non-negative range.
 703        */
 704
 705       assert(vec4_rval->type == glsl_type::vec4_type);
 706
 707       ir_rvalue *result = pack_uvec4_to_uint(
 708          f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
 709
 710       assert(result->type == glsl_type::uint_type);
 711       return result;
 712    }
 713
 714    /**
 715     * \brief Lower an unpackUnorm2x16 expression.
 716     *
 717     * \param uint_rval is unpackUnorm2x16's input
 718     * \return unpackUnorm2x16's output as a vec2 rvalue
 719     */
 720    ir_rvalue*
 721    lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
 722    {
 723       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
 724        *
 725        *    highp vec2 unpackUnorm2x16 (highp uint p)
 726        *    -----------------------------------------
 727        *    First, unpacks a single 32-bit unsigned integer p into a pair of
 728        *    16-bit unsigned integers. Then, each component is converted to
 729        *    a normalized floating-point value to generate the returned
 730        *    two-component vector.
 731        *
 732        *    The conversion for unpacked fixed-point value f to floating point is
 733        *    done as follows:
 734        *
 735        *       unpackUnorm2x16: f / 65535.0
 736        *
 737        *    The first component of the returned vector will be extracted from the
 738        *    least significant bits of the input; the last component will be
 739        *    extracted from the most significant bits.
 740        *
 741        * This function generates IR that approximates the following pseudo-GLSL:
 742        *
 743        *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
 744        */
 745
 746       assert(uint_rval->type == glsl_type::uint_type);
 747
 748       ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
 749                               constant(65535.0f));
 750
 751       assert(result->type == glsl_type::vec2_type);
 752       return result;
 753    }
 754
 755    /**
 756     * \brief Lower an unpackUnorm4x8 expression.
 757     *
 758     * \param uint_rval is unpackUnorm4x8's input
 759     * \return unpackUnorm4x8's output as a vec4 rvalue
 760     */
 761    ir_rvalue*
 762    lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
 763    {
 764       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 765        *
 766        *    highp vec4 unpackUnorm4x8 (highp uint p)
 767        *    ----------------------------------------
 768        *    First, unpacks a single 32-bit unsigned integer p into four
 769        *    8-bit unsigned integers. Then, each component is converted to
 770        *    a normalized floating-point value to generate the returned
 771        *    two-component vector.
 772        *
 773        *    The conversion for unpacked fixed-point value f to floating point is
 774        *    done as follows:
 775        *
 776        *       unpackUnorm4x8: f / 255.0
 777        *
 778        *    The first component of the returned vector will be extracted from the
 779        *    least significant bits of the input; the last component will be
 780        *    extracted from the most significant bits.
 781        *
 782        * This function generates IR that approximates the following pseudo-GLSL:
 783        *
 784        *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
 785        */
 786
 787       assert(uint_rval->type == glsl_type::uint_type);
 788
 789       ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
 790                               constant(255.0f));
 791
 792       assert(result->type == glsl_type::vec4_type);
 793       return result;
 794    }
 795
 796    /**
 797     * \brief Lower the component-wise calculation of packHalf2x16.
 798     *
 799     * \param f_rval is one component of packHafl2x16's input
 800     * \param e_rval is the unshifted exponent bits of f_rval
 801     * \param m_rval is the unshifted mantissa bits of f_rval
 802     *
 803     * \return a uint rvalue that encodes a float16 in its lower 16 bits
 804     */
 805    ir_rvalue*
 806    pack_half_1x16_nosign(ir_rvalue *f_rval,
 807                          ir_rvalue *e_rval,
 808                          ir_rvalue *m_rval)
 809    {
 810       assert(e_rval->type == glsl_type::uint_type);
 811       assert(m_rval->type == glsl_type::uint_type);
 812
 813       /* uint u16; */
 814       ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
 815                                            "tmp_pack_half_1x16_u16");
 816
 817       /* float f = FLOAT_RVAL; */
 818       ir_variable *f = factory.make_temp(glsl_type::float_type,
 819                                           "tmp_pack_half_1x16_f");
 820       factory.emit(assign(f, f_rval));
 821
 822       /* uint e = E_RVAL; */
 823       ir_variable *e = factory.make_temp(glsl_type::uint_type,
 824                                           "tmp_pack_half_1x16_e");
 825       factory.emit(assign(e, e_rval));
 826
 827       /* uint m = M_RVAL; */
 828       ir_variable *m = factory.make_temp(glsl_type::uint_type,
 829                                           "tmp_pack_half_1x16_m");
 830       factory.emit(assign(m, m_rval));
 831
 832       /* Preliminaries
 833        * -------------
 834        *
 835        * For a float16, the bit layout is:
 836        *
 837        *   sign:     15
 838        *   exponent: 10:14
 839        *   mantissa: 0:9
 840        *
 841        * Let f16 be a float16 value. The sign, exponent, and mantissa
 842        * determine its value thus:
 843        *
 844        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
 845        *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
 846        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
 847        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
 848        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
 849        *
 850        * where 0 <= m16 < 2^10.
 851        *
 852        * For a float32, the bit layout is:
 853        *
 854        *   sign:     31
 855        *   exponent: 23:30
 856        *   mantissa: 0:22
 857        *
 858        * Let f32 be a float32 value. The sign, exponent, and mantissa
 859        * determine its value thus:
 860        *
 861        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
 862        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
 863        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
 864        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
 865        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
 866        *
 867        * where 0 <= m32 < 2^23.
 868        *
 869        * The minimum and maximum normal float16 values are
 870        *
 871        *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)   (20)
 872        *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)         (21)
 873        *
 874        * The step at max_norm16 is
 875        *
 876        *   max_step16 = 2^5                                     (22)
 877        *
 878        * Observe that the float16 boundary values in equations 20-21 lie in the
 879        * range of normal float32 values.
 880        *
 881        *
 882        * Rounding Behavior
 883        * -----------------
 884        * Not all float32 values can be exactly represented as a float16. We
 885        * round all such intermediate float32 values to the nearest float16; if
 886        * the float32 is exactly between to float16 values, we round to the one
 887        * with an even mantissa. This rounding behavior has several benefits:
 888        *
 889        *   - It has no sign bias.
 890        *
 891        *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
 892        *     GPU ISA.
 893        *
 894        *   - By reproducing the behavior of the GPU (at least on Intel hardware),
 895        *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
 896        *     result in the same value as if the expression were executed on the
 897        *     GPU.
 898        *
 899        * Calculation
 900        * -----------
 901        * Our task is to compute s16, e16, m16 given f32.  Since this function
 902        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
 903        * cases consider.
 904        */
 905
 906       factory.emit(
 907
 908          /* Case 1) f32 is NaN
 909           *
 910           *   The resultant f16 will also be NaN.
 911           */
 912
 913          /* if (e32 == 255 && m32 != 0) { */
 914          if_tree(logic_and(equal(e, constant(0xffu << 23u)),
 915                            logic_not(equal(m, constant(0u)))),
 916
 917             assign(u16, constant(0x7fffu)),
 918
 919          /* Case 2) f32 lies in the range [0, min_norm16).
 920           *
 921           *   The resultant float16 will be either zero, subnormal, or normal.
 922           *
 923           *   Solving
 924           *
 925           *     f32 = min_norm16       (30)
 926           *
 927           *   gives
 928           *
 929           *     e32 = 113 and m32 = 0  (31)
 930           *
 931           *   Therefore this case occurs if and only if
 932           *
 933           *     e32 < 113              (32)
 934           */
 935
 936          /* } else if (e32 < 113) { */
 937          if_tree(less(e, constant(113u << 23u)),
 938
 939             /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
 940             assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
 941                                            constant((float) (1 << 24)))))),
 942
 943          /* Case 3) f32 lies in the range
 944           *         [min_norm16, max_norm16 + max_step16).
 945           *
 946           *   The resultant float16 will be either normal or infinite.
 947           *
 948           *   Solving
 949           *
 950           *     f32 = max_norm16 + max_step16           (40)
 951           *         = 2^15 * (1 + 1023 / 2^10) + 2^5    (41)
 952           *         = 2^16                              (42)
 953           *   gives
 954           *
 955           *     e32 = 143 and m32 = 0                   (43)
 956           *
 957           *   We already solved the boundary condition f32 = min_norm16 above
 958           *   in equation 31. Therefore this case occurs if and only if
 959           *
 960           *     113 <= e32 and e32 < 143
 961           */
 962
 963          /* } else if (e32 < 143) { */
 964          if_tree(less(e, constant(143u << 23u)),
 965
 966             /* The addition below handles the case where the mantissa rounds
 967              * up to 1024 and bumps the exponent.
 968              *
 969              * u16 = ((e - (112u << 23u)) >> 13u)
 970              *     + round_to_even((float(m) / (1u << 13u));
 971              */
 972             assign(u16, add(rshift(sub(e, constant(112u << 23u)),
 973                                    constant(13u)),
 974                             f2u(round_even(
 975                                   div(u2f(m), constant((float) (1 << 13))))))),
 976
 977          /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
 978           *
 979           *   The resultant float16 will be infinite.
 980           *
 981           *   The cases above caught all float32 values in the range
 982           *   [0, max_norm16 + max_step16), so this is the fall-through case.
 983           */
 984
 985          /* } else { */
 986
 987             assign(u16, constant(31u << 10u))))));
 988
 989          /* } */
 990
 991        return deref(u16).val;
 992    }
 993
 994    /**
 995     * \brief Lower a packHalf2x16 expression.
 996     *
 997     * \param vec2_rval is packHalf2x16's input
 998     * \return packHalf2x16's output as a uint rvalue
 999     */
1000    ir_rvalue*
1001    lower_pack_half_2x16(ir_rvalue *vec2_rval)
1002    {
1003       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1004        *
1005        *    highp uint packHalf2x16 (mediump vec2 v)
1006        *    ----------------------------------------
1007        *    Returns an unsigned integer obtained by converting the components of
1008        *    a two-component floating-point vector to the 16-bit floating-point
1009        *    representation found in the OpenGL ES Specification, and then packing
1010        *    these two 16-bit integers into a 32-bit unsigned integer.
1011        *
1012        *    The first vector component specifies the 16 least- significant bits
1013        *    of the result; the second component specifies the 16 most-significant
1014        *    bits.
1015        */
1016
1017       assert(vec2_rval->type == glsl_type::vec2_type);
1018
1019       /* vec2 f = VEC2_RVAL; */
1020       ir_variable *f = factory.make_temp(glsl_type::vec2_type,
1021                                          "tmp_pack_half_2x16_f");
1022       factory.emit(assign(f, vec2_rval));
1023
1024       /* uvec2 f32 = bitcast_f2u(f); */
1025       ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1026                                             "tmp_pack_half_2x16_f32");
1027       factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
1028
1029       /* uvec2 f16; */
1030       ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1031                                         "tmp_pack_half_2x16_f16");
1032
1033       /* Get f32's unshifted exponent bits.
1034        *
1035        *   uvec2 e = f32 & 0x7f800000u;
1036        */
1037       ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1038                                           "tmp_pack_half_2x16_e");
1039       factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
1040
1041       /* Get f32's unshifted mantissa bits.
1042        *
1043        *   uvec2 m = f32 & 0x007fffffu;
1044        */
1045       ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1046                                           "tmp_pack_half_2x16_m");
1047       factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
1048
1049       /* Set f16's exponent and mantissa bits.
1050        *
1051        *   f16.x = pack_half_1x16_nosign(e.x, m.x);
1052        *   f16.y = pack_half_1y16_nosign(e.y, m.y);
1053        */
1054       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
1055                                                      swizzle_x(e),
1056                                                      swizzle_x(m)),
1057                            WRITEMASK_X));
1058       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
1059                                                      swizzle_y(e),
1060                                                      swizzle_y(m)),
1061                            WRITEMASK_Y));
1062
1063       /* Set f16's sign bits.
1064        *
1065        *   f16 |= (f32 & (1u << 31u) >> 16u;
1066        */
1067       factory.emit(
1068          assign(f16, bit_or(f16,
1069                             rshift(bit_and(f32, constant(1u << 31u)),
1070                                    constant(16u)))));
1071
1072
1073       /* return (f16.y << 16u) | f16.x; */
1074       ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
1075                                         constant(16u)),
1076                                  swizzle_x(f16));
1077
1078       assert(result->type == glsl_type::uint_type);
1079       return result;
1080    }
1081
1082    /**
1083     * \brief Lower the component-wise calculation of unpackHalf2x16.
1084     *
1085     * Given a uint that encodes a float16 in its lower 16 bits, this function
1086     * returns a uint that encodes a float32 with the same value. The sign bit
1087     * of the float16 is ignored.
1088     *
1089     * \param e_rval is the unshifted exponent bits of a float16
1090     * \param m_rval is the unshifted mantissa bits of a float16
1091     * \param a uint rvalue that encodes a float32
1092     */
1093    ir_rvalue*
1094    unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
1095    {
1096       assert(e_rval->type == glsl_type::uint_type);
1097       assert(m_rval->type == glsl_type::uint_type);
1098
1099       /* uint u32; */
1100       ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
1101                                            "tmp_unpack_half_1x16_u32");
1102
1103       /* uint e = E_RVAL; */
1104       ir_variable *e = factory.make_temp(glsl_type::uint_type,
1105                                           "tmp_unpack_half_1x16_e");
1106       factory.emit(assign(e, e_rval));
1107
1108       /* uint m = M_RVAL; */
1109       ir_variable *m = factory.make_temp(glsl_type::uint_type,
1110                                           "tmp_unpack_half_1x16_m");
1111       factory.emit(assign(m, m_rval));
1112
1113       /* Preliminaries
1114        * -------------
1115        *
1116        * For a float16, the bit layout is:
1117        *
1118        *   sign:     15
1119        *   exponent: 10:14
1120        *   mantissa: 0:9
1121        *
1122        * Let f16 be a float16 value. The sign, exponent, and mantissa
1123        * determine its value thus:
1124        *
1125        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
1126        *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
1127        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1128        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
1129        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
1130        *
1131        * where 0 <= m16 < 2^10.
1132        *
1133        * For a float32, the bit layout is:
1134        *
1135        *   sign: 31
1136        *   exponent: 23:30
1137        *   mantissa: 0:22
1138        *
1139        * Let f32 be a float32 value. The sign, exponent, and mantissa
1140        * determine its value thus:
1141        *
1142        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
1143        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
1144        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1145        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
1146        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
1147        *
1148        * where 0 <= m32 < 2^23.
1149        *
1150        * Calculation
1151        * -----------
1152        * Our task is to compute s32, e32, m32 given f16.  Since this function
1153        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
1154        * cases consider.
1155        */
1156
1157       factory.emit(
1158
1159          /* Case 1) f16 is zero or subnormal.
1160           *
1161           *   The simplest method of calcuating f32 in this case is
1162           *
1163           *     f32 = f16                       (20)
1164           *         = 2^(-14) * (m16 / 2^10)    (21)
1165           *         = m16 / 2^(-24)             (22)
1166           */
1167
1168          /* if (e16 == 0) { */
1169          if_tree(equal(e, constant(0u)),
1170
1171             /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1172             assign(u32, expr(ir_unop_bitcast_f2u,
1173                                 div(u2f(m), constant((float)(1 << 24))))),
1174
1175          /* Case 2) f16 is normal.
1176           *
1177           *   The equation
1178           *
1179           *     f32 = f16                              (30)
1180           *     2^(e32 - 127) * (1 + m32 / 2^23) =     (31)
1181           *       2^(e16 - 15) * (1 + m16 / 2^10)
1182           *
1183           *   can be decomposed into two
1184           *
1185           *     2^(e32 - 127) = 2^(e16 - 15)           (32)
1186           *     1 + m32 / 2^23 = 1 + m16 / 2^10        (33)
1187           *
1188           *   which solve to
1189           *
1190           *     e32 = e16 + 112                        (34)
1191           *     m32 = m16 * 2^13                       (35)
1192           */
1193
1194          /* } else if (e16 < 31)) { */
1195          if_tree(less(e, constant(31u << 10u)),
1196
1197               /* u32 = ((e + (112 << 10)) | m) << 13;
1198                */
1199               assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
1200                                  constant(13u))),
1201
1202
1203          /* Case 3) f16 is infinite. */
1204          if_tree(equal(m, constant(0u)),
1205
1206                  assign(u32, constant(255u << 23u)),
1207
1208          /* Case 4) f16 is NaN. */
1209          /* } else { */
1210
1211             assign(u32, constant(0x7fffffffu))))));
1212
1213          /* } */
1214
1215       return deref(u32).val;
1216    }
1217
1218    /**
1219     * \brief Lower an unpackHalf2x16 expression.
1220     *
1221     * \param uint_rval is unpackHalf2x16's input
1222     * \return unpackHalf2x16's output as a vec2 rvalue
1223     */
1224    ir_rvalue*
1225    lower_unpack_half_2x16(ir_rvalue *uint_rval)
1226    {
1227       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1228        *
1229        *    mediump vec2 unpackHalf2x16 (highp uint v)
1230        *    ------------------------------------------
1231        *    Returns a two-component floating-point vector with components
1232        *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1233        *    values, interpreting those values as 16-bit floating-point numbers
1234        *    according to the OpenGL ES Specification, and converting them to
1235        *    32-bit floating-point values.
1236        *
1237        *    The first component of the vector is obtained from the
1238        *    16 least-significant bits of v; the second component is obtained
1239        *    from the 16 most-significant bits of v.
1240        */
1241       assert(uint_rval->type == glsl_type::uint_type);
1242
1243       /* uint u = RVALUE;
1244        * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16);
1245        */
1246       ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1247                                             "tmp_unpack_half_2x16_f16");
1248       factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
1249
1250       /* uvec2 f32; */
1251       ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1252                                             "tmp_unpack_half_2x16_f32");
1253
1254       /* Get f16's unshifted exponent bits.
1255        *
1256        *    uvec2 e = f16 & 0x7c00u;
1257        */
1258       ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1259                                           "tmp_unpack_half_2x16_e");
1260       factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
1261
1262       /* Get f16's unshifted mantissa bits.
1263        *
1264        *    uvec2 m = f16 & 0x03ffu;
1265        */
1266       ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1267                                           "tmp_unpack_half_2x16_m");
1268       factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
1269
1270       /* Set f32's exponent and mantissa bits.
1271        *
1272        *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
1273        *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
1274        */
1275       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
1276                                                        swizzle_x(m)),
1277                            WRITEMASK_X));
1278       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
1279                                                        swizzle_y(m)),
1280                            WRITEMASK_Y));
1281
1282       /* Set f32's sign bit.
1283        *
1284        *    f32 |= (f16 & 0x8000u) << 16u;
1285        */
1286       factory.emit(assign(f32, bit_or(f32,
1287                                        lshift(bit_and(f16,
1288                                                       constant(0x8000u)),
1289                                               constant(16u)))));
1290
1291       /* return bitcast_u2f(f32); */
1292       ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
1293       assert(result->type == glsl_type::vec2_type);
1294       return result;
1295    }
1296 };
1297
1298 } // namespace anonymous
1299
1300 /**
1301  * \brief Lower the builtin packing functions.
1302  *
1303  * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
1304  */
1305 bool
1306 lower_packing_builtins(exec_list *instructions, int op_mask)
1307 {
1308    lower_packing_builtins_visitor v(op_mask);
1309    visit_list_elements(&v, instructions, true);
1310    return v.get_progress();
1311 }