src/glsl/lower_packing_builtins.cpp

   1 /*
   2  * Copyright © 2012 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 #include "ir.h"
  25 #include "ir_builder.h"
  26 #include "ir_optimization.h"
  27 #include "ir_rvalue_visitor.h"
  28
  29 namespace {
  30
  31 using namespace ir_builder;
  32
  33 /**
  34  * A visitor that lowers built-in floating-point pack/unpack expressions
  35  * such as packSnorm2x16.
  36  */
  37 class lower_packing_builtins_visitor : public ir_rvalue_visitor {
  38 public:
  39    /**
  40     * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
  41     */
  42    explicit lower_packing_builtins_visitor(int op_mask)
  43       : op_mask(op_mask),
  44         progress(false)
  45    {
  46       /* Mutually exclusive options. */
  47       assert(!((op_mask & LOWER_PACK_HALF_2x16) &&
  48                (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT)));
  49
  50       assert(!((op_mask & LOWER_UNPACK_HALF_2x16) &&
  51                (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT)));
  52
  53       factory.instructions = &factory_instructions;
  54    }
  55
  56    virtual ~lower_packing_builtins_visitor()
  57    {
  58       assert(factory_instructions.is_empty());
  59    }
  60
  61    bool get_progress() { return progress; }
  62
  63    void handle_rvalue(ir_rvalue **rvalue)
  64    {
  65       if (!*rvalue)
  66          return;
  67
  68       ir_expression *expr = (*rvalue)->as_expression();
  69       if (!expr)
  70          return;
  71
  72       enum lower_packing_builtins_op lowering_op =
  73          choose_lowering_op(expr->operation);
  74
  75       if (lowering_op == LOWER_PACK_UNPACK_NONE)
  76          return;
  77
  78       setup_factory(ralloc_parent(expr));
  79
  80       ir_rvalue *op0 = expr->operands[0];
  81       ralloc_steal(factory.mem_ctx, op0);
  82
  83       switch (lowering_op) {
  84       case LOWER_PACK_SNORM_2x16:
  85          *rvalue = lower_pack_snorm_2x16(op0);
  86          break;
  87       case LOWER_PACK_SNORM_4x8:
  88          *rvalue = lower_pack_snorm_4x8(op0);
  89          break;
  90       case LOWER_PACK_UNORM_2x16:
  91          *rvalue = lower_pack_unorm_2x16(op0);
  92          break;
  93       case LOWER_PACK_UNORM_4x8:
  94          *rvalue = lower_pack_unorm_4x8(op0);
  95          break;
  96       case LOWER_PACK_HALF_2x16:
  97          *rvalue = lower_pack_half_2x16(op0);
  98          break;
  99       case LOWER_PACK_HALF_2x16_TO_SPLIT:
 100          *rvalue = split_pack_half_2x16(op0);
 101          break;
 102       case LOWER_UNPACK_SNORM_2x16:
 103          *rvalue = lower_unpack_snorm_2x16(op0);
 104          break;
 105       case LOWER_UNPACK_SNORM_4x8:
 106          *rvalue = lower_unpack_snorm_4x8(op0);
 107          break;
 108       case LOWER_UNPACK_UNORM_2x16:
 109          *rvalue = lower_unpack_unorm_2x16(op0);
 110          break;
 111       case LOWER_UNPACK_UNORM_4x8:
 112          *rvalue = lower_unpack_unorm_4x8(op0);
 113          break;
 114       case LOWER_UNPACK_HALF_2x16:
 115          *rvalue = lower_unpack_half_2x16(op0);
 116          break;
 117       case LOWER_UNPACK_HALF_2x16_TO_SPLIT:
 118          *rvalue = split_unpack_half_2x16(op0);
 119          break;
 120       case LOWER_PACK_UNPACK_NONE:
 121          assert(!"not reached");
 122          break;
 123       }
 124
 125       teardown_factory();
 126       progress = true;
 127    }
 128
 129 private:
 130    const int op_mask;
 131    bool progress;
 132    ir_factory factory;
 133    exec_list factory_instructions;
 134
 135    /**
 136     * Determine the needed lowering operation by filtering \a expr_op
 137     * through \ref op_mask.
 138     */
 139    enum lower_packing_builtins_op
 140    choose_lowering_op(ir_expression_operation expr_op)
 141    {
 142       /* C++ regards int and enum as fundamentally different types.
 143        * So, we can't simply return from each case; we must cast the return
 144        * value.
 145        */
 146       int result;
 147
 148       switch (expr_op) {
 149       case ir_unop_pack_snorm_2x16:
 150          result = op_mask & LOWER_PACK_SNORM_2x16;
 151          break;
 152       case ir_unop_pack_snorm_4x8:
 153          result = op_mask & LOWER_PACK_SNORM_4x8;
 154          break;
 155       case ir_unop_pack_unorm_2x16:
 156          result = op_mask & LOWER_PACK_UNORM_2x16;
 157          break;
 158       case ir_unop_pack_unorm_4x8:
 159          result = op_mask & LOWER_PACK_UNORM_4x8;
 160          break;
 161       case ir_unop_pack_half_2x16:
 162          result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT);
 163          break;
 164       case ir_unop_unpack_snorm_2x16:
 165          result = op_mask & LOWER_UNPACK_SNORM_2x16;
 166          break;
 167       case ir_unop_unpack_snorm_4x8:
 168          result = op_mask & LOWER_UNPACK_SNORM_4x8;
 169          break;
 170       case ir_unop_unpack_unorm_2x16:
 171          result = op_mask & LOWER_UNPACK_UNORM_2x16;
 172          break;
 173       case ir_unop_unpack_unorm_4x8:
 174          result = op_mask & LOWER_UNPACK_UNORM_4x8;
 175          break;
 176       case ir_unop_unpack_half_2x16:
 177          result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT);
 178          break;
 179       default:
 180          result = LOWER_PACK_UNPACK_NONE;
 181          break;
 182       }
 183
 184       return static_cast<enum lower_packing_builtins_op>(result);
 185    }
 186
 187    void
 188    setup_factory(void *mem_ctx)
 189    {
 190       assert(factory.mem_ctx == NULL);
 191       assert(factory.instructions->is_empty());
 192
 193       factory.mem_ctx = mem_ctx;
 194    }
 195
 196    void
 197    teardown_factory()
 198    {
 199       base_ir->insert_before(factory.instructions);
 200       assert(factory.instructions->is_empty());
 201       factory.mem_ctx = NULL;
 202    }
 203
 204    template <typename T>
 205    ir_constant*
 206    constant(T x)
 207    {
 208       return factory.constant(x);
 209    }
 210
 211    /**
 212     * \brief Pack two uint16's into a single uint32.
 213     *
 214     * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
 215     * where the least significant bits specify the first element of the pair.
 216     * Return the uint32.
 217     */
 218    ir_rvalue*
 219    pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
 220    {
 221       assert(uvec2_rval->type == glsl_type::uvec2_type);
 222
 223       /* uvec2 u = UVEC2_RVAL; */
 224       ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
 225                                           "tmp_pack_uvec2_to_uint");
 226       factory.emit(assign(u, uvec2_rval));
 227
 228       /* return (u.y << 16) | (u.x & 0xffff); */
 229       return bit_or(lshift(swizzle_y(u), constant(16u)),
 230                     bit_and(swizzle_x(u), constant(0xffffu)));
 231    }
 232
 233    /**
 234     * \brief Pack four uint8's into a single uint32.
 235     *
 236     * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
 237     * uint32 where the least significant bits specify the first element of the
 238     * 4-tuple. Return the uint32.
 239     */
 240    ir_rvalue*
 241    pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
 242    {
 243       assert(uvec4_rval->type == glsl_type::uvec4_type);
 244
 245       /* uvec4 u = UVEC4_RVAL; */
 246       ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
 247                                           "tmp_pack_uvec4_to_uint");
 248       factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
 249
 250       /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
 251       return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
 252                            lshift(swizzle_z(u), constant(16u))),
 253                     bit_or(lshift(swizzle_y(u), constant(8u)),
 254                            swizzle_x(u)));
 255    }
 256
 257    /**
 258     * \brief Unpack a uint32 into two uint16's.
 259     *
 260     * Interpret the given uint32 as a uint16 pair where the uint32's least
 261     * significant bits specify the pair's first element. Return the uint16
 262     * pair as a uvec2.
 263     */
 264    ir_rvalue*
 265    unpack_uint_to_uvec2(ir_rvalue *uint_rval)
 266    {
 267       assert(uint_rval->type == glsl_type::uint_type);
 268
 269       /* uint u = UINT_RVAL; */
 270       ir_variable *u = factory.make_temp(glsl_type::uint_type,
 271                                           "tmp_unpack_uint_to_uvec2_u");
 272       factory.emit(assign(u, uint_rval));
 273
 274       /* uvec2 u2; */
 275       ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
 276                                            "tmp_unpack_uint_to_uvec2_u2");
 277
 278       /* u2.x = u & 0xffffu; */
 279       factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
 280
 281       /* u2.y = u >> 16u; */
 282       factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
 283
 284       return deref(u2).val;
 285    }
 286
 287    /**
 288     * \brief Unpack a uint32 into four uint8's.
 289     *
 290     * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
 291     * significant bits specify the 4-tuple's first element. Return the uint8
 292     * 4-tuple as a uvec4.
 293     */
 294    ir_rvalue*
 295    unpack_uint_to_uvec4(ir_rvalue *uint_rval)
 296    {
 297       assert(uint_rval->type == glsl_type::uint_type);
 298
 299       /* uint u = UINT_RVAL; */
 300       ir_variable *u = factory.make_temp(glsl_type::uint_type,
 301                                           "tmp_unpack_uint_to_uvec4_u");
 302       factory.emit(assign(u, uint_rval));
 303
 304       /* uvec4 u4; */
 305       ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
 306                                            "tmp_unpack_uint_to_uvec4_u4");
 307
 308       /* u4.x = u & 0xffu; */
 309       factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
 310
 311       /* u4.y = (u >> 8u) & 0xffu; */
 312       factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
 313                                       constant(0xffu)), WRITEMASK_Y));
 314
 315       /* u4.z = (u >> 16u) & 0xffu; */
 316       factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
 317                                       constant(0xffu)), WRITEMASK_Z));
 318
 319       /* u4.w = (u >> 24u) */
 320       factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
 321
 322       return deref(u4).val;
 323    }
 324
 325    /**
 326     * \brief Lower a packSnorm2x16 expression.
 327     *
 328     * \param vec2_rval is packSnorm2x16's input
 329     * \return packSnorm2x16's output as a uint rvalue
 330     */
 331    ir_rvalue*
 332    lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
 333    {
 334       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
 335        *
 336        *    highp uint packSnorm2x16(vec2 v)
 337        *    --------------------------------
 338        *    First, converts each component of the normalized floating-point value
 339        *    v into 16-bit integer values. Then, the results are packed into the
 340        *    returned 32-bit unsigned integer.
 341        *
 342        *    The conversion for component c of v to fixed point is done as
 343        *    follows:
 344        *
 345        *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
 346        *
 347        *    The first component of the vector will be written to the least
 348        *    significant bits of the output; the last component will be written to
 349        *    the most significant bits.
 350        *
 351        * This function generates IR that approximates the following pseudo-GLSL:
 352        *
 353        *     return pack_uvec2_to_uint(
 354        *         uvec2(ivec2(
 355        *           round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
 356        *
 357        * It is necessary to first convert the vec2 to ivec2 rather than directly
 358        * converting vec2 to uvec2 because the latter conversion is undefined.
 359        * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
 360        * convert a negative floating point value to an uint".
 361        */
 362       assert(vec2_rval->type == glsl_type::vec2_type);
 363
 364       ir_rvalue *result = pack_uvec2_to_uint(
 365             i2u(f2i(round_even(mul(clamp(vec2_rval,
 366                                          constant(-1.0f),
 367                                          constant(1.0f)),
 368                                    constant(32767.0f))))));
 369
 370       assert(result->type == glsl_type::uint_type);
 371       return result;
 372    }
 373
 374    /**
 375     * \brief Lower a packSnorm4x8 expression.
 376     *
 377     * \param vec4_rval is packSnorm4x8's input
 378     * \return packSnorm4x8's output as a uint rvalue
 379     */
 380    ir_rvalue*
 381    lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
 382    {
 383       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 384        *
 385        *    highp uint packSnorm4x8(vec4 v)
 386        *    -------------------------------
 387        *    First, converts each component of the normalized floating-point value
 388        *    v into 8-bit integer values. Then, the results are packed into the
 389        *    returned 32-bit unsigned integer.
 390        *
 391        *    The conversion for component c of v to fixed point is done as
 392        *    follows:
 393        *
 394        *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
 395        *
 396        *    The first component of the vector will be written to the least
 397        *    significant bits of the output; the last component will be written to
 398        *    the most significant bits.
 399        *
 400        * This function generates IR that approximates the following pseudo-GLSL:
 401        *
 402        *     return pack_uvec4_to_uint(
 403        *         uvec4(ivec4(
 404        *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
 405        *
 406        * It is necessary to first convert the vec4 to ivec4 rather than directly
 407        * converting vec4 to uvec4 because the latter conversion is undefined.
 408        * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
 409        * convert a negative floating point value to an uint".
 410        */
 411       assert(vec4_rval->type == glsl_type::vec4_type);
 412
 413       ir_rvalue *result = pack_uvec4_to_uint(
 414             i2u(f2i(round_even(mul(clamp(vec4_rval,
 415                                          constant(-1.0f),
 416                                          constant(1.0f)),
 417                                    constant(127.0f))))));
 418
 419       assert(result->type == glsl_type::uint_type);
 420       return result;
 421    }
 422
 423    /**
 424     * \brief Lower an unpackSnorm2x16 expression.
 425     *
 426     * \param uint_rval is unpackSnorm2x16's input
 427     * \return unpackSnorm2x16's output as a vec2 rvalue
 428     */
 429    ir_rvalue*
 430    lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
 431    {
 432       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
 433        *
 434        *    highp vec2 unpackSnorm2x16 (highp uint p)
 435        *    -----------------------------------------
 436        *    First, unpacks a single 32-bit unsigned integer p into a pair of
 437        *    16-bit unsigned integers. Then, each component is converted to
 438        *    a normalized floating-point value to generate the returned
 439        *    two-component vector.
 440        *
 441        *    The conversion for unpacked fixed-point value f to floating point is
 442        *    done as follows:
 443        *
 444        *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
 445        *
 446        *    The first component of the returned vector will be extracted from the
 447        *    least significant bits of the input; the last component will be
 448        *    extracted from the most significant bits.
 449        *
 450        * This function generates IR that approximates the following pseudo-GLSL:
 451        *
 452        *    return clamp(
 453        *       ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
 454        *       -1.0f, 1.0f);
 455        *
 456        * The above IR may appear unnecessarily complex, but the intermediate
 457        * conversion to ivec2 and the bit shifts are necessary to correctly unpack
 458        * negative floats.
 459        *
 460        * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
 461        * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
 462        * place that int16 into an int32, which results in the *positive* integer
 463        * 0x0000ffff.  The int16's sign bit becomes, in the int32, the rather
 464        * unimportant bit 16. We must now extend the int16's sign bit into bits
 465        * 17-32, which is accomplished by left-shifting then right-shifting.
 466        */
 467
 468       assert(uint_rval->type == glsl_type::uint_type);
 469
 470       ir_rvalue *result =
 471         clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
 472                                     constant(16)),
 473                              constant(16u))),
 474                   constant(32767.0f)),
 475               constant(-1.0f),
 476               constant(1.0f));
 477
 478       assert(result->type == glsl_type::vec2_type);
 479       return result;
 480    }
 481
 482    /**
 483     * \brief Lower an unpackSnorm4x8 expression.
 484     *
 485     * \param uint_rval is unpackSnorm4x8's input
 486     * \return unpackSnorm4x8's output as a vec4 rvalue
 487     */
 488    ir_rvalue*
 489    lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
 490    {
 491       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 492        *
 493        *    highp vec4 unpackSnorm4x8 (highp uint p)
 494        *    ----------------------------------------
 495        *    First, unpacks a single 32-bit unsigned integer p into four
 496        *    8-bit unsigned integers. Then, each component is converted to
 497        *    a normalized floating-point value to generate the returned
 498        *    four-component vector.
 499        *
 500        *    The conversion for unpacked fixed-point value f to floating point is
 501        *    done as follows:
 502        *
 503        *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
 504        *
 505        *    The first component of the returned vector will be extracted from the
 506        *    least significant bits of the input; the last component will be
 507        *    extracted from the most significant bits.
 508        *
 509        * This function generates IR that approximates the following pseudo-GLSL:
 510        *
 511        *    return clamp(
 512        *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
 513        *       -1.0f, 1.0f);
 514        *
 515        * The above IR may appear unnecessarily complex, but the intermediate
 516        * conversion to ivec4 and the bit shifts are necessary to correctly unpack
 517        * negative floats.
 518        *
 519        * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
 520        * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
 521        * place that int8 into an int32, which results in the *positive* integer
 522        * 0x000000ff.  The int8's sign bit becomes, in the int32, the rather
 523        * unimportant bit 8. We must now extend the int8's sign bit into bits
 524        * 9-32, which is accomplished by left-shifting then right-shifting.
 525        */
 526
 527       assert(uint_rval->type == glsl_type::uint_type);
 528
 529       ir_rvalue *result =
 530         clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
 531                                     constant(24u)),
 532                              constant(24u))),
 533                   constant(127.0f)),
 534               constant(-1.0f),
 535               constant(1.0f));
 536
 537       assert(result->type == glsl_type::vec4_type);
 538       return result;
 539    }
 540
 541    /**
 542     * \brief Lower a packUnorm2x16 expression.
 543     *
 544     * \param vec2_rval is packUnorm2x16's input
 545     * \return packUnorm2x16's output as a uint rvalue
 546     */
 547    ir_rvalue*
 548    lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
 549    {
 550       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
 551        *
 552        *    highp uint packUnorm2x16 (vec2 v)
 553        *    ---------------------------------
 554        *    First, converts each component of the normalized floating-point value
 555        *    v into 16-bit integer values. Then, the results are packed into the
 556        *    returned 32-bit unsigned integer.
 557        *
 558        *    The conversion for component c of v to fixed point is done as
 559        *    follows:
 560        *
 561        *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
 562        *
 563        *    The first component of the vector will be written to the least
 564        *    significant bits of the output; the last component will be written to
 565        *    the most significant bits.
 566        *
 567        * This function generates IR that approximates the following pseudo-GLSL:
 568        *
 569        *     return pack_uvec2_to_uint(uvec2(
 570        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
 571        *
 572        * Here it is safe to directly convert the vec2 to uvec2 because the vec2
 573        * has been clamped to a non-negative range.
 574        */
 575
 576       assert(vec2_rval->type == glsl_type::vec2_type);
 577
 578       ir_rvalue *result = pack_uvec2_to_uint(
 579          f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
 580
 581       assert(result->type == glsl_type::uint_type);
 582       return result;
 583    }
 584
 585    /**
 586     * \brief Lower a packUnorm4x8 expression.
 587     *
 588     * \param vec4_rval is packUnorm4x8's input
 589     * \return packUnorm4x8's output as a uint rvalue
 590     */
 591    ir_rvalue*
 592    lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
 593    {
 594       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 595        *
 596        *    highp uint packUnorm4x8 (vec4 v)
 597        *    --------------------------------
 598        *    First, converts each component of the normalized floating-point value
 599        *    v into 8-bit integer values. Then, the results are packed into the
 600        *    returned 32-bit unsigned integer.
 601        *
 602        *    The conversion for component c of v to fixed point is done as
 603        *    follows:
 604        *
 605        *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
 606        *
 607        *    The first component of the vector will be written to the least
 608        *    significant bits of the output; the last component will be written to
 609        *    the most significant bits.
 610        *
 611        * This function generates IR that approximates the following pseudo-GLSL:
 612        *
 613        *     return pack_uvec4_to_uint(uvec4(
 614        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
 615        *
 616        * Here it is safe to directly convert the vec4 to uvec4 because the vec4
 617        * has been clamped to a non-negative range.
 618        */
 619
 620       assert(vec4_rval->type == glsl_type::vec4_type);
 621
 622       ir_rvalue *result = pack_uvec4_to_uint(
 623          f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
 624
 625       assert(result->type == glsl_type::uint_type);
 626       return result;
 627    }
 628
 629    /**
 630     * \brief Lower an unpackUnorm2x16 expression.
 631     *
 632     * \param uint_rval is unpackUnorm2x16's input
 633     * \return unpackUnorm2x16's output as a vec2 rvalue
 634     */
 635    ir_rvalue*
 636    lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
 637    {
 638       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
 639        *
 640        *    highp vec2 unpackUnorm2x16 (highp uint p)
 641        *    -----------------------------------------
 642        *    First, unpacks a single 32-bit unsigned integer p into a pair of
 643        *    16-bit unsigned integers. Then, each component is converted to
 644        *    a normalized floating-point value to generate the returned
 645        *    two-component vector.
 646        *
 647        *    The conversion for unpacked fixed-point value f to floating point is
 648        *    done as follows:
 649        *
 650        *       unpackUnorm2x16: f / 65535.0
 651        *
 652        *    The first component of the returned vector will be extracted from the
 653        *    least significant bits of the input; the last component will be
 654        *    extracted from the most significant bits.
 655        *
 656        * This function generates IR that approximates the following pseudo-GLSL:
 657        *
 658        *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
 659        */
 660
 661       assert(uint_rval->type == glsl_type::uint_type);
 662
 663       ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
 664                               constant(65535.0f));
 665
 666       assert(result->type == glsl_type::vec2_type);
 667       return result;
 668    }
 669
 670    /**
 671     * \brief Lower an unpackUnorm4x8 expression.
 672     *
 673     * \param uint_rval is unpackUnorm4x8's input
 674     * \return unpackUnorm4x8's output as a vec4 rvalue
 675     */
 676    ir_rvalue*
 677    lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
 678    {
 679       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 680        *
 681        *    highp vec4 unpackUnorm4x8 (highp uint p)
 682        *    ----------------------------------------
 683        *    First, unpacks a single 32-bit unsigned integer p into four
 684        *    8-bit unsigned integers. Then, each component is converted to
 685        *    a normalized floating-point value to generate the returned
 686        *    two-component vector.
 687        *
 688        *    The conversion for unpacked fixed-point value f to floating point is
 689        *    done as follows:
 690        *
 691        *       unpackUnorm4x8: f / 255.0
 692        *
 693        *    The first component of the returned vector will be extracted from the
 694        *    least significant bits of the input; the last component will be
 695        *    extracted from the most significant bits.
 696        *
 697        * This function generates IR that approximates the following pseudo-GLSL:
 698        *
 699        *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
 700        */
 701
 702       assert(uint_rval->type == glsl_type::uint_type);
 703
 704       ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
 705                               constant(255.0f));
 706
 707       assert(result->type == glsl_type::vec4_type);
 708       return result;
 709    }
 710
 711    /**
 712     * \brief Lower the component-wise calculation of packHalf2x16.
 713     *
 714     * \param f_rval is one component of packHalf2x16's input
 715     * \param e_rval is the unshifted exponent bits of f_rval
 716     * \param m_rval is the unshifted mantissa bits of f_rval
 717     *
 718     * \return a uint rvalue that encodes a float16 in its lower 16 bits
 719     */
 720    ir_rvalue*
 721    pack_half_1x16_nosign(ir_rvalue *f_rval,
 722                          ir_rvalue *e_rval,
 723                          ir_rvalue *m_rval)
 724    {
 725       assert(e_rval->type == glsl_type::uint_type);
 726       assert(m_rval->type == glsl_type::uint_type);
 727
 728       /* uint u16; */
 729       ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
 730                                            "tmp_pack_half_1x16_u16");
 731
 732       /* float f = FLOAT_RVAL; */
 733       ir_variable *f = factory.make_temp(glsl_type::float_type,
 734                                           "tmp_pack_half_1x16_f");
 735       factory.emit(assign(f, f_rval));
 736
 737       /* uint e = E_RVAL; */
 738       ir_variable *e = factory.make_temp(glsl_type::uint_type,
 739                                           "tmp_pack_half_1x16_e");
 740       factory.emit(assign(e, e_rval));
 741
 742       /* uint m = M_RVAL; */
 743       ir_variable *m = factory.make_temp(glsl_type::uint_type,
 744                                           "tmp_pack_half_1x16_m");
 745       factory.emit(assign(m, m_rval));
 746
 747       /* Preliminaries
 748        * -------------
 749        *
 750        * For a float16, the bit layout is:
 751        *
 752        *   sign:     15
 753        *   exponent: 10:14
 754        *   mantissa: 0:9
 755        *
 756        * Let f16 be a float16 value. The sign, exponent, and mantissa
 757        * determine its value thus:
 758        *
 759        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
 760        *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
 761        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
 762        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
 763        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
 764        *
 765        * where 0 <= m16 < 2^10.
 766        *
 767        * For a float32, the bit layout is:
 768        *
 769        *   sign:     31
 770        *   exponent: 23:30
 771        *   mantissa: 0:22
 772        *
 773        * Let f32 be a float32 value. The sign, exponent, and mantissa
 774        * determine its value thus:
 775        *
 776        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
 777        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
 778        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
 779        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
 780        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
 781        *
 782        * where 0 <= m32 < 2^23.
 783        *
 784        * The minimum and maximum normal float16 values are
 785        *
 786        *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)   (20)
 787        *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)         (21)
 788        *
 789        * The step at max_norm16 is
 790        *
 791        *   max_step16 = 2^5                                     (22)
 792        *
 793        * Observe that the float16 boundary values in equations 20-21 lie in the
 794        * range of normal float32 values.
 795        *
 796        *
 797        * Rounding Behavior
 798        * -----------------
 799        * Not all float32 values can be exactly represented as a float16. We
 800        * round all such intermediate float32 values to the nearest float16; if
 801        * the float32 is exactly between two float16 values, we round to the one
 802        * with an even mantissa. This rounding behavior has several benefits:
 803        *
 804        *   - It has no sign bias.
 805        *
 806        *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
 807        *     GPU ISA.
 808        *
 809        *   - By reproducing the behavior of the GPU (at least on Intel hardware),
 810        *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
 811        *     result in the same value as if the expression were executed on the
 812        *     GPU.
 813        *
 814        * Calculation
 815        * -----------
 816        * Our task is to compute s16, e16, m16 given f32.  Since this function
 817        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
 818        * cases to consider.
 819        */
 820
 821       factory.emit(
 822
 823          /* Case 1) f32 is NaN
 824           *
 825           *   The resultant f16 will also be NaN.
 826           */
 827
 828          /* if (e32 == 255 && m32 != 0) { */
 829          if_tree(logic_and(equal(e, constant(0xffu << 23u)),
 830                            logic_not(equal(m, constant(0u)))),
 831
 832             assign(u16, constant(0x7fffu)),
 833
 834          /* Case 2) f32 lies in the range [0, min_norm16).
 835           *
 836           *   The resultant float16 will be either zero, subnormal, or normal.
 837           *
 838           *   Solving
 839           *
 840           *     f32 = min_norm16       (30)
 841           *
 842           *   gives
 843           *
 844           *     e32 = 113 and m32 = 0  (31)
 845           *
 846           *   Therefore this case occurs if and only if
 847           *
 848           *     e32 < 113              (32)
 849           */
 850
 851          /* } else if (e32 < 113) { */
 852          if_tree(less(e, constant(113u << 23u)),
 853
 854             /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
 855             assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
 856                                            constant((float) (1 << 24)))))),
 857
 858          /* Case 3) f32 lies in the range
 859           *         [min_norm16, max_norm16 + max_step16).
 860           *
 861           *   The resultant float16 will be either normal or infinite.
 862           *
 863           *   Solving
 864           *
 865           *     f32 = max_norm16 + max_step16           (40)
 866           *         = 2^15 * (1 + 1023 / 2^10) + 2^5    (41)
 867           *         = 2^16                              (42)
 868           *   gives
 869           *
 870           *     e32 = 143 and m32 = 0                   (43)
 871           *
 872           *   We already solved the boundary condition f32 = min_norm16 above
 873           *   in equation 31. Therefore this case occurs if and only if
 874           *
 875           *     113 <= e32 and e32 < 143
 876           */
 877
 878          /* } else if (e32 < 143) { */
 879          if_tree(less(e, constant(143u << 23u)),
 880
 881             /* The addition below handles the case where the mantissa rounds
 882              * up to 1024 and bumps the exponent.
 883              *
 884              * u16 = ((e - (112u << 23u)) >> 13u)
 885              *     + round_to_even(float(m) / float(1u << 13u));
 886              */
 887             assign(u16, add(rshift(sub(e, constant(112u << 23u)),
 888                                    constant(13u)),
 889                             f2u(round_even(
 890                                   div(u2f(m), constant((float) (1 << 13))))))),
 891
 892          /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
 893           *
 894           *   The resultant float16 will be infinite.
 895           *
 896           *   The cases above caught all float32 values in the range
 897           *   [0, max_norm16 + max_step16), so this is the fall-through case.
 898           */
 899
 900          /* } else { */
 901
 902             assign(u16, constant(31u << 10u))))));
 903
 904          /* } */
 905
 906        return deref(u16).val;
 907    }
 908
 909    /**
 910     * \brief Lower a packHalf2x16 expression.
 911     *
 912     * \param vec2_rval is packHalf2x16's input; its type must be vec2
 913     * \return packHalf2x16's output as a uint rvalue
 914     */
 915    ir_rvalue*
 916    lower_pack_half_2x16(ir_rvalue *vec2_rval)
 917    {
 918       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
 919        *
 920        *    highp uint packHalf2x16 (mediump vec2 v)
 921        *    ----------------------------------------
 922        *    Returns an unsigned integer obtained by converting the components of
 923        *    a two-component floating-point vector to the 16-bit floating-point
 924        *    representation found in the OpenGL ES Specification, and then packing
 925        *    these two 16-bit integers into a 32-bit unsigned integer.
 926        *
 927        *    The first vector component specifies the 16 least-significant bits
 928        *    of the result; the second component specifies the 16 most-significant
 929        *    bits.
 930        */
 931
 932       assert(vec2_rval->type == glsl_type::vec2_type);
 933
 934       /* vec2 f = VEC2_RVAL; */
 935       ir_variable *f = factory.make_temp(glsl_type::vec2_type,
 936                                          "tmp_pack_half_2x16_f");
 937       factory.emit(assign(f, vec2_rval));
 938
 939       /* uvec2 f32 = bitcast_f2u(f); */
 940       ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
 941                                             "tmp_pack_half_2x16_f32");
 942       factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
 943
 944       /* uvec2 f16; */
 945       ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
 946                                         "tmp_pack_half_2x16_f16");
 947
 948       /* Get f32's unshifted exponent bits.
 949        *
 950        *   uvec2 e = f32 & 0x7f800000u;
 951        */
 952       ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
 953                                           "tmp_pack_half_2x16_e");
 954       factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
 955
 956       /* Get f32's unshifted mantissa bits.
 957        *
 958        *   uvec2 m = f32 & 0x007fffffu;
 959        */
 960       ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
 961                                           "tmp_pack_half_2x16_m");
 962       factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
 963
 964       /* Set f16's exponent and mantissa bits.
 965        *
 966        *   f16.x = pack_half_1x16_nosign(f.x, e.x, m.x);
 967        *   f16.y = pack_half_1x16_nosign(f.y, e.y, m.y);
 968        */
 969       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
 970                                                      swizzle_x(e),
 971                                                      swizzle_x(m)),
 972                            WRITEMASK_X));
 973       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
 974                                                      swizzle_y(e),
 975                                                      swizzle_y(m)),
 976                            WRITEMASK_Y));
 977
 978       /* Set f16's sign bits (float32 bit 31 moves to float16 bit 15).
 979        *
 980        *   f16 |= (f32 & (1u << 31u)) >> 16u;
 981        */
 982       factory.emit(
 983          assign(f16, bit_or(f16,
 984                             rshift(bit_and(f32, constant(1u << 31u)),
 985                                    constant(16u)))));
 986
 987
 988       /* return (f16.y << 16u) | f16.x; */
 989       ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
 990                                         constant(16u)),
 991                                  swizzle_x(f16));
 992
 993       assert(result->type == glsl_type::uint_type);
 994       return result;
 995    }
 996
 997    /**
 998     * \brief Split packHalf2x16's vec2 operand into two floats.
 999     *
1000     * \param vec2_rval is packHalf2x16's input; its type must be vec2
1001     * \return a uint rvalue (the ir_binop_pack_half_2x16_split expression)
1002     *
1003     * Some code generators, such as the i965 fragment shader, require that all
1004     * vector expressions be lowered to a sequence of scalar expressions.
1005     * However, packHalf2x16 cannot be scalarized by the same mechanism as
1006     * a true vector operation because its input and output have a differing
1007     * number of vector components.
1008     *
1009     * This method scalarizes packHalf2x16 by transforming it from a unary
1010     * operation having vector input to a binary operation having scalar input.
1011     * That is, it transforms
1012     *
1013     *    packHalf2x16(VEC2_RVAL);
1014     *
1015     * into
1016     *
1017     *    vec2 v = VEC2_RVAL;
1018     *    return packHalf2x16_split(v.x, v.y);
1019     */
1020    ir_rvalue*
1021    split_pack_half_2x16(ir_rvalue *vec2_rval)
1022    {
1023       assert(vec2_rval->type == glsl_type::vec2_type);
1024
1025       ir_variable *v = factory.make_temp(glsl_type::vec2_type,
1026                                          "tmp_split_pack_half_2x16_v");
1027       factory.emit(assign(v, vec2_rval)); /* vec2 v = VEC2_RVAL; */
1028
1029       return expr(ir_binop_pack_half_2x16_split, swizzle_x(v), swizzle_y(v));
1030    }
1031
1032    /**
1033     * \brief Lower the component-wise calculation of unpackHalf2x16.
1034     *
1035     * Given a uint that encodes a float16 in its lower 16 bits, this function
1036     * returns a uint that encodes a float32 with the same value. The sign bit
1037     * of the float16 is ignored.
1038     *
1039     * \param e_rval is the unshifted exponent bits of a float16
1040     * \param m_rval is the unshifted mantissa bits of a float16
1041     * \return a uint rvalue that encodes a float32
1042     */
1043    ir_rvalue*
1044    unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
1045    {
1046       assert(e_rval->type == glsl_type::uint_type);
1047       assert(m_rval->type == glsl_type::uint_type);
1048
1049       /* uint u32; */
1050       ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
1051                                            "tmp_unpack_half_1x16_u32");
1052
1053       /* uint e = E_RVAL; */
1054       ir_variable *e = factory.make_temp(glsl_type::uint_type,
1055                                           "tmp_unpack_half_1x16_e");
1056       factory.emit(assign(e, e_rval));
1057
1058       /* uint m = M_RVAL; */
1059       ir_variable *m = factory.make_temp(glsl_type::uint_type,
1060                                           "tmp_unpack_half_1x16_m");
1061       factory.emit(assign(m, m_rval));
1062
1063       /* Preliminaries
1064        * -------------
1065        *
1066        * For a float16, the bit layout is:
1067        *
1068        *   sign:     15
1069        *   exponent: 10:14
1070        *   mantissa: 0:9
1071        *
1072        * Let f16 be a float16 value. The sign, exponent, and mantissa
1073        * determine its value thus:
1074        *
1075        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
1076        *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
1077        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1078        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
1079        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
1080        *
1081        * where 0 <= m16 < 2^10.
1082        *
1083        * For a float32, the bit layout is:
1084        *
1085        *   sign: 31
1086        *   exponent: 23:30
1087        *   mantissa: 0:22
1088        *
1089        * Let f32 be a float32 value. The sign, exponent, and mantissa
1090        * determine its value thus:
1091        *
1092        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
1093        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
1094        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1095        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
1096        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
1097        *
1098        * where 0 <= m32 < 2^23.
1099        *
1100        * Calculation
1101        * -----------
1102        * Our task is to compute s32, e32, m32 given f16.  Since this function
1103        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
1104        * cases to consider.
1105        */
1106
1107       factory.emit(
1108
1109          /* Case 1) f16 is zero or subnormal.
1110           *
1111           *   The simplest method of calculating f32 in this case is
1112           *
1113           *     f32 = f16                       (20)
1114           *         = 2^(-14) * (m16 / 2^10)    (21)
1115           *         = m16 / 2^24                (22)
1116           */
1117
1118          /* if (e16 == 0) { */
1119          if_tree(equal(e, constant(0u)),
1120
1121             /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1122             assign(u32, expr(ir_unop_bitcast_f2u,
1123                                 div(u2f(m), constant((float)(1 << 24))))),
1124
1125          /* Case 2) f16 is normal.
1126           *
1127           *   The equation
1128           *
1129           *     f32 = f16                              (30)
1130           *     2^(e32 - 127) * (1 + m32 / 2^23) =     (31)
1131           *       2^(e16 - 15) * (1 + m16 / 2^10)
1132           *
1133           *   can be decomposed into two
1134           *
1135           *     2^(e32 - 127) = 2^(e16 - 15)           (32)
1136           *     1 + m32 / 2^23 = 1 + m16 / 2^10        (33)
1137           *
1138           *   which solve to
1139           *
1140           *     e32 = e16 + 112                        (34)
1141           *     m32 = m16 * 2^13                       (35)
1142           */
1143
1144          /* } else if (e16 < 31) { */
1145          if_tree(less(e, constant(31u << 10u)),
1146
1147               /* u32 = ((e + (112 << 10)) | m) << 13;
1148                */
1149               assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
1150                                  constant(13u))),
1151
1152
1153          /* Case 3) f16 is infinite (e16 = 31 and m16 = 0, equation 4). */
1154          if_tree(equal(m, constant(0u)),
1155
1156                  assign(u32, constant(255u << 23u)),
1157
1158          /* Case 4) f16 is NaN. */
1159          /* } else { */
1160
1161             assign(u32, constant(0x7fffffffu))))));
1162
1163          /* } */
1164
1165       return deref(u32).val;
1166    }
1167
1168    /**
1169     * \brief Lower an unpackHalf2x16 expression.
1170     *
1171     * \param uint_rval is unpackHalf2x16's input; its type must be uint
1172     * \return unpackHalf2x16's output as a vec2 rvalue
1173     */
1174    ir_rvalue*
1175    lower_unpack_half_2x16(ir_rvalue *uint_rval)
1176    {
1177       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1178        *
1179        *    mediump vec2 unpackHalf2x16 (highp uint v)
1180        *    ------------------------------------------
1181        *    Returns a two-component floating-point vector with components
1182        *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1183        *    values, interpreting those values as 16-bit floating-point numbers
1184        *    according to the OpenGL ES Specification, and converting them to
1185        *    32-bit floating-point values.
1186        *
1187        *    The first component of the vector is obtained from the
1188        *    16 least-significant bits of v; the second component is obtained
1189        *    from the 16 most-significant bits of v.
1190        */
1191       assert(uint_rval->type == glsl_type::uint_type);
1192
1193       /* uint u = UINT_RVAL;
1194        * uvec2 f16 = uvec2(u & 0xffff, u >> 16);
1195        */
1196       ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1197                                             "tmp_unpack_half_2x16_f16");
1198       factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
1199
1200       /* uvec2 f32; */
1201       ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1202                                             "tmp_unpack_half_2x16_f32");
1203
1204       /* Get f16's unshifted exponent bits.
1205        *
1206        *    uvec2 e = f16 & 0x7c00u;
1207        */
1208       ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1209                                           "tmp_unpack_half_2x16_e");
1210       factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
1211
1212       /* Get f16's unshifted mantissa bits.
1213        *
1214        *    uvec2 m = f16 & 0x03ffu;
1215        */
1216       ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1217                                           "tmp_unpack_half_2x16_m");
1218       factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
1219
1220       /* Set f32's exponent and mantissa bits.
1221        *
1222        *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
1223        *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
1224        */
1225       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
1226                                                        swizzle_x(m)),
1227                            WRITEMASK_X));
1228       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
1229                                                        swizzle_y(m)),
1230                            WRITEMASK_Y));
1231
1232       /* Set f32's sign bit (float16 bit 15 moves to float32 bit 31).
1233        *
1234        *    f32 |= (f16 & 0x8000u) << 16u;
1235        */
1236       factory.emit(assign(f32, bit_or(f32,
1237                                        lshift(bit_and(f16,
1238                                                       constant(0x8000u)),
1239                                               constant(16u)))));
1240
1241       /* return bitcast_u2f(f32); */
1242       ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
1243       assert(result->type == glsl_type::vec2_type);
1244       return result;
1245    }
1246
1247    /**
1248     * \brief Split unpackHalf2x16 into two operations.
1249     *
1250     * \param uint_rval is unpackHalf2x16's input; its type must be uint
1251     * \return a vec2 rvalue (a dereference of the temporary v shown below)
1252     *
1253     * Some code generators, such as the i965 fragment shader, require that all
1254     * vector expressions be lowered to a sequence of scalar expressions.
1255     * However, unpackHalf2x16 cannot be scalarized by the same method as
1256     * a true vector operation because the number of components of its input
1257     * and output differ.
1258     *
1259     * This method scalarizes unpackHalf2x16 by transforming it from a single
1260     * operation having vec2 output to a pair of operations each having float
1261     * output. That is, it transforms
1262     *
1263     *   unpackHalf2x16(UINT_RVAL)
1264     *
1265     * into
1266     *
1267     *   uint u = UINT_RVAL;
1268     *   vec2 v;
1269     *
1270     *   v.x = unpackHalf2x16_split_x(u);
1271     *   v.y = unpackHalf2x16_split_y(u);
1272     *
1273     *   return v;
1274     */
1275    ir_rvalue*
1276    split_unpack_half_2x16(ir_rvalue *uint_rval)
1277    {
1278       assert(uint_rval->type == glsl_type::uint_type);
1279
1280       /* uint u = UINT_RVAL; */
1281       ir_variable *u = factory.make_temp(glsl_type::uint_type,
1282                                           "tmp_split_unpack_half_2x16_u");
1283       factory.emit(assign(u, uint_rval));
1284
1285       /* vec2 v; */
1286       ir_variable *v = factory.make_temp(glsl_type::vec2_type,
1287                                           "tmp_split_unpack_half_2x16_v");
1288
1289       /* v.x = unpack_half_2x16_split_x(u); */
1290       factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u),
1291                            WRITEMASK_X));
1292
1293       /* v.y = unpack_half_2x16_split_y(u); */
1294       factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u),
1295                            WRITEMASK_Y));
1296
1297       return deref(v).val;
1298    }
1299 };
1300
1301 } // namespace anonymous
1302
1303 /**
1304  * \brief Lower the builtin packing functions found in \c instructions.
1305  * \return the value of the visitor's get_progress() after the traversal.
1306  * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
1307  */
1308 bool
1309 lower_packing_builtins(exec_list *instructions, int op_mask)
1310 {
1311    lower_packing_builtins_visitor v(op_mask);
1312    visit_list_elements(&v, instructions, true);
1313    return v.get_progress(); /* presumably true iff something was lowered - confirm */
1314 }