src/glsl/lower_packing_builtins.cpp

   1 /*
   2  * Copyright © 2012 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 #include "ir.h"
  25 #include "ir_builder.h"
  26 #include "ir_optimization.h"
  27 #include "ir_rvalue_visitor.h"
  28
  29 namespace {
  30
  31 using namespace ir_builder;
  32
  33 /**
  34  * A visitor that lowers built-in floating-point pack/unpack expressions
  35  * such as packSnorm2x16.
  36  */
  37 class lower_packing_builtins_visitor : public ir_rvalue_visitor {
  38 public:
  39    /**
  40     * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
  41     */
  42    explicit lower_packing_builtins_visitor(int op_mask)
  43       : op_mask(op_mask),
  44         progress(false)
  45    {
  46       /* Mutually exclusive options. */
  47       assert(!((op_mask & LOWER_PACK_HALF_2x16) &&
  48                (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT)));
  49
  50       assert(!((op_mask & LOWER_UNPACK_HALF_2x16) &&
  51                (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT)));
  52
  53       factory.instructions = &factory_instructions;
  54    }
  55
  56    virtual ~lower_packing_builtins_visitor()
  57    {
  58       assert(factory_instructions.is_empty());
  59    }
  60
  61    bool get_progress() { return progress; }
  62
  63    void handle_rvalue(ir_rvalue **rvalue)
  64    {
  65       if (!*rvalue)
  66          return;
  67
  68       ir_expression *expr = (*rvalue)->as_expression();
  69       if (!expr)
  70          return;
  71
  72       enum lower_packing_builtins_op lowering_op =
  73          choose_lowering_op(expr->operation);
  74
  75       if (lowering_op == LOWER_PACK_UNPACK_NONE)
  76          return;
  77
  78       setup_factory(ralloc_parent(expr));
  79
  80       ir_rvalue *op0 = expr->operands[0];
  81       ralloc_steal(factory.mem_ctx, op0);
  82
  83       switch (lowering_op) {
  84       case LOWER_PACK_SNORM_2x16:
  85          *rvalue = lower_pack_snorm_2x16(op0);
  86          break;
  87       case LOWER_PACK_SNORM_4x8:
  88          *rvalue = lower_pack_snorm_4x8(op0);
  89          break;
  90       case LOWER_PACK_UNORM_2x16:
  91          *rvalue = lower_pack_unorm_2x16(op0);
  92          break;
  93       case LOWER_PACK_UNORM_4x8:
  94          *rvalue = lower_pack_unorm_4x8(op0);
  95          break;
  96       case LOWER_PACK_HALF_2x16:
  97          *rvalue = lower_pack_half_2x16(op0);
  98          break;
  99       case LOWER_PACK_HALF_2x16_TO_SPLIT:
 100          *rvalue = split_pack_half_2x16(op0);
 101          break;
 102       case LOWER_UNPACK_SNORM_2x16:
 103          *rvalue = lower_unpack_snorm_2x16(op0);
 104          break;
 105       case LOWER_UNPACK_SNORM_4x8:
 106          *rvalue = lower_unpack_snorm_4x8(op0);
 107          break;
 108       case LOWER_UNPACK_UNORM_2x16:
 109          *rvalue = lower_unpack_unorm_2x16(op0);
 110          break;
 111       case LOWER_UNPACK_UNORM_4x8:
 112          *rvalue = lower_unpack_unorm_4x8(op0);
 113          break;
 114       case LOWER_UNPACK_HALF_2x16:
 115          *rvalue = lower_unpack_half_2x16(op0);
 116          break;
 117       case LOWER_UNPACK_HALF_2x16_TO_SPLIT:
 118          *rvalue = split_unpack_half_2x16(op0);
 119          break;
 120       case LOWER_PACK_UNPACK_NONE:
 121       case LOWER_PACK_USE_BFI:
 122       case LOWER_PACK_USE_BFE:
 123          assert(!"not reached");
 124          break;
 125       }
 126
 127       teardown_factory();
 128       progress = true;
 129    }
 130
 131 private:
 132    const int op_mask;
 133    bool progress;
 134    ir_factory factory;
 135    exec_list factory_instructions;
 136
 137    /**
 138     * Determine the needed lowering operation by filtering \a expr_op
 139     * through \ref op_mask.
 140     */
 141    enum lower_packing_builtins_op
 142    choose_lowering_op(ir_expression_operation expr_op)
 143    {
 144       /* C++ regards int and enum as fundamentally different types.
 145        * So, we can't simply return from each case; we must cast the return
 146        * value.
 147        */
 148       int result;
 149
 150       switch (expr_op) {
 151       case ir_unop_pack_snorm_2x16:
 152          result = op_mask & LOWER_PACK_SNORM_2x16;
 153          break;
 154       case ir_unop_pack_snorm_4x8:
 155          result = op_mask & LOWER_PACK_SNORM_4x8;
 156          break;
 157       case ir_unop_pack_unorm_2x16:
 158          result = op_mask & LOWER_PACK_UNORM_2x16;
 159          break;
 160       case ir_unop_pack_unorm_4x8:
 161          result = op_mask & LOWER_PACK_UNORM_4x8;
 162          break;
 163       case ir_unop_pack_half_2x16:
 164          result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT);
 165          break;
 166       case ir_unop_unpack_snorm_2x16:
 167          result = op_mask & LOWER_UNPACK_SNORM_2x16;
 168          break;
 169       case ir_unop_unpack_snorm_4x8:
 170          result = op_mask & LOWER_UNPACK_SNORM_4x8;
 171          break;
 172       case ir_unop_unpack_unorm_2x16:
 173          result = op_mask & LOWER_UNPACK_UNORM_2x16;
 174          break;
 175       case ir_unop_unpack_unorm_4x8:
 176          result = op_mask & LOWER_UNPACK_UNORM_4x8;
 177          break;
 178       case ir_unop_unpack_half_2x16:
 179          result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT);
 180          break;
 181       default:
 182          result = LOWER_PACK_UNPACK_NONE;
 183          break;
 184       }
 185
 186       return static_cast<enum lower_packing_builtins_op>(result);
 187    }
 188
 189    void
 190    setup_factory(void *mem_ctx)
 191    {
 192       assert(factory.mem_ctx == NULL);
 193       assert(factory.instructions->is_empty());
 194
 195       factory.mem_ctx = mem_ctx;
 196    }
 197
 198    void
 199    teardown_factory()
 200    {
 201       base_ir->insert_before(factory.instructions);
 202       assert(factory.instructions->is_empty());
 203       factory.mem_ctx = NULL;
 204    }
 205
 206    template <typename T>
 207    ir_constant*
 208    constant(T x)
 209    {
 210       return factory.constant(x);
 211    }
 212
 213    /**
 214     * \brief Pack two uint16's into a single uint32.
 215     *
 216     * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
 217     * where the least significant bits specify the first element of the pair.
 218     * Return the uint32.
 219     */
 220    ir_rvalue*
 221    pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
 222    {
 223       assert(uvec2_rval->type == glsl_type::uvec2_type);
 224
 225       /* uvec2 u = UVEC2_RVAL; */
 226       ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
 227                                          "tmp_pack_uvec2_to_uint");
 228       factory.emit(assign(u, uvec2_rval));
 229
 230       if (op_mask & LOWER_PACK_USE_BFI) {
 231          return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
 232                                 swizzle_y(u),
 233                                 constant(16u),
 234                                 constant(16u));
 235       }
 236
 237       /* return (u.y << 16) | (u.x & 0xffff); */
 238       return bit_or(lshift(swizzle_y(u), constant(16u)),
 239                     bit_and(swizzle_x(u), constant(0xffffu)));
 240    }
 241
 242    /**
 243     * \brief Pack four uint8's into a single uint32.
 244     *
 245     * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
 246     * uint32 where the least significant bits specify the first element of the
 247     * 4-tuple. Return the uint32.
 248     */
 249    ir_rvalue*
 250    pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
 251    {
 252       assert(uvec4_rval->type == glsl_type::uvec4_type);
 253
 254       ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
 255                                          "tmp_pack_uvec4_to_uint");
 256
 257       if (op_mask & LOWER_PACK_USE_BFI) {
 258          /* uvec4 u = UVEC4_RVAL; */
 259          factory.emit(assign(u, uvec4_rval));
 260
 261          return bitfield_insert(bitfield_insert(
 262                                    bitfield_insert(
 263                                       bit_and(swizzle_x(u), constant(0xffu)),
 264                                       swizzle_y(u), constant(8u), constant(8u)),
 265                                    swizzle_z(u), constant(16u), constant(8u)),
 266                                 swizzle_w(u), constant(24u), constant(8u));
 267       }
 268
 269       /* uvec4 u = UVEC4_RVAL & 0xff */
 270       factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
 271
 272       /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
 273       return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
 274                            lshift(swizzle_z(u), constant(16u))),
 275                     bit_or(lshift(swizzle_y(u), constant(8u)),
 276                            swizzle_x(u)));
 277    }
 278
 279    /**
 280     * \brief Unpack a uint32 into two uint16's.
 281     *
 282     * Interpret the given uint32 as a uint16 pair where the uint32's least
 283     * significant bits specify the pair's first element. Return the uint16
 284     * pair as a uvec2.
 285     */
 286    ir_rvalue*
 287    unpack_uint_to_uvec2(ir_rvalue *uint_rval)
 288    {
 289       assert(uint_rval->type == glsl_type::uint_type);
 290
 291       /* uint u = UINT_RVAL; */
 292       ir_variable *u = factory.make_temp(glsl_type::uint_type,
 293                                           "tmp_unpack_uint_to_uvec2_u");
 294       factory.emit(assign(u, uint_rval));
 295
 296       /* uvec2 u2; */
 297       ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
 298                                            "tmp_unpack_uint_to_uvec2_u2");
 299
 300       /* u2.x = u & 0xffffu; */
 301       factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
 302
 303       /* u2.y = u >> 16u; */
 304       factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
 305
 306       return deref(u2).val;
 307    }
 308
 309    /**
 310     * \brief Unpack a uint32 into two int16's.
 311     *
 312     * Specifically each 16-bit value is sign-extended to the full width of an
 313     * int32 on return.
 314     */
 315    ir_rvalue *
 316    unpack_uint_to_ivec2(ir_rvalue *uint_rval)
 317    {
 318       assert(uint_rval->type == glsl_type::uint_type);
 319
 320       if (!(op_mask & LOWER_PACK_USE_BFE)) {
 321          return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
 322                               constant(16u)),
 323                        constant(16u));
 324       }
 325
 326       ir_variable *i = factory.make_temp(glsl_type::int_type,
 327                                          "tmp_unpack_uint_to_ivec2_i");
 328       factory.emit(assign(i, u2i(uint_rval)));
 329
 330       /* ivec2 i2; */
 331       ir_variable *i2 = factory.make_temp(glsl_type::ivec2_type,
 332                                           "tmp_unpack_uint_to_ivec2_i2");
 333
 334       factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
 335                           WRITEMASK_X));
 336       factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
 337                           WRITEMASK_Y));
 338
 339       return deref(i2).val;
 340    }
 341
 342    /**
 343     * \brief Unpack a uint32 into four uint8's.
 344     *
 345     * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
 346     * significant bits specify the 4-tuple's first element. Return the uint8
 347     * 4-tuple as a uvec4.
 348     */
 349    ir_rvalue*
 350    unpack_uint_to_uvec4(ir_rvalue *uint_rval)
 351    {
 352       assert(uint_rval->type == glsl_type::uint_type);
 353
 354       /* uint u = UINT_RVAL; */
 355       ir_variable *u = factory.make_temp(glsl_type::uint_type,
 356                                           "tmp_unpack_uint_to_uvec4_u");
 357       factory.emit(assign(u, uint_rval));
 358
 359       /* uvec4 u4; */
 360       ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
 361                                            "tmp_unpack_uint_to_uvec4_u4");
 362
 363       /* u4.x = u & 0xffu; */
 364       factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
 365
 366       if (op_mask & LOWER_PACK_USE_BFE) {
 367          /* u4.y = bitfield_extract(u, 8, 8); */
 368          factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
 369                              WRITEMASK_Y));
 370
 371          /* u4.z = bitfield_extract(u, 16, 8); */
 372          factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
 373                              WRITEMASK_Z));
 374       } else {
 375          /* u4.y = (u >> 8u) & 0xffu; */
 376          factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
 377                                          constant(0xffu)), WRITEMASK_Y));
 378
 379          /* u4.z = (u >> 16u) & 0xffu; */
 380          factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
 381                                          constant(0xffu)), WRITEMASK_Z));
 382       }
 383
 384       /* u4.w = (u >> 24u) */
 385       factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
 386
 387       return deref(u4).val;
 388    }
 389
 390    /**
 391     * \brief Unpack a uint32 into four int8's.
 392     *
 393     * Specifically each 8-bit value is sign-extended to the full width of an
 394     * int32 on return.
 395     */
 396    ir_rvalue *
 397    unpack_uint_to_ivec4(ir_rvalue *uint_rval)
 398    {
 399       assert(uint_rval->type == glsl_type::uint_type);
 400
 401       if (!(op_mask & LOWER_PACK_USE_BFE)) {
 402          return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
 403                               constant(24u)),
 404                        constant(24u));
 405       }
 406
 407       ir_variable *i = factory.make_temp(glsl_type::int_type,
 408                                          "tmp_unpack_uint_to_ivec4_i");
 409       factory.emit(assign(i, u2i(uint_rval)));
 410
 411       /* ivec4 i4; */
 412       ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type,
 413                                           "tmp_unpack_uint_to_ivec4_i4");
 414
 415       factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
 416                           WRITEMASK_X));
 417       factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
 418                           WRITEMASK_Y));
 419       factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
 420                           WRITEMASK_Z));
 421       factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
 422                           WRITEMASK_W));
 423
 424       return deref(i4).val;
 425    }
 426
 427    /**
 428     * \brief Lower a packSnorm2x16 expression.
 429     *
 430     * \param vec2_rval is packSnorm2x16's input
 431     * \return packSnorm2x16's output as a uint rvalue
 432     */
 433    ir_rvalue*
 434    lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
 435    {
 436       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
 437        *
 438        *    highp uint packSnorm2x16(vec2 v)
 439        *    --------------------------------
 440        *    First, converts each component of the normalized floating-point value
 441        *    v into 16-bit integer values. Then, the results are packed into the
 442        *    returned 32-bit unsigned integer.
 443        *
 444        *    The conversion for component c of v to fixed point is done as
 445        *    follows:
 446        *
 447        *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
 448        *
 449        *    The first component of the vector will be written to the least
 450        *    significant bits of the output; the last component will be written to
 451        *    the most significant bits.
 452        *
 453        * This function generates IR that approximates the following pseudo-GLSL:
 454        *
 455        *     return pack_uvec2_to_uint(
 456        *         uvec2(ivec2(
 457        *           round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
 458        *
 459        * It is necessary to first convert the vec2 to ivec2 rather than directly
 460        * converting vec2 to uvec2 because the latter conversion is undefined.
 461        * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
 462        * convert a negative floating point value to an uint".
 463        */
 464       assert(vec2_rval->type == glsl_type::vec2_type);
 465
 466       ir_rvalue *result = pack_uvec2_to_uint(
 467             i2u(f2i(round_even(mul(clamp(vec2_rval,
 468                                          constant(-1.0f),
 469                                          constant(1.0f)),
 470                                    constant(32767.0f))))));
 471
 472       assert(result->type == glsl_type::uint_type);
 473       return result;
 474    }
 475
 476    /**
 477     * \brief Lower a packSnorm4x8 expression.
 478     *
 479     * \param vec4_rval is packSnorm4x8's input
 480     * \return packSnorm4x8's output as a uint rvalue
 481     */
 482    ir_rvalue*
 483    lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
 484    {
 485       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 486        *
 487        *    highp uint packSnorm4x8(vec4 v)
 488        *    -------------------------------
 489        *    First, converts each component of the normalized floating-point value
 490        *    v into 8-bit integer values. Then, the results are packed into the
 491        *    returned 32-bit unsigned integer.
 492        *
 493        *    The conversion for component c of v to fixed point is done as
 494        *    follows:
 495        *
 496        *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
 497        *
 498        *    The first component of the vector will be written to the least
 499        *    significant bits of the output; the last component will be written to
 500        *    the most significant bits.
 501        *
 502        * This function generates IR that approximates the following pseudo-GLSL:
 503        *
 504        *     return pack_uvec4_to_uint(
 505        *         uvec4(ivec4(
 506        *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
 507        *
 508        * It is necessary to first convert the vec4 to ivec4 rather than directly
 509        * converting vec4 to uvec4 because the latter conversion is undefined.
 510        * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
 511        * convert a negative floating point value to an uint".
 512        */
 513       assert(vec4_rval->type == glsl_type::vec4_type);
 514
 515       ir_rvalue *result = pack_uvec4_to_uint(
 516             i2u(f2i(round_even(mul(clamp(vec4_rval,
 517                                          constant(-1.0f),
 518                                          constant(1.0f)),
 519                                    constant(127.0f))))));
 520
 521       assert(result->type == glsl_type::uint_type);
 522       return result;
 523    }
 524
 525    /**
 526     * \brief Lower an unpackSnorm2x16 expression.
 527     *
 528     * \param uint_rval is unpackSnorm2x16's input
 529     * \return unpackSnorm2x16's output as a vec2 rvalue
 530     */
 531    ir_rvalue*
 532    lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
 533    {
 534       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
 535        *
 536        *    highp vec2 unpackSnorm2x16 (highp uint p)
 537        *    -----------------------------------------
 538        *    First, unpacks a single 32-bit unsigned integer p into a pair of
 539        *    16-bit unsigned integers. Then, each component is converted to
 540        *    a normalized floating-point value to generate the returned
 541        *    two-component vector.
 542        *
 543        *    The conversion for unpacked fixed-point value f to floating point is
 544        *    done as follows:
 545        *
 546        *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
 547        *
 548        *    The first component of the returned vector will be extracted from the
 549        *    least significant bits of the input; the last component will be
 550        *    extracted from the most significant bits.
 551        *
 552        * This function generates IR that approximates the following pseudo-GLSL:
 553        *
 554        *    return clamp(
 555        *       ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
 556        *       -1.0f, 1.0f);
 557        *
 558        * The above IR may appear unnecessarily complex, but the intermediate
 559        * conversion to ivec2 and the bit shifts are necessary to correctly unpack
 560        * negative floats.
 561        *
 562        * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
 563        * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
 564        * place that int16 into an int32, which results in the *positive* integer
 565        * 0x0000ffff.  The int16's sign bit becomes, in the int32, the rather
 566        * unimportant bit 16. We must now extend the int16's sign bit into bits
 567        * 17-32, which is accomplished by left-shifting then right-shifting.
 568        */
 569
 570       assert(uint_rval->type == glsl_type::uint_type);
 571
 572       ir_rvalue *result =
 573         clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
 574                   constant(32767.0f)),
 575               constant(-1.0f),
 576               constant(1.0f));
 577
 578       assert(result->type == glsl_type::vec2_type);
 579       return result;
 580    }
 581
 582    /**
 583     * \brief Lower an unpackSnorm4x8 expression.
 584     *
 585     * \param uint_rval is unpackSnorm4x8's input
 586     * \return unpackSnorm4x8's output as a vec4 rvalue
 587     */
 588    ir_rvalue*
 589    lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
 590    {
 591       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 592        *
 593        *    highp vec4 unpackSnorm4x8 (highp uint p)
 594        *    ----------------------------------------
 595        *    First, unpacks a single 32-bit unsigned integer p into four
 596        *    8-bit unsigned integers. Then, each component is converted to
 597        *    a normalized floating-point value to generate the returned
 598        *    four-component vector.
 599        *
 600        *    The conversion for unpacked fixed-point value f to floating point is
 601        *    done as follows:
 602        *
 603        *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
 604        *
 605        *    The first component of the returned vector will be extracted from the
 606        *    least significant bits of the input; the last component will be
 607        *    extracted from the most significant bits.
 608        *
 609        * This function generates IR that approximates the following pseudo-GLSL:
 610        *
 611        *    return clamp(
 612        *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
 613        *       -1.0f, 1.0f);
 614        *
 615        * The above IR may appear unnecessarily complex, but the intermediate
 616        * conversion to ivec4 and the bit shifts are necessary to correctly unpack
 617        * negative floats.
 618        *
 619        * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
 620        * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
 621        * place that int8 into an int32, which results in the *positive* integer
 622        * 0x000000ff.  The int8's sign bit becomes, in the int32, the rather
 623        * unimportant bit 8. We must now extend the int8's sign bit into bits
 624        * 9-32, which is accomplished by left-shifting then right-shifting.
 625        */
 626
 627       assert(uint_rval->type == glsl_type::uint_type);
 628
 629       ir_rvalue *result =
 630         clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
 631                   constant(127.0f)),
 632               constant(-1.0f),
 633               constant(1.0f));
 634
 635       assert(result->type == glsl_type::vec4_type);
 636       return result;
 637    }
 638
 639    /**
 640     * \brief Lower a packUnorm2x16 expression.
 641     *
 642     * \param vec2_rval is packUnorm2x16's input
 643     * \return packUnorm2x16's output as a uint rvalue
 644     */
 645    ir_rvalue*
 646    lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
 647    {
 648       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
 649        *
 650        *    highp uint packUnorm2x16 (vec2 v)
 651        *    ---------------------------------
 652        *    First, converts each component of the normalized floating-point value
 653        *    v into 16-bit integer values. Then, the results are packed into the
 654        *    returned 32-bit unsigned integer.
 655        *
 656        *    The conversion for component c of v to fixed point is done as
 657        *    follows:
 658        *
 659        *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
 660        *
 661        *    The first component of the vector will be written to the least
 662        *    significant bits of the output; the last component will be written to
 663        *    the most significant bits.
 664        *
 665        * This function generates IR that approximates the following pseudo-GLSL:
 666        *
 667        *     return pack_uvec2_to_uint(uvec2(
 668        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
 669        *
 670        * Here it is safe to directly convert the vec2 to uvec2 because the vec2
 671        * has been clamped to a non-negative range.
 672        */
 673
 674       assert(vec2_rval->type == glsl_type::vec2_type);
 675
 676       ir_rvalue *result = pack_uvec2_to_uint(
 677          f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
 678
 679       assert(result->type == glsl_type::uint_type);
 680       return result;
 681    }
 682
 683    /**
 684     * \brief Lower a packUnorm4x8 expression.
 685     *
 686     * \param vec4_rval is packUnorm4x8's input
 687     * \return packUnorm4x8's output as a uint rvalue
 688     */
 689    ir_rvalue*
 690    lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
 691    {
 692       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 693        *
 694        *    highp uint packUnorm4x8 (vec4 v)
 695        *    --------------------------------
 696        *    First, converts each component of the normalized floating-point value
 697        *    v into 8-bit integer values. Then, the results are packed into the
 698        *    returned 32-bit unsigned integer.
 699        *
 700        *    The conversion for component c of v to fixed point is done as
 701        *    follows:
 702        *
 703        *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
 704        *
 705        *    The first component of the vector will be written to the least
 706        *    significant bits of the output; the last component will be written to
 707        *    the most significant bits.
 708        *
 709        * This function generates IR that approximates the following pseudo-GLSL:
 710        *
 711        *     return pack_uvec4_to_uint(uvec4(
 712        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
 713        *
 714        * Here it is safe to directly convert the vec4 to uvec4 because the vec4
 715        * has been clamped to a non-negative range.
 716        */
 717
 718       assert(vec4_rval->type == glsl_type::vec4_type);
 719
 720       ir_rvalue *result = pack_uvec4_to_uint(
 721          f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
 722
 723       assert(result->type == glsl_type::uint_type);
 724       return result;
 725    }
 726
 727    /**
 728     * \brief Lower an unpackUnorm2x16 expression.
 729     *
 730     * \param uint_rval is unpackUnorm2x16's input
 731     * \return unpackUnorm2x16's output as a vec2 rvalue
 732     */
 733    ir_rvalue*
 734    lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
 735    {
 736       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
 737        *
 738        *    highp vec2 unpackUnorm2x16 (highp uint p)
 739        *    -----------------------------------------
 740        *    First, unpacks a single 32-bit unsigned integer p into a pair of
 741        *    16-bit unsigned integers. Then, each component is converted to
 742        *    a normalized floating-point value to generate the returned
 743        *    two-component vector.
 744        *
 745        *    The conversion for unpacked fixed-point value f to floating point is
 746        *    done as follows:
 747        *
 748        *       unpackUnorm2x16: f / 65535.0
 749        *
 750        *    The first component of the returned vector will be extracted from the
 751        *    least significant bits of the input; the last component will be
 752        *    extracted from the most significant bits.
 753        *
 754        * This function generates IR that approximates the following pseudo-GLSL:
 755        *
 756        *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
 757        */
 758
 759       assert(uint_rval->type == glsl_type::uint_type);
 760
 761       ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
 762                               constant(65535.0f));
 763
 764       assert(result->type == glsl_type::vec2_type);
 765       return result;
 766    }
 767
 768    /**
 769     * \brief Lower an unpackUnorm4x8 expression.
 770     *
 771     * \param uint_rval is unpackUnorm4x8's input
 772     * \return unpackUnorm4x8's output as a vec4 rvalue
 773     */
 774    ir_rvalue*
 775    lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
 776    {
 777       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
 778        *
 779        *    highp vec4 unpackUnorm4x8 (highp uint p)
 780        *    ----------------------------------------
 781        *    First, unpacks a single 32-bit unsigned integer p into four
 782        *    8-bit unsigned integers. Then, each component is converted to
 783        *    a normalized floating-point value to generate the returned
 784        *    two-component vector.
 785        *
 786        *    The conversion for unpacked fixed-point value f to floating point is
 787        *    done as follows:
 788        *
 789        *       unpackUnorm4x8: f / 255.0
 790        *
 791        *    The first component of the returned vector will be extracted from the
 792        *    least significant bits of the input; the last component will be
 793        *    extracted from the most significant bits.
 794        *
 795        * This function generates IR that approximates the following pseudo-GLSL:
 796        *
 797        *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
 798        */
 799
 800       assert(uint_rval->type == glsl_type::uint_type);
 801
 802       ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
 803                               constant(255.0f));
 804
 805       assert(result->type == glsl_type::vec4_type);
 806       return result;
 807    }
 808
 809    /**
 810     * \brief Lower the component-wise calculation of packHalf2x16.
 811     *
 812     * \param f_rval is one component of packHafl2x16's input
 813     * \param e_rval is the unshifted exponent bits of f_rval
 814     * \param m_rval is the unshifted mantissa bits of f_rval
 815     *
 816     * \return a uint rvalue that encodes a float16 in its lower 16 bits
 817     */
 818    ir_rvalue*
 819    pack_half_1x16_nosign(ir_rvalue *f_rval,
 820                          ir_rvalue *e_rval,
 821                          ir_rvalue *m_rval)
 822    {
 823       assert(e_rval->type == glsl_type::uint_type);
 824       assert(m_rval->type == glsl_type::uint_type);
 825
 826       /* uint u16; */
 827       ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
 828                                            "tmp_pack_half_1x16_u16");
 829
 830       /* float f = FLOAT_RVAL; */
 831       ir_variable *f = factory.make_temp(glsl_type::float_type,
 832                                           "tmp_pack_half_1x16_f");
 833       factory.emit(assign(f, f_rval));
 834
 835       /* uint e = E_RVAL; */
 836       ir_variable *e = factory.make_temp(glsl_type::uint_type,
 837                                           "tmp_pack_half_1x16_e");
 838       factory.emit(assign(e, e_rval));
 839
 840       /* uint m = M_RVAL; */
 841       ir_variable *m = factory.make_temp(glsl_type::uint_type,
 842                                           "tmp_pack_half_1x16_m");
 843       factory.emit(assign(m, m_rval));
 844
 845       /* Preliminaries
 846        * -------------
 847        *
 848        * For a float16, the bit layout is:
 849        *
 850        *   sign:     15
 851        *   exponent: 10:14
 852        *   mantissa: 0:9
 853        *
 854        * Let f16 be a float16 value. The sign, exponent, and mantissa
 855        * determine its value thus:
 856        *
 857        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
 858        *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
 859        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
 860        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
 861        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
 862        *
 863        * where 0 <= m16 < 2^10.
 864        *
 865        * For a float32, the bit layout is:
 866        *
 867        *   sign:     31
 868        *   exponent: 23:30
 869        *   mantissa: 0:22
 870        *
 871        * Let f32 be a float32 value. The sign, exponent, and mantissa
 872        * determine its value thus:
 873        *
 874        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
 875        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
 876        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
 877        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
 878        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
 879        *
 880        * where 0 <= m32 < 2^23.
 881        *
 882        * The minimum and maximum normal float16 values are
 883        *
 884        *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)   (20)
 885        *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)         (21)
 886        *
 887        * The step at max_norm16 is
 888        *
 889        *   max_step16 = 2^5                                     (22)
 890        *
 891        * Observe that the float16 boundary values in equations 20-21 lie in the
 892        * range of normal float32 values.
 893        *
 894        *
 895        * Rounding Behavior
 896        * -----------------
 897        * Not all float32 values can be exactly represented as a float16. We
 898        * round all such intermediate float32 values to the nearest float16; if
 899        * the float32 is exactly between to float16 values, we round to the one
 900        * with an even mantissa. This rounding behavior has several benefits:
 901        *
 902        *   - It has no sign bias.
 903        *
 904        *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
 905        *     GPU ISA.
 906        *
 907        *   - By reproducing the behavior of the GPU (at least on Intel hardware),
 908        *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
 909        *     result in the same value as if the expression were executed on the
 910        *     GPU.
 911        *
 912        * Calculation
 913        * -----------
 914        * Our task is to compute s16, e16, m16 given f32.  Since this function
 915        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
 916        * cases consider.
 917        */
 918
 919       factory.emit(
 920
 921          /* Case 1) f32 is NaN
 922           *
 923           *   The resultant f16 will also be NaN.
 924           */
 925
 926          /* if (e32 == 255 && m32 != 0) { */
 927          if_tree(logic_and(equal(e, constant(0xffu << 23u)),
 928                            logic_not(equal(m, constant(0u)))),
 929
 930             assign(u16, constant(0x7fffu)),
 931
 932          /* Case 2) f32 lies in the range [0, min_norm16).
 933           *
 934           *   The resultant float16 will be either zero, subnormal, or normal.
 935           *
 936           *   Solving
 937           *
 938           *     f32 = min_norm16       (30)
 939           *
 940           *   gives
 941           *
 942           *     e32 = 113 and m32 = 0  (31)
 943           *
 944           *   Therefore this case occurs if and only if
 945           *
 946           *     e32 < 113              (32)
 947           */
 948
 949          /* } else if (e32 < 113) { */
 950          if_tree(less(e, constant(113u << 23u)),
 951
 952             /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
 953             assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
 954                                            constant((float) (1 << 24)))))),
 955
 956          /* Case 3) f32 lies in the range
 957           *         [min_norm16, max_norm16 + max_step16).
 958           *
 959           *   The resultant float16 will be either normal or infinite.
 960           *
 961           *   Solving
 962           *
 963           *     f32 = max_norm16 + max_step16           (40)
 964           *         = 2^15 * (1 + 1023 / 2^10) + 2^5    (41)
 965           *         = 2^16                              (42)
 966           *   gives
 967           *
 968           *     e32 = 143 and m32 = 0                   (43)
 969           *
 970           *   We already solved the boundary condition f32 = min_norm16 above
 971           *   in equation 31. Therefore this case occurs if and only if
 972           *
 973           *     113 <= e32 and e32 < 143
 974           */
 975
 976          /* } else if (e32 < 143) { */
 977          if_tree(less(e, constant(143u << 23u)),
 978
 979             /* The addition below handles the case where the mantissa rounds
 980              * up to 1024 and bumps the exponent.
 981              *
 982              * u16 = ((e - (112u << 23u)) >> 13u)
 983              *     + round_to_even((float(m) / (1u << 13u));
 984              */
 985             assign(u16, add(rshift(sub(e, constant(112u << 23u)),
 986                                    constant(13u)),
 987                             f2u(round_even(
 988                                   div(u2f(m), constant((float) (1 << 13))))))),
 989
 990          /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
 991           *
 992           *   The resultant float16 will be infinite.
 993           *
 994           *   The cases above caught all float32 values in the range
 995           *   [0, max_norm16 + max_step16), so this is the fall-through case.
 996           */
 997
 998          /* } else { */
 999
1000             assign(u16, constant(31u << 10u))))));
1001
1002          /* } */
1003
1004        return deref(u16).val;
1005    }
1006
1007    /**
1008     * \brief Lower a packHalf2x16 expression.
1009     *
1010     * \param vec2_rval is packHalf2x16's input
1011     * \return packHalf2x16's output as a uint rvalue
1012     */
1013    ir_rvalue*
1014    lower_pack_half_2x16(ir_rvalue *vec2_rval)
1015    {
1016       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1017        *
1018        *    highp uint packHalf2x16 (mediump vec2 v)
1019        *    ----------------------------------------
1020        *    Returns an unsigned integer obtained by converting the components of
1021        *    a two-component floating-point vector to the 16-bit floating-point
1022        *    representation found in the OpenGL ES Specification, and then packing
1023        *    these two 16-bit integers into a 32-bit unsigned integer.
1024        *
1025        *    The first vector component specifies the 16 least- significant bits
1026        *    of the result; the second component specifies the 16 most-significant
1027        *    bits.
1028        */
1029
1030       assert(vec2_rval->type == glsl_type::vec2_type);
1031
1032       /* vec2 f = VEC2_RVAL; */
1033       ir_variable *f = factory.make_temp(glsl_type::vec2_type,
1034                                          "tmp_pack_half_2x16_f");
1035       factory.emit(assign(f, vec2_rval));
1036
1037       /* uvec2 f32 = bitcast_f2u(f); */
1038       ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1039                                             "tmp_pack_half_2x16_f32");
1040       factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
1041
1042       /* uvec2 f16; */
1043       ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1044                                         "tmp_pack_half_2x16_f16");
1045
1046       /* Get f32's unshifted exponent bits.
1047        *
1048        *   uvec2 e = f32 & 0x7f800000u;
1049        */
1050       ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1051                                           "tmp_pack_half_2x16_e");
1052       factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
1053
1054       /* Get f32's unshifted mantissa bits.
1055        *
1056        *   uvec2 m = f32 & 0x007fffffu;
1057        */
1058       ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1059                                           "tmp_pack_half_2x16_m");
1060       factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
1061
1062       /* Set f16's exponent and mantissa bits.
1063        *
1064        *   f16.x = pack_half_1x16_nosign(e.x, m.x);
1065        *   f16.y = pack_half_1y16_nosign(e.y, m.y);
1066        */
1067       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
1068                                                      swizzle_x(e),
1069                                                      swizzle_x(m)),
1070                            WRITEMASK_X));
1071       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
1072                                                      swizzle_y(e),
1073                                                      swizzle_y(m)),
1074                            WRITEMASK_Y));
1075
1076       /* Set f16's sign bits.
1077        *
1078        *   f16 |= (f32 & (1u << 31u) >> 16u;
1079        */
1080       factory.emit(
1081          assign(f16, bit_or(f16,
1082                             rshift(bit_and(f32, constant(1u << 31u)),
1083                                    constant(16u)))));
1084
1085
1086       /* return (f16.y << 16u) | f16.x; */
1087       ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
1088                                         constant(16u)),
1089                                  swizzle_x(f16));
1090
1091       assert(result->type == glsl_type::uint_type);
1092       return result;
1093    }
1094
1095    /**
1096     * \brief Split packHalf2x16's vec2 operand into two floats.
1097     *
1098     * \param vec2_rval is packHalf2x16's input
1099     * \return a uint rvalue
1100     *
1101     * Some code generators, such as the i965 fragment shader, require that all
1102     * vector expressions be lowered to a sequence of scalar expressions.
1103     * However, packHalf2x16 cannot be scalarized by the same mechanism as
1104     * a true vector operation because its input and output have a differing
1105     * number of vector components.
1106     *
1107     * This method scalarizes packHalf2x16 by transforming it from an unary
1108     * operation having vector input to a binary operation having scalar input.
1109     * That is, it transforms
1110     *
1111     *    packHalf2x16(VEC2_RVAL);
1112     *
1113     * into
1114     *
1115     *    vec2 v = VEC2_RVAL;
1116     *    return packHalf2x16_split(v.x, v.y);
1117     */
1118    ir_rvalue*
1119    split_pack_half_2x16(ir_rvalue *vec2_rval)
1120    {
1121       assert(vec2_rval->type == glsl_type::vec2_type);
1122
1123       ir_variable *v = factory.make_temp(glsl_type::vec2_type,
1124                                          "tmp_split_pack_half_2x16_v");
1125       factory.emit(assign(v, vec2_rval));
1126
1127       return expr(ir_binop_pack_half_2x16_split, swizzle_x(v), swizzle_y(v));
1128    }
1129
1130    /**
1131     * \brief Lower the component-wise calculation of unpackHalf2x16.
1132     *
1133     * Given a uint that encodes a float16 in its lower 16 bits, this function
1134     * returns a uint that encodes a float32 with the same value. The sign bit
1135     * of the float16 is ignored.
1136     *
1137     * \param e_rval is the unshifted exponent bits of a float16
1138     * \param m_rval is the unshifted mantissa bits of a float16
1139     * \param a uint rvalue that encodes a float32
1140     */
1141    ir_rvalue*
1142    unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
1143    {
1144       assert(e_rval->type == glsl_type::uint_type);
1145       assert(m_rval->type == glsl_type::uint_type);
1146
1147       /* uint u32; */
1148       ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
1149                                            "tmp_unpack_half_1x16_u32");
1150
1151       /* uint e = E_RVAL; */
1152       ir_variable *e = factory.make_temp(glsl_type::uint_type,
1153                                           "tmp_unpack_half_1x16_e");
1154       factory.emit(assign(e, e_rval));
1155
1156       /* uint m = M_RVAL; */
1157       ir_variable *m = factory.make_temp(glsl_type::uint_type,
1158                                           "tmp_unpack_half_1x16_m");
1159       factory.emit(assign(m, m_rval));
1160
1161       /* Preliminaries
1162        * -------------
1163        *
1164        * For a float16, the bit layout is:
1165        *
1166        *   sign:     15
1167        *   exponent: 10:14
1168        *   mantissa: 0:9
1169        *
1170        * Let f16 be a float16 value. The sign, exponent, and mantissa
1171        * determine its value thus:
1172        *
1173        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
1174        *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
1175        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1176        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
1177        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
1178        *
1179        * where 0 <= m16 < 2^10.
1180        *
1181        * For a float32, the bit layout is:
1182        *
1183        *   sign: 31
1184        *   exponent: 23:30
1185        *   mantissa: 0:22
1186        *
1187        * Let f32 be a float32 value. The sign, exponent, and mantissa
1188        * determine its value thus:
1189        *
1190        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
1191        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
1192        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1193        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
1194        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
1195        *
1196        * where 0 <= m32 < 2^23.
1197        *
1198        * Calculation
1199        * -----------
1200        * Our task is to compute s32, e32, m32 given f16.  Since this function
1201        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
1202        * cases consider.
1203        */
1204
1205       factory.emit(
1206
1207          /* Case 1) f16 is zero or subnormal.
1208           *
1209           *   The simplest method of calcuating f32 in this case is
1210           *
1211           *     f32 = f16                       (20)
1212           *         = 2^(-14) * (m16 / 2^10)    (21)
1213           *         = m16 / 2^(-24)             (22)
1214           */
1215
1216          /* if (e16 == 0) { */
1217          if_tree(equal(e, constant(0u)),
1218
1219             /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1220             assign(u32, expr(ir_unop_bitcast_f2u,
1221                                 div(u2f(m), constant((float)(1 << 24))))),
1222
1223          /* Case 2) f16 is normal.
1224           *
1225           *   The equation
1226           *
1227           *     f32 = f16                              (30)
1228           *     2^(e32 - 127) * (1 + m32 / 2^23) =     (31)
1229           *       2^(e16 - 15) * (1 + m16 / 2^10)
1230           *
1231           *   can be decomposed into two
1232           *
1233           *     2^(e32 - 127) = 2^(e16 - 15)           (32)
1234           *     1 + m32 / 2^23 = 1 + m16 / 2^10        (33)
1235           *
1236           *   which solve to
1237           *
1238           *     e32 = e16 + 112                        (34)
1239           *     m32 = m16 * 2^13                       (35)
1240           */
1241
1242          /* } else if (e16 < 31)) { */
1243          if_tree(less(e, constant(31u << 10u)),
1244
1245               /* u32 = ((e + (112 << 10)) | m) << 13;
1246                */
1247               assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
1248                                  constant(13u))),
1249
1250
1251          /* Case 3) f16 is infinite. */
1252          if_tree(equal(m, constant(0u)),
1253
1254                  assign(u32, constant(255u << 23u)),
1255
1256          /* Case 4) f16 is NaN. */
1257          /* } else { */
1258
1259             assign(u32, constant(0x7fffffffu))))));
1260
1261          /* } */
1262
1263       return deref(u32).val;
1264    }
1265
1266    /**
1267     * \brief Lower an unpackHalf2x16 expression.
1268     *
1269     * \param uint_rval is unpackHalf2x16's input
1270     * \return unpackHalf2x16's output as a vec2 rvalue
1271     */
1272    ir_rvalue*
1273    lower_unpack_half_2x16(ir_rvalue *uint_rval)
1274    {
1275       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1276        *
1277        *    mediump vec2 unpackHalf2x16 (highp uint v)
1278        *    ------------------------------------------
1279        *    Returns a two-component floating-point vector with components
1280        *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1281        *    values, interpreting those values as 16-bit floating-point numbers
1282        *    according to the OpenGL ES Specification, and converting them to
1283        *    32-bit floating-point values.
1284        *
1285        *    The first component of the vector is obtained from the
1286        *    16 least-significant bits of v; the second component is obtained
1287        *    from the 16 most-significant bits of v.
1288        */
1289       assert(uint_rval->type == glsl_type::uint_type);
1290
1291       /* uint u = RVALUE;
1292        * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16);
1293        */
1294       ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1295                                             "tmp_unpack_half_2x16_f16");
1296       factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
1297
1298       /* uvec2 f32; */
1299       ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1300                                             "tmp_unpack_half_2x16_f32");
1301
1302       /* Get f16's unshifted exponent bits.
1303        *
1304        *    uvec2 e = f16 & 0x7c00u;
1305        */
1306       ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1307                                           "tmp_unpack_half_2x16_e");
1308       factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
1309
1310       /* Get f16's unshifted mantissa bits.
1311        *
1312        *    uvec2 m = f16 & 0x03ffu;
1313        */
1314       ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1315                                           "tmp_unpack_half_2x16_m");
1316       factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
1317
1318       /* Set f32's exponent and mantissa bits.
1319        *
1320        *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
1321        *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
1322        */
1323       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
1324                                                        swizzle_x(m)),
1325                            WRITEMASK_X));
1326       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
1327                                                        swizzle_y(m)),
1328                            WRITEMASK_Y));
1329
1330       /* Set f32's sign bit.
1331        *
1332        *    f32 |= (f16 & 0x8000u) << 16u;
1333        */
1334       factory.emit(assign(f32, bit_or(f32,
1335                                        lshift(bit_and(f16,
1336                                                       constant(0x8000u)),
1337                                               constant(16u)))));
1338
1339       /* return bitcast_u2f(f32); */
1340       ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
1341       assert(result->type == glsl_type::vec2_type);
1342       return result;
1343    }
1344
1345    /**
1346     * \brief Split unpackHalf2x16 into two operations.
1347     *
1348     * \param uint_rval is unpackHalf2x16's input
1349     * \return a vec2 rvalue
1350     *
1351     * Some code generators, such as the i965 fragment shader, require that all
1352     * vector expressions be lowered to a sequence of scalar expressions.
1353     * However, unpackHalf2x16 cannot be scalarized by the same method as
1354     * a true vector operation because the number of components of its input
1355     * and output differ.
1356     *
1357     * This method scalarizes unpackHalf2x16 by transforming it from a single
1358     * operation having vec2 output to a pair of operations each having float
1359     * output. That is, it transforms
1360     *
1361     *   unpackHalf2x16(UINT_RVAL)
1362     *
1363     * into
1364     *
1365     *   uint u = UINT_RVAL;
1366     *   vec2 v;
1367     *
1368     *   v.x = unpackHalf2x16_split_x(u);
1369     *   v.y = unpackHalf2x16_split_y(u);
1370     *
1371     *   return v;
1372     */
1373    ir_rvalue*
1374    split_unpack_half_2x16(ir_rvalue *uint_rval)
1375    {
1376       assert(uint_rval->type == glsl_type::uint_type);
1377
1378       /* uint u = uint_rval; */
1379       ir_variable *u = factory.make_temp(glsl_type::uint_type,
1380                                           "tmp_split_unpack_half_2x16_u");
1381       factory.emit(assign(u, uint_rval));
1382
1383       /* vec2 v; */
1384       ir_variable *v = factory.make_temp(glsl_type::vec2_type,
1385                                           "tmp_split_unpack_half_2x16_v");
1386
1387       /* v.x = unpack_half_2x16_split_x(u); */
1388       factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u),
1389                            WRITEMASK_X));
1390
1391       /* v.y = unpack_half_2x16_split_y(u); */
1392       factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u),
1393                            WRITEMASK_Y));
1394
1395       return deref(v).val;
1396    }
1397 };
1398
1399 } // namespace anonymous
1400
1401 /**
1402  * \brief Lower the builtin packing functions.
1403  *
1404  * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
1405  */
1406 bool
1407 lower_packing_builtins(exec_list *instructions, int op_mask)
1408 {
1409    lower_packing_builtins_visitor v(op_mask);
1410    visit_list_elements(&v, instructions, true);
1411    return v.get_progress();
1412 }