From 89285e4d47a65e52547180dca46ecfd81b2996e9 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 23 Jan 2015 13:38:46 -0800
Subject: [PATCH] nir: add new constant folding infrastructure

Add a required field to the Opcode class, const_expr, that contains an
expression or statement that computes the result of the opcode given known
constant inputs. Then take those const_expr's and expand them into a function
that takes an opcode and an array of constant inputs and spits out the constant
result. This means that when adding opcodes, there's one less place to update,
and almost all the opcodes are self-documenting since the information on how to
compute the result is right next to the definition.

The helper functions in nir_constant_expressions.c were taken from
ir_constant_expressions.cpp.

v3 Jason Ekstrand <jason.ekstrand@iastate.edu>
 - Use mako to generate one function per opcode instead of doing piles of
   string splicing

v4 Jason Ekstrand <jason.ekstrand@iastate.edu>
 - More comments and better indentation in the mako
 - Add a description of the constant expression language in nir_opcodes.py
 - Added nir_constant_expressions.py to EXTRA_DIST in Makefile.am

Signed-off-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
---
 src/glsl/Makefile.am                     |   6 +
 src/glsl/Makefile.sources                |   1 +
 src/glsl/nir/.gitignore                  |   1 +
 src/glsl/nir/nir_constant_expressions.h  |  31 ++
 src/glsl/nir/nir_constant_expressions.py | 352 ++++++++++++++
 src/glsl/nir/nir_opcodes.py              | 580 ++++++++++++++++-------
 6 files changed, 787 insertions(+), 184 deletions(-)
 create mode 100644 src/glsl/nir/nir_constant_expressions.h
 create mode 100644 src/glsl/nir/nir_constant_expressions.py

diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
index 1691c46c4fe..e89a9ad5d89 100644
--- a/src/glsl/Makefile.am
+++ b/src/glsl/Makefile.am
@@ -40,6 +40,7 @@ EXTRA_DIST = tests glcpp/tests README TODO glcpp/README	\
 	glcpp/glcpp-lex.l				\
 	glcpp/glcpp-parse.y				\
 	nir/nir_algebraic.py				\
+	nir/nir_constant_expressions.py			\
 	nir/nir_opcodes.py				\
 	nir/nir_opcodes_c.py				\
 	nir/nir_opcodes_h.py				\
@@ -215,6 +216,7 @@ BUILT_SOURCES =						\
 	glsl_lexer.cpp					\
 	glcpp/glcpp-parse.c				\
 	glcpp/glcpp-lex.c				\
+	nir/nir_constant_expressions.c			\
 	nir/nir_opcodes.c				\
 	nir/nir_opcodes.h				\
 	nir/nir_opt_algebraic.c
@@ -230,6 +232,10 @@ dist-hook:
 	$(RM) glcpp/tests/*.out
 	$(RM) glcpp/tests/subtest*/*.out
 
+nir/nir_constant_expressions.c: nir/nir_opcodes.py nir/nir_constant_expressions.py nir/nir_constant_expressions.h
+	$(MKDIR_P) nir &&						\
+	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_constant_expressions.py > $@ || ($(RM) $@; false)
+
 nir/nir_opcodes.h: nir/nir_opcodes.py nir/nir_opcodes_h.py
 	$(MKDIR_P) nir;							\
 	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_opcodes_h.py > $@
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 97c637ebdc3..face22ec680 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -11,6 +11,7 @@ LIBGLCPP_GENERATED_FILES = \
 	glcpp/glcpp-parse.c
 
 NIR_GENERATED_FILES = \
+	nir/nir_constant_expressions.c \
 	nir/nir_opcodes.c \
 	nir/nir_opcodes.h \
 	nir/nir_opt_algebraic.c
diff --git a/src/glsl/nir/.gitignore b/src/glsl/nir/.gitignore
index 4c28193612f..261f64f7188 100644
--- a/src/glsl/nir/.gitignore
+++ b/src/glsl/nir/.gitignore
@@ -1,3 +1,4 @@
 nir_opt_algebraic.c
 nir_opcodes.c
 nir_opcodes.h
+nir_constant_expressions.c
diff --git a/src/glsl/nir/nir_constant_expressions.h b/src/glsl/nir/nir_constant_expressions.h
new file mode 100644
index 00000000000..97997f2e514
--- /dev/null
+++ b/src/glsl/nir/nir_constant_expressions.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright © 2014 Connor Abbott
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Connor Abbott (cwabbott0@gmail.com)
+ *
+ */
+
+#include "nir.h"
+
+nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components,
+                                      nir_const_value *src);
diff --git a/src/glsl/nir/nir_constant_expressions.py b/src/glsl/nir/nir_constant_expressions.py
new file mode 100644
index 00000000000..22bc4f09583
--- /dev/null
+++ b/src/glsl/nir/nir_constant_expressions.py
@@ -0,0 +1,352 @@
+#! /usr/bin/python2
+template = """\
+/*
+ * Copyright (C) 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jason Ekstrand (jason@jlekstrand.net)
+ */
+
+#include <math.h>
+#include "main/core.h"
+#include "nir_constant_expressions.h"
+
+#if defined(_MSC_VER) && (_MSC_VER < 1800)
+static int isnormal(double x)
+{
+   return _fpclass(x) == _FPCLASS_NN || _fpclass(x) == _FPCLASS_PN;
+}
+#elif defined(__SUNPRO_CC)
+#include <ieeefp.h>
+static int isnormal(double x)
+{
+   return fpclass(x) == FP_NORMAL;
+}
+#endif
+
+#if defined(_MSC_VER)
+static double copysign(double x, double y)
+{
+   return _copysign(x, y);
+}
+#endif
+
+/**
+ * Evaluate one component of packSnorm4x8.
+ */
+static uint8_t
+pack_snorm_1x8(float x)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    packSnorm4x8
+     *    ------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *      packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
+     *
+     * We must first cast the float to an int, because casting a negative
+     * float to a uint is undefined.
+     */
+   return (uint8_t) (int8_t)
+          _mesa_round_to_even(CLAMP(x, -1.0f, +1.0f) * 127.0f);
+}
+
+/**
+ * Evaluate one component of packSnorm2x16.
+ */
+static uint16_t
+pack_snorm_1x16(float x)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    packSnorm2x16
+     *    -------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *      packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
+     *
+     * We must first cast the float to an int, because casting a negative
+     * float to a uint is undefined.
+     */
+   return (uint16_t) (int16_t)
+          _mesa_round_to_even(CLAMP(x, -1.0f, +1.0f) * 32767.0f);
+}
+
+/**
+ * Evaluate one component of unpackSnorm4x8.
+ */
+static float
+unpack_snorm_1x8(uint8_t u)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    unpackSnorm4x8
+     *    --------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
+     */
+   return CLAMP((int8_t) u / 127.0f, -1.0f, +1.0f);
+}
+
+/**
+ * Evaluate one component of unpackSnorm2x16.
+ */
+static float
+unpack_snorm_1x16(uint16_t u)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    unpackSnorm2x16
+     *    ---------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackSnorm2x16: clamp(f / 32767.0, -1, +1)
+     */
+   return CLAMP((int16_t) u / 32767.0f, -1.0f, +1.0f);
+}
+
+/**
+ * Evaluate one component packUnorm4x8.
+ */
+static uint8_t
+pack_unorm_1x8(float x)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    packUnorm4x8
+     *    ------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
+     */
+   return (uint8_t) _mesa_round_to_even(CLAMP(x, 0.0f, 1.0f) * 255.0f);
+}
+
+/**
+ * Evaluate one component packUnorm2x16.
+ */
+static uint16_t
+pack_unorm_1x16(float x)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    packUnorm2x16
+     *    -------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
+     */
+   return (uint16_t) _mesa_round_to_even(CLAMP(x, 0.0f, 1.0f) * 65535.0f);
+}
+
+/**
+ * Evaluate one component of unpackUnorm4x8.
+ */
+static float
+unpack_unorm_1x8(uint8_t u)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    unpackUnorm4x8
+     *    --------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackUnorm4x8: f / 255.0
+     */
+   return (float) u / 255.0f;
+}
+
+/**
+ * Evaluate one component of unpackUnorm2x16.
+ */
+static float
+unpack_unorm_1x16(uint16_t u)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    unpackUnorm2x16
+     *    ---------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackUnorm2x16: f / 65535.0
+     */
+   return (float) u / 65535.0f;
+}
+
+/**
+ * Evaluate one component of packHalf2x16.
+ */
+static uint16_t
+pack_half_1x16(float x)
+{
+   return _mesa_float_to_half(x);
+}
+
+/**
+ * Evaluate one component of unpackHalf2x16.
+ */
+static float
+unpack_half_1x16(uint16_t u)
+{
+   return _mesa_half_to_float(u);
+}
+
+/* Some typed vector structures to make things like src0.y work */
+% for type in ["float", "int", "unsigned", "bool"]:
+struct ${type}_vec {
+   ${type} x;
+   ${type} y;
+   ${type} z;
+   ${type} w;
+};
+% endfor
+
+% for name, op in sorted(opcodes.iteritems()):
+static nir_const_value
+evaluate_${name}(unsigned num_components, nir_const_value *_src)
+{
+   nir_const_value _dst_val = { { {0, 0, 0, 0} } };
+
+   ## For each non-per-component input, create a variable srcN that
+   ## contains x, y, z, and w elements which are filled in with the
+   ## appropriately-typed values.
+   % for j in range(op.num_inputs):
+      % if op.input_sizes[j] == 0:
+         <% continue %>
+      % elif "src" + str(j) not in op.const_expr:
+         ## Avoid unused variable warnings
+         <% continue %>
+      %endif
+
+      struct ${op.input_types[j]}_vec src${j} = {
+      % for k in range(op.input_sizes[j]):
+         % if op.input_types[j] == "bool":
+            _src[${j}].u[${k}] != 0,
+         % else:
+            _src[${j}].${op.input_types[j][:1]}[${k}],
+         % endif
+      % endfor
+      };
+   % endfor
+
+   % if op.output_size == 0:
+      ## For per-component instructions, we need to iterate over the
+      ## components and apply the constant expression one component
+      ## at a time.
+      for (unsigned _i = 0; _i < num_components; _i++) {
+         ## For each per-component input, create a variable srcN that
+         ## contains the value of the current (_i'th) component.
+         % for j in range(op.num_inputs):
+            % if op.input_sizes[j] != 0:
+               <% continue %>
+            % elif "src" + str(j) not in op.const_expr:
+               ## Avoid unused variable warnings
+               <% continue %>
+            % elif op.input_types[j] == "bool":
+               bool src${j} = _src[${j}].u[_i] != 0;
+            % else:
+               ${op.input_types[j]} src${j} = _src[${j}].${op.input_types[j][:1]}[_i];
+            % endif
+         % endfor
+
+         ## Create an appropriately-typed variable dst and assign the
+         ## result of the const_expr to it.  If const_expr already contains
+         ## writes to dst, just include const_expr directly.
+         % if "dst" in op.const_expr:
+            ${op.output_type} dst;
+            ${op.const_expr}
+         % else:
+            ${op.output_type} dst = ${op.const_expr};
+         % endif
+
+         ## Store the current component of the actual destination to the
+         ## value of dst.
+         % if op.output_type == "bool":
+            ## Sanitize the C value to a proper NIR bool
+            _dst_val.u[_i] = dst ? NIR_TRUE : NIR_FALSE;
+         % else:
+            _dst_val.${op.output_type[:1]}[_i] = dst;
+         % endif
+      }
+   % else:
+      ## In the non-per-component case, create a struct dst with
+      ## appropriately-typed elements x, y, z, and w and assign the result
+      ## of the const_expr to all components of dst, or include the
+      ## const_expr directly if it writes to dst already.
+      struct ${op.output_type}_vec dst;
+
+      % if "dst" in op.const_expr:
+         ${op.const_expr}
+      % else:
+         ## Splat the value to all components.  This way expressions which
+         ## write the same value to all components don't need to explicitly
+         ## write to dest.  One such example is fnoise which has a
+         ## const_expr of 0.0f.
+         dst.x = dst.y = dst.z = dst.w = ${op.const_expr};
+      % endif
+
+      ## For each component in the destination, copy the value of dst to
+      ## the actual destination.
+      % for k in range(op.output_size):
+         % if op.output_type == "bool":
+            ## Sanitize the C value to a proper NIR bool
+            _dst_val.u[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
+         % else:
+            _dst_val.${op.output_type[:1]}[${k}] = dst.${"xyzw"[k]};
+         % endif
+      % endfor
+   % endif
+
+   return _dst_val;
+}
+% endfor
+
+nir_const_value
+nir_eval_const_opcode(nir_op op, unsigned num_components,
+                      nir_const_value *src)
+{
+   switch (op) {
+% for name in sorted(opcodes.iterkeys()):
+   case nir_op_${name}: {
+      return evaluate_${name}(num_components, src);
+      break;
+   }
+% endfor
+   default:
+      unreachable("shouldn't get here");
+   }
+}"""
+
+from nir_opcodes import opcodes
+from mako.template import Template
+
+print Template(template).render(opcodes=opcodes)
diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index 5bafbb0229e..5fe957296eb 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -24,6 +24,7 @@
 # Authors:
 #    Connor Abbott (cwabbott0@gmail.com)
 
+
 # Class that represents all the information we have about the opcode
 # NOTE: this must be kept in sync with nir_op_info
 
@@ -32,7 +33,7 @@ class Opcode(object):
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
-                input_types, algebraic_properties):
+                input_types, algebraic_properties, const_expr):
       """Parameters:
 
       - name is the name of the opcode (prepend nir_op_ for the enum name)
@@ -40,6 +41,27 @@ class Opcode(object):
       - input_types is a list of types
       - algebraic_properties is a space-seperated string, where nir_op_is_ is
         prepended before each entry
+      - const_expr is an expression or series of statements that computes the
+        constant value of the opcode given the constant values of its inputs.
+
+      Constant expressions are formed from the variables src0, src1, ...,
+      src(N-1), where N is the number of arguments.  The output of the
+      expression should be stored in the dst variable.  Per-component input
+      and output variables will be scalars and non-per-component input and
+      output variables will be a struct with fields named x, y, z, and w
+      all of the correct type.  Input and output variables can be assumed
+      to already be of the correct type and need no conversion.  In
+      particular, the conversion from the C bool type to/from  NIR_TRUE and
+      NIR_FALSE happens automatically.
+
+      For per-component instructions, the entire expression will be
+      executed once for each component.  For non-per-component
+      instructions, the expression is expected to store the correct values
+      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
+      constant expression, an assignment to dst will happen automatically
+      and the result will be equivalent to "dst = <expression>" for
+      per-component instructions and "dst.x = dst.y = ... = <expression>"
+      for non-per-component instructions.
       """
       assert isinstance(name, str)
       assert isinstance(output_size, int)
@@ -49,6 +71,7 @@ class Opcode(object):
       assert isinstance(input_types, list)
       assert isinstance(input_types[0], str)
       assert isinstance(algebraic_properties, str)
+      assert isinstance(const_expr, str)
       assert len(input_sizes) == len(input_types)
       assert 0 <= output_size <= 4
       for size in input_sizes:
@@ -62,6 +85,7 @@ class Opcode(object):
       self.input_sizes = input_sizes
       self.input_types = input_types
       self.algebraic_properties = algebraic_properties
+      self.const_expr = const_expr
 
 # helper variables for strings
 tfloat = "float"
@@ -76,178 +100,289 @@ associative = "associative "
 opcodes = {}
 
 def opcode(name, output_size, output_type, input_sizes, input_types,
-           algebraic_properties):
+           algebraic_properties, const_expr):
    assert name not in opcodes
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
-                          input_types, algebraic_properties)
-
-def unop_convert(name, in_type, out_type):
-   opcode(name, 0, out_type, [0], [in_type], "")
-
-def unop(name, ty):
-   opcode(name, 0, ty, [0], [ty], "")
-
-def unop_horiz(name, output_size, output_type, input_size, input_type):
-   opcode(name, output_size, output_type, [input_size], [input_type], "")
-
-def unop_reduce(name, output_size, output_type, input_type):
-   unop_horiz(name + "2", output_size, output_type, 2, input_type)
-   unop_horiz(name + "3", output_size, output_type, 3, input_type)
-   unop_horiz(name + "4", output_size, output_type, 4, input_type)
+                          input_types, algebraic_properties, const_expr)
+
+def unop_convert(name, in_type, out_type, const_expr):
+   opcode(name, 0, out_type, [0], [in_type], "", const_expr)
+
+def unop(name, ty, const_expr):
+   opcode(name, 0, ty, [0], [ty], "", const_expr)
+
+def unop_horiz(name, output_size, output_type, input_size, input_type,
+               const_expr):
+   opcode(name, output_size, output_type, [input_size], [input_type], "",
+          const_expr)
+
+def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
+                reduce_expr, final_expr):
+   def prereduce(src):
+      return "(" + prereduce_expr.format(src=src) + ")"
+   def final(src):
+      return final_expr.format(src="(" + src + ")")
+   def reduce_(src0, src1):
+      return reduce_expr.format(src0=src0, src1=src1)
+   src0 = prereduce("src0.x")
+   src1 = prereduce("src0.y")
+   src2 = prereduce("src0.z")
+   src3 = prereduce("src0.w")
+   unop_horiz(name + "2", output_size, output_type, 2, input_type,
+              final(reduce_(src0, src1)))
+   unop_horiz(name + "3", output_size, output_type, 3, input_type,
+              final(reduce_(reduce_(src0, src1), src2)))
+   unop_horiz(name + "4", output_size, output_type, 4, input_type,
+              final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 
 
 # These two move instructions differ in what modifiers they support and what
 # the negate modifier means. Otherwise, they are identical.
-unop("fmov", tfloat)
-unop("imov", tint)
-
-unop("ineg", tint)
-unop("fneg", tfloat)
-unop("inot", tint) # invert every bit of the integer
-unop("fnot", tfloat) # (src == 0.0) ? 1.0 : 0.0
-unop("fsign", tfloat)
-unop("isign", tint)
-unop("iabs", tint)
-unop("fabs", tfloat)
-unop("fsat", tfloat)
-unop("frcp", tfloat)
-unop("frsq", tfloat)
-unop("fsqrt", tfloat)
-unop("fexp", tfloat) # < e^x
-unop("flog", tfloat) # log base e
-unop("fexp2", tfloat)
-unop("flog2", tfloat)
-unop_convert("f2i", tfloat, tint) # Float-to-integer conversion.
-unop_convert("f2u", tfloat, tunsigned) # Float-to-unsigned conversion
-unop_convert("i2f", tint, tfloat) # Integer-to-float conversion.
-unop_convert("f2b", tfloat, tbool) # Float-to-boolean conversion
-unop_convert("b2f", tbool, tfloat) # Boolean-to-float conversion
-unop_convert("i2b", tint, tbool) # int-to-boolean conversion
-unop_convert("b2i", tbool, tint) # Boolean-to-int conversion
-unop_convert("u2f", tunsigned, tfloat) #Unsigned-to-float conversion.
-
-unop_reduce("bany", 1, tbool, tbool) # returns ~0 if any component of src[0] != 0
-unop_reduce("ball", 1, tbool, tbool) # returns ~0 if all components of src[0] != 0
-unop_reduce("fany", 1, tfloat, tfloat) # returns 1.0 if any component of src[0] != 0
-unop_reduce("fall", 1, tfloat, tfloat) # returns 1.0 if all components of src[0] != 0
+unop("fmov", tfloat, "src0")
+unop("imov", tint, "src0")
+
+unop("ineg", tint, "-src0")
+unop("fneg", tfloat, "-src0")
+unop("inot", tint, "~src0") # invert every bit of the integer
+unop("fnot", tfloat, "(src0 == 0.0f) ? 1.0f : 0.0f")
+unop("fsign", tfloat, "(src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f)")
+unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
+unop("iabs", tint, "abs(src0)")
+unop("fabs", tfloat, "fabsf(src0)")
+unop("fsat", tfloat, "(src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0)")
+unop("frcp", tfloat, "1.0f / src0")
+unop("frsq", tfloat, "1.0f / sqrtf(src0)")
+unop("fsqrt", tfloat, "sqrtf(src0)")
+unop("fexp", tfloat, "expf(src0)") # < e^x
+unop("flog", tfloat, "logf(src0)") # log base e
+unop("fexp2", tfloat, "exp2f(src0)")
+unop("flog2", tfloat, "log2f(src0)")
+unop_convert("f2i", tfloat, tint, "src0") # Float-to-integer conversion.
+unop_convert("f2u", tfloat, tunsigned, "src0") # Float-to-unsigned conversion
+unop_convert("i2f", tint, tfloat, "src0") # Integer-to-float conversion.
+# Float-to-boolean conversion: true iff the input is non-zero
+unop_convert("f2b", tfloat, tbool, "src0 != 0.0f")
+# Boolean-to-float conversion
+unop_convert("b2f", tbool, tfloat, "src0 ? 1.0f : 0.0f")
+# Int-to-boolean conversion: true iff the input is non-zero
+unop_convert("i2b", tint, tbool, "src0 != 0")
+unop_convert("b2i", tbool, tint, "src0 ? 1 : 0") # Boolean-to-int conversion
+unop_convert("u2f", tunsigned, tfloat, "src0") #Unsigned-to-float conversion.
+
+unop_reduce("bany", 1, tbool, tbool, "{src}", "{src0} || {src1}", "{src}")
+unop_reduce("ball", 1, tbool, tbool, "{src}", "{src0} && {src1}", "{src}")
+unop_reduce("fany", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} || {src1}",
+            "{src} ? 1.0f : 0.0f")
+unop_reduce("fall", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} && {src1}",
+            "{src} ? 1.0f : 0.0f")
 
 # Unary floating-point rounding operations.
 
 
-unop("ftrunc", tfloat)
-unop("fceil", tfloat)
-unop("ffloor", tfloat)
-unop("ffract", tfloat)
-unop("fround_even", tfloat)
+unop("ftrunc", tfloat, "truncf(src0)")
+unop("fceil", tfloat, "ceilf(src0)")
+unop("ffloor", tfloat, "floorf(src0)")
+unop("ffract", tfloat, "src0 - floorf(src0)")
+unop("fround_even", tfloat, "_mesa_round_to_even(src0)")
 
 
 # Trigonometric operations.
 
 
-unop("fsin", tfloat)
-unop("fcos", tfloat)
-unop("fsin_reduced", tfloat)
-unop("fcos_reduced", tfloat)
+unop("fsin", tfloat, "sinf(src0)")
+unop("fcos", tfloat, "cosf(src0)")
+unop("fsin_reduced", tfloat, "sinf(src0)")
+unop("fcos_reduced", tfloat, "cosf(src0)")
 
 
 # Partial derivatives.
 
 
-unop("fddx", tfloat)
-unop("fddy", tfloat)
-unop("fddx_fine", tfloat)
-unop("fddy_fine", tfloat)
-unop("fddx_coarse", tfloat)
-unop("fddy_coarse", tfloat)
+unop("fddx", tfloat, "0.0f") # the derivative of a constant is 0.
+unop("fddy", tfloat, "0.0f")
+unop("fddx_fine", tfloat, "0.0f")
+unop("fddy_fine", tfloat, "0.0f")
+unop("fddx_coarse", tfloat, "0.0f")
+unop("fddy_coarse", tfloat, "0.0f")
 
 
 # Floating point pack and unpack operations.
 
-
-unop_horiz("pack_snorm_2x16", 1, tunsigned, 2, tfloat)
-unop_horiz("pack_snorm_4x8", 1, tunsigned, 4, tfloat)
-unop_horiz("pack_unorm_2x16", 1, tunsigned, 2, tfloat)
-unop_horiz("pack_unorm_4x8", 1, tunsigned, 4, tfloat)
-unop_horiz("pack_half_2x16", 1, tunsigned, 2, tfloat)
-unop_horiz("unpack_snorm_2x16", 2, tfloat, 1, tunsigned)
-unop_horiz("unpack_snorm_4x8", 4, tfloat, 1, tunsigned)
-unop_horiz("unpack_unorm_2x16", 2, tfloat, 1, tunsigned)
-unop_horiz("unpack_unorm_4x8", 4, tfloat, 1, tunsigned)
-unop_horiz("unpack_half_2x16", 2, tfloat, 1, tunsigned)
+def pack_2x16(fmt):
+   unop_horiz("pack_" + fmt + "_2x16", 1, tunsigned, 2, tfloat, """
+dst.x = (uint32_t) pack_fmt_1x16(src0.x);
+dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
+""".replace("fmt", fmt))
+
+def pack_4x8(fmt):
+   unop_horiz("pack_" + fmt + "_4x8", 1, tunsigned, 4, tfloat, """
+dst.x = (uint32_t) pack_fmt_1x8(src0.x);
+dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
+dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
+dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
+""".replace("fmt", fmt))
+
+def unpack_2x16(fmt):
+   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tunsigned, """
+dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
+dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
+""".replace("fmt", fmt))
+
+def unpack_4x8(fmt):
+   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tunsigned, """
+dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
+dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
+dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
+dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
+""".replace("fmt", fmt))
+
+
+pack_2x16("snorm")
+pack_4x8("snorm")
+pack_2x16("unorm")
+pack_4x8("unorm")
+pack_2x16("half")
+unpack_2x16("snorm")
+unpack_4x8("snorm")
+unpack_2x16("unorm")
+unpack_4x8("unorm")
+unpack_2x16("half")
 
 
 # Lowered floating point unpacking operations.
 
 
-unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tunsigned)
-unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tunsigned)
+unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tunsigned, """
+dst.x = unpack_half_1x16((uint16_t)(src0.x & 0xffff));
+""")
+unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tunsigned, """
+dst.x = unpack_half_1x16((uint16_t)(src0.x >> 16));
+""")
 
 
 # Bit operations, part of ARB_gpu_shader5.
 
 
-unop("bitfield_reverse", tunsigned)
-unop("bit_count", tunsigned)
-unop_convert("ufind_msb", tunsigned, tint)
-unop("ifind_msb", tint)
-unop("find_lsb", tint)
+unop("bitfield_reverse", tunsigned, """
+/* we're not winning any awards for speed here, but that's ok */
+dst = 0;
+for (unsigned bit = 0; bit < 32; bit++)
+   dst |= ((src0 >> bit) & 1) << (31 - bit);
+""")
+unop("bit_count", tunsigned, """
+dst = 0;
+for (unsigned bit = 0; bit < 32; bit++) {
+   if ((src0 >> bit) & 1)
+      dst++;
+}
+""")
+
+unop_convert("ufind_msb", tunsigned, tint, """
+dst = -1;
+for (int bit = 31; bit >= 0; bit--) {
+   if ((src0 >> bit) & 1) {
+      dst = bit;
+      break;
+   }
+}
+""")
+
+unop("ifind_msb", tint, """
+dst = -1;
+for (int bit = 31; bit >= 0; bit--) {
+   /* If src0 < 0, we're looking for the first 0 bit.
+    * if src0 >= 0, we're looking for the first 1 bit.
+    */
+   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
+      (!((src0 >> bit) & 1) && (src0 < 0))) {
+      dst = bit;
+      break;
+   }
+}
+""")
+
+unop("find_lsb", tint, """
+dst = -1;
+for (unsigned bit = 0; bit < 32; bit++) {
+   if ((src0 >> bit) & 1) {
+      dst = bit;
+      break;
+   }
+}
+""")
 
 
 for i in xrange(1, 5):
    for j in xrange(1, 5):
-      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat)
+      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 
-def binop_convert(name, out_type, in_type, alg_props):
-   opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props)
+def binop_convert(name, out_type, in_type, alg_props, const_expr):
+   opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 
-def binop(name, ty, alg_props):
-   binop_convert(name, ty, ty, alg_props)
+def binop(name, ty, alg_props, const_expr):
+   binop_convert(name, ty, ty, alg_props, const_expr)
 
-def binop_compare(name, ty, alg_props):
-   binop_convert(name, ty, tbool, alg_props)
+def binop_compare(name, ty, alg_props, const_expr):
+   binop_convert(name, tbool, ty, alg_props, const_expr)
 
 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
-                src2_type):
-   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type], "")
-
-def binop_reduce(name, output_size, output_type, src_type):
-   opcode(name + "2",output_size, output_type,
-          [2, 2], [src_type, src_type], commutative)
+                src2_type, const_expr):
+   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
+          "", const_expr)
+
+def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
+                 reduce_expr, final_expr):
+   def final(src):
+      return final_expr.format(src= "(" + src + ")")
+   def reduce_(src0, src1):
+      return reduce_expr.format(src0=src0, src1=src1)
+   def prereduce(src0, src1):
+      return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
+   src0 = prereduce("src0.x", "src1.x")
+   src1 = prereduce("src0.y", "src1.y")
+   src2 = prereduce("src0.z", "src1.z")
+   src3 = prereduce("src0.w", "src1.w")
+   opcode(name + "2", output_size, output_type,
+          [2, 2], [src_type, src_type], commutative,
+          final(reduce_(src0, src1)))
    opcode(name + "3", output_size, output_type,
-          [3, 3], [src_type, src_type], commutative)
+          [3, 3], [src_type, src_type], commutative,
+          final(reduce_(reduce_(src0, src1), src2)))
    opcode(name + "4", output_size, output_type,
-          [4, 4], [src_type, src_type], commutative)
+          [4, 4], [src_type, src_type], commutative,
+          final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 
-binop("fadd", tfloat, commutative + associative)
-binop("iadd", tint, commutative + associative)
-binop("fsub", tfloat, "")
-binop("isub", tint, "")
+binop("fadd", tfloat, commutative + associative, "src0 + src1")
+binop("iadd", tint, commutative + associative, "src0 + src1")
+binop("fsub", tfloat, "", "src0 - src1")
+binop("isub", tint, "", "src0 - src1")
 
-binop("fmul", tfloat, commutative + associative)
+binop("fmul", tfloat, commutative + associative, "src0 * src1")
 # low 32-bits of signed/unsigned integer multiply
-binop("imul", tint, commutative + associative)
+binop("imul", tint, commutative + associative, "src0 * src1")
 # high 32-bits of signed integer multiply
-binop("imul_high", tint, commutative)
+binop("imul_high", tint, commutative,
+      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
 # high 32-bits of unsigned integer multiply
-binop("umul_high", tunsigned, commutative)
+binop("umul_high", tunsigned, commutative,
+      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")
 
-binop("fdiv", tfloat, "")
-binop("idiv", tint, "")
-binop("udiv", tunsigned, "")
+binop("fdiv", tfloat, "", "src0 / src1")
+binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
+binop("udiv", tunsigned, "", "src1 == 0 ? 0 : (src0 / src1)")
 
 # returns a boolean representing the carry resulting from the addition of
 # the two unsigned arguments.
 
-binop_convert("uadd_carry", tbool, tunsigned,
-              commutative)
+binop_convert("uadd_carry", tbool, tunsigned, commutative, "src0 + src1 < src0")
 
 # returns a boolean representing the borrow resulting from the subtraction
 # of the two unsigned arguments.
 
-binop_convert("usub_borrow", tbool, tunsigned, "")
+binop_convert("usub_borrow", tbool, tunsigned, "", "src0 < src1")
 
-binop("fmod", tfloat, "")
-binop("umod", tunsigned, "")
+binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
+binop("umod", tunsigned, "", "src1 == 0 ? 0 : src0 % src1")
 
 #
 # Comparisons
@@ -256,41 +391,47 @@ binop("umod", tunsigned, "")
 
 # these integer-aware comparisons return a boolean (0 or ~0)
 
-binop_compare("flt", tfloat, "")
-binop_compare("fge", tfloat, "")
-binop_compare("feq", tfloat, commutative)
-binop_compare("fne", tfloat, commutative)
-binop_compare("ilt", tint, "")
-binop_compare("ige", tint, "")
-binop_compare("ieq", tint, commutative)
-binop_compare("ine", tint, commutative)
-binop_compare("ult", tunsigned, "")
-binop_compare("uge", tunsigned, "")
+binop_compare("flt", tfloat, "", "src0 < src1")
+binop_compare("fge", tfloat, "", "src0 >= src1")
+binop_compare("feq", tfloat, commutative, "src0 == src1")
+binop_compare("fne", tfloat, commutative, "src0 != src1")
+binop_compare("ilt", tint, "", "src0 < src1")
+binop_compare("ige", tint, "", "src0 >= src1")
+binop_compare("ieq", tint, commutative, "src0 == src1")
+binop_compare("ine", tint, commutative, "src0 != src1")
+binop_compare("ult", tunsigned, "", "src0 < src1")
+binop_compare("uge", tunsigned, "", "src0 >= src1")
 
 # integer-aware GLSL-style comparisons that compare floats and ints
 
-binop_reduce("ball_fequal",  1, tbool, tfloat)
-binop_reduce("bany_fnequal", 1, tbool, tfloat)
-binop_reduce("ball_iequal",  1, tbool, tint)
-binop_reduce("bany_inequal", 1, tbool, tint)
+binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
+             "{src0} && {src1}", "{src}")
+binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
+             "{src0} || {src1}", "{src}")
+binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
+             "{src0} && {src1}", "{src}")
+binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
+             "{src0} || {src1}", "{src}")
 
 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 
-binop_reduce("fall_equal",  1, tfloat, tfloat)
-binop_reduce("fany_nequal", 1, tfloat, tfloat)
+binop_reduce("fall_equal",  1, tfloat, tfloat, "{src0} == {src1}",
+             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
+binop_reduce("fany_nequal", 1, tfloat, tfloat, "{src0} != {src1}",
+             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 
 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 # and false respectively
 
-binop("slt", tfloat, "") # Set on Less Than
-binop("sge", tfloat, "") # Set on Greater Than or Equal
-binop("seq", tfloat, commutative) # Set on Equal
-binop("sne", tfloat, commutative) # Set on Not Equal
+binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
+binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
+binop("seq", tfloat, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
+binop("sne", tfloat, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 
 
-binop("ishl", tint, "")
-binop("ishr", tint, "")
-binop("ushr", tunsigned, "")
+binop("ishl", tint, "", "src0 << src1")
+binop("ishr", tint, "", "src0 >> src1")
+binop("ushr", tunsigned, "", "src0 >> src1")
 
 # bitwise logic operators
 #
@@ -298,9 +439,9 @@ binop("ushr", tunsigned, "")
 # integers.
 
 
-binop("iand", tunsigned, commutative + associative)
-binop("ior", tunsigned, commutative + associative)
-binop("ixor", tunsigned, commutative + associative)
+binop("iand", tunsigned, commutative + associative, "src0 & src1")
+binop("ior", tunsigned, commutative + associative, "src0 | src1")
+binop("ixor", tunsigned, commutative + associative, "src0 ^ src1")
 
 
 # floating point logic operators
@@ -308,42 +449,60 @@ binop("ixor", tunsigned, commutative + associative)
 # These use (src != 0.0) for testing the truth of the input, and output 1.0
 # for true and 0.0 for false
 
-binop("fand", tfloat, commutative)
-binop("for", tfloat, commutative)
-binop("fxor", tfloat, commutative)
-
-binop_reduce("fdot", 1, tfloat, tfloat)
-
-binop("fmin", tfloat, "")
-binop("imin", tint, commutative + associative)
-binop("umin", tunsigned, commutative + associative)
-binop("fmax", tfloat, "")
-binop("imax", tint, commutative + associative)
-binop("umax", tunsigned, commutative + associative)
-
-binop("fpow", tfloat, "")
-
-binop_horiz("pack_half_2x16_split", 1, tunsigned, 1, tfloat, 1, tfloat)
-
-binop("bfm", tunsigned, "")
-
-binop("ldexp", tunsigned, "")
+binop("fand", tfloat, commutative,
+      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
+binop("for", tfloat, commutative,
+      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
+binop("fxor", tfloat, commutative,
+      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
+
+binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
+             "{src}")
+
+binop("fmin", tfloat, "", "fminf(src0, src1)")
+binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
+binop("umin", tunsigned, commutative + associative, "src1 > src0 ? src0 : src1")
+binop("fmax", tfloat, "", "fmaxf(src0, src1)")
+binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
+binop("umax", tunsigned, commutative + associative, "src1 > src0 ? src1 : src0")
+
+binop("fpow", tfloat, "", "powf(src0, src1)")
+
+binop_horiz("pack_half_2x16_split", 1, tunsigned, 1, tfloat, 1, tfloat,
+            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
+
+binop_convert("bfm", tunsigned, tint, "", """
+int bits = src0, offset = src1; /* operand order is (bits, offset) */
+if (offset < 0 || bits < 0 || offset + bits > 32)
+   dst = 0; /* undefined per the spec */
+else
+   dst = (uint32_t)(((1ull << bits) - 1) << offset); /* 64-bit: shifts may be 32 */
+""")
+
+opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
+dst = ldexpf(src0, src1); /* float result: the output type must be tfloat */
+/* flush denormals to zero. */
+if (!isnormal(dst))
+   dst = copysignf(0.0f, src0);
+""")
 
 # Combines the first component of each input to make a 2-component vector.
 
-binop_horiz("vec2", 2, tunsigned, 1, tunsigned, 1, tunsigned)
+binop_horiz("vec2", 2, tunsigned, 1, tunsigned, 1, tunsigned, """
+dst.x = src0.x;
+dst.y = src1.x;
+""")
 
-def triop(name, ty):
-   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "")
-def triop_horiz(name, output_size, src1_size, src2_size, src3_size):
+def triop(name, ty, const_expr):
+   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
+def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    opcode(name, output_size, tunsigned,
    [src1_size, src2_size, src3_size],
-   [tunsigned, tunsigned, tunsigned], "")
+   [tunsigned, tunsigned, tunsigned], "", const_expr)
 
-# fma(a, b, c) = (a# b) + c
-triop("ffma", tfloat)
+triop("ffma", tfloat, "src0 * src1 + src2")
 
-triop("flrp", tfloat)
+triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")
 
 # Conditional Select
 #
@@ -352,30 +511,83 @@ triop("flrp", tfloat)
 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 
 
-triop("fcsel", tfloat)
+triop("fcsel", tfloat, "(src0 != 0.0f) ? src1 : src2")
 opcode("bcsel", 0, tunsigned, [0, 0, 0],
-       [tbool, tunsigned, tunsigned], "")
-
-triop("bfi", tunsigned)
-
-triop("ubitfield_extract", tunsigned)
-opcode("ibitfield_extract", 0, tint, [0, 0, 0],
-       [tint, tunsigned, tunsigned], "")
+      [tbool, tunsigned, tunsigned], "", "src0 ? src1 : src2")
+
+triop("bfi", tunsigned, """
+unsigned mask = src0, insert = src1, base = src2;
+if (mask == 0) {
+   dst = base;
+} else {
+   unsigned tmp = mask; /* shift insert up to the lowest set bit of mask */
+   while (!(tmp & 1)) {
+      tmp >>= 1;
+      insert <<= 1;
+   }
+   dst = (base & ~mask) | (insert & mask); /* mask only after aligning */
+}
+""")
+
+opcode("ubitfield_extract", 0, tunsigned,
+       [0, 1, 1], [tunsigned, tint, tint], "", """
+unsigned base = src0;
+int offset = src1.x, bits = src2.x;
+if (bits == 0) {
+   dst = 0;
+} else if (bits < 0 || offset < 0 || offset + bits > 32) {
+   dst = 0; /* undefined per the spec */
+} else {
+   dst = (base >> offset) & ((1ull << bits) - 1); /* 1ull: bits may be 32 */
+}
+""")
+opcode("ibitfield_extract", 0, tint,
+       [0, 1, 1], [tint, tint, tint], "", """
+int base = src0;
+int offset = src1.x, bits = src2.x;
+if (bits == 0) {
+   dst = 0;
+} else if (offset < 0 || bits < 0 || offset + bits > 32) {
+   dst = 0; /* undefined per the spec */
+} else {
+   dst = (int32_t)((uint32_t)base << (32 - offset - bits)) >> (32 - bits); /* sign-extending shift */
+}
+""")
 
 # Combines the first component of each input to make a 3-component vector.
 
-triop_horiz("vec3", 3, 1, 1, 1)
+triop_horiz("vec3", 3, 1, 1, 1, """
+dst.x = src0.x;
+dst.y = src1.x;
+dst.z = src2.x;
+""")
 
-def quadop(name):
-   opcode(name, 0, tunsigned, [0, 0, 0, 0],
-          [tunsigned, tunsigned, tunsigned, tunsigned],
-          "")
-def quadop_horiz(name, output_size, src1_size, src2_size, src3_size, src4_size):
+def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
+                 src4_size, const_expr):
    opcode(name, output_size, tunsigned,
           [src1_size, src2_size, src3_size, src4_size],
           [tunsigned, tunsigned, tunsigned, tunsigned],
-          "")
+          "", const_expr)
+
+opcode("bitfield_insert", 0, tunsigned, [0, 0, 1, 1],
+       [tunsigned, tunsigned, tint, tint], "", """
+unsigned base = src0, insert = src1;
+int offset = src2.x, bits = src3.x;
+if (bits == 0) {
+   dst = base; /* inserting zero bits leaves base unchanged */
+} else if (offset < 0 || bits < 0 || bits + offset > 32) {
+   dst = 0; /* undefined per the spec */
+} else {
+   unsigned mask = ((1ull << bits) - 1) << offset; /* 1ull: bits may be 32 */
+   dst = (base & ~mask) | ((insert << offset) & mask);
+}
+""")
+
+quadop_horiz("vec4", 4, 1, 1, 1, 1, """
+dst.x = src0.x;
+dst.y = src1.x;
+dst.z = src2.x;
+dst.w = src3.x;
+""")
 
-quadop("bitfield_insert")
 
-quadop_horiz("vec4", 4, 1, 1, 1, 1)
-- 
2.30.2