const struct glsl_type *last_interface_type;
struct nir_variable_data last_var_data;
+ /* For skipping equal ALU headers (typical after scalarization). */
+ nir_instr_type last_instr_type;
+ uintptr_t last_alu_header_offset;
+
/* Don't write optional data such as variable names. */
bool strip;
} write_ctx;
return 0;
}
+#define NUM_COMPONENTS_IS_SEPARATE_7 7
+
static uint8_t
encode_num_components_in_3bits(uint8_t num_components)
{
if (num_components == 16)
return 6;
- unreachable("invalid number in num_components");
- return 0;
+ /* special value indicating that num_components is in the next uint32 */
+ return NUM_COMPONENTS_IS_SEPARATE_7;
}
static uint8_t
struct {
unsigned has_name:1;
unsigned has_constant_initializer:1;
+ unsigned has_pointer_initializer:1;
unsigned has_interface_type:1;
unsigned num_state_slots:7;
unsigned data_encoding:2;
unsigned type_same_as_last:1;
unsigned interface_type_same_as_last:1;
- unsigned _pad:2;
+ unsigned _pad:1;
unsigned num_members:16;
} u;
};
write_add_object(ctx, var);
assert(var->num_state_slots < (1 << 7));
- assert(var->num_members < (1 << 16));
STATIC_ASSERT(sizeof(union packed_var) == 4);
union packed_var flags;
flags.u.has_name = !ctx->strip && var->name;
flags.u.has_constant_initializer = !!(var->constant_initializer);
+ flags.u.has_pointer_initializer = !!(var->pointer_initializer);
flags.u.has_interface_type = !!(var->interface_type);
flags.u.type_same_as_last = var->type == ctx->last_type;
flags.u.interface_type_same_as_last =
}
if (var->constant_initializer)
write_constant(ctx, var->constant_initializer);
+ if (var->pointer_initializer)
+ write_lookup_object(ctx, var->pointer_initializer);
if (var->num_members > 0) {
blob_write_bytes(ctx->blob, (uint8_t *) var->members,
var->num_members * sizeof(*var->members));
var->constant_initializer = read_constant(ctx, var);
else
var->constant_initializer = NULL;
+
+ if (flags.u.has_pointer_initializer)
+ var->pointer_initializer = read_object(ctx);
+ else
+ var->pointer_initializer = NULL;
+
var->num_members = flags.u.num_members;
if (var->num_members > 0) {
var->members = ralloc_array(var, struct nir_variable_data,
};
enum intrinsic_const_indices_encoding {
- /* Use the 6 bits of packed_const_indices to store 1-6 indices.
- * 1 6-bit index, or 2 3-bit indices, or 3 2-bit indices, or
- * 4-6 1-bit indices.
+ /* Use the 9 bits of packed_const_indices to store 1-9 indices.
+ * 1 9-bit index, or 2 4-bit indices, or 3 3-bit indices, or
+ * 4 2-bit indices, or 5-9 1-bit indices.
*
* The common case for load_ubo is 0, 0, 0, which is trivially represented.
* The common cases for load_interpolated_input also fit here, e.g.: 7, 3
*/
- const_indices_6bit_all_combined,
+ const_indices_9bit_all_combined,
const_indices_8bit, /* 8 bits per element */
const_indices_16bit, /* 16 bits per element */
unsigned no_signed_wrap:1;
unsigned no_unsigned_wrap:1;
unsigned saturate:1;
- unsigned writemask:4;
+ /* Reg: writemask; SSA: swizzles for 2 srcs */
+ unsigned writemask_or_two_swizzles:4;
unsigned op:9;
unsigned packed_src_ssa_16bit:1;
- unsigned _pad:2;
+ /* Scalarized ALUs always have the same header. */
+ unsigned num_followup_alu_sharing_header:2;
unsigned dest:8;
} alu;
struct {
unsigned deref_type:3;
unsigned cast_type_same_as_last:1;
unsigned mode:10; /* deref_var redefines this */
- unsigned _pad:6; /* deref_var redefines this */
+ unsigned packed_src_ssa_16bit:1; /* deref_var redefines this */
+ unsigned _pad:5; /* deref_var redefines this */
unsigned dest:8;
} deref;
struct {
struct {
unsigned instr_type:4;
unsigned intrinsic:9;
- unsigned num_components:3;
unsigned const_indices_encoding:2;
- unsigned packed_const_indices:6;
+ unsigned packed_const_indices:9;
unsigned dest:8;
} intrinsic;
struct {
unsigned instr_type:4;
unsigned num_srcs:4;
unsigned op:4;
- unsigned texture_array_size:12;
unsigned dest:8;
+ unsigned _pad:12;
} tex;
struct {
unsigned instr_type:4;
/* Write "lo24" as low 24 bits in the first uint32. */
static void
-write_dest(write_ctx *ctx, const nir_dest *dst, union packed_instr header)
+write_dest(write_ctx *ctx, const nir_dest *dst, union packed_instr header,
+ nir_instr_type instr_type)
{
STATIC_ASSERT(sizeof(union packed_dest) == 1);
union packed_dest dest;
} else {
dest.reg.is_indirect = !!(dst->reg.indirect);
}
-
header.any.dest = dest.u8;
- blob_write_uint32(ctx->blob, header.u32);
+
+ /* Check if the current ALU instruction has the same header as the previous
+ * instruction that is also ALU. If it is, we don't have to write
+   * the current header. This is a typical occurrence after scalarization.
+ */
+ if (instr_type == nir_instr_type_alu) {
+ bool equal_header = false;
+
+ if (ctx->last_instr_type == nir_instr_type_alu) {
+ assert(ctx->last_alu_header_offset);
+ union packed_instr *last_header =
+ (union packed_instr *)(ctx->blob->data +
+ ctx->last_alu_header_offset);
+
+ /* Clear the field that counts ALUs with equal headers. */
+ union packed_instr clean_header;
+ clean_header.u32 = last_header->u32;
+ clean_header.alu.num_followup_alu_sharing_header = 0;
+
+ /* There can be at most 4 consecutive ALU instructions
+ * sharing the same header.
+ */
+ if (last_header->alu.num_followup_alu_sharing_header < 3 &&
+ header.u32 == clean_header.u32) {
+ last_header->alu.num_followup_alu_sharing_header++;
+ equal_header = true;
+ }
+ }
+
+ if (!equal_header) {
+ ctx->last_alu_header_offset = ctx->blob->size;
+ blob_write_uint32(ctx->blob, header.u32);
+ }
+ } else {
+ blob_write_uint32(ctx->blob, header.u32);
+ }
+
+ if (dest.ssa.is_ssa &&
+ dest.ssa.num_components == NUM_COMPONENTS_IS_SEPARATE_7)
+ blob_write_uint32(ctx->blob, dst->ssa.num_components);
if (dst->is_ssa) {
write_add_object(ctx, &dst->ssa);
if (dest.ssa.is_ssa) {
unsigned bit_size = decode_bit_size_3bits(dest.ssa.bit_size);
- unsigned num_components =
- decode_num_components_in_3bits(dest.ssa.num_components);
+ unsigned num_components;
+ if (dest.ssa.num_components == NUM_COMPONENTS_IS_SEPARATE_7)
+ num_components = blob_read_uint32(ctx->blob);
+ else
+ num_components = decode_num_components_in_3bits(dest.ssa.num_components);
char *name = dest.ssa.has_name ? blob_read_string(ctx->blob) : NULL;
nir_ssa_dest_init(instr, dst, num_components, bit_size, name);
read_add_object(ctx, &dst->ssa);
unsigned src_components = nir_ssa_alu_instr_src_components(alu, i);
for (unsigned chan = 0; chan < src_components; chan++) {
+ /* The swizzles for src0.x and src1.x are stored
+ * in writemask_or_two_swizzles for SSA ALUs.
+ */
+ if (alu->dest.dest.is_ssa && i < 2 && chan == 0 &&
+ alu->src[i].swizzle[chan] < 4)
+ continue;
+
if (alu->src[i].swizzle[chan] != chan)
return false;
}
write_alu(write_ctx *ctx, const nir_alu_instr *alu)
{
unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
+ unsigned dst_components = nir_dest_num_components(alu->dest.dest);
+
/* 9 bits for nir_op */
STATIC_ASSERT(nir_num_opcodes <= 512);
union packed_instr header;
header.alu.no_signed_wrap = alu->no_signed_wrap;
header.alu.no_unsigned_wrap = alu->no_unsigned_wrap;
header.alu.saturate = alu->dest.saturate;
- header.alu.writemask = alu->dest.write_mask;
header.alu.op = alu->op;
header.alu.packed_src_ssa_16bit = is_alu_src_ssa_16bit(ctx, alu);
- write_dest(ctx, &alu->dest.dest, header);
+ if (header.alu.packed_src_ssa_16bit &&
+ alu->dest.dest.is_ssa) {
+ /* For packed srcs of SSA ALUs, this field stores the swizzles. */
+ header.alu.writemask_or_two_swizzles = alu->src[0].swizzle[0];
+ if (num_srcs > 1)
+ header.alu.writemask_or_two_swizzles |= alu->src[1].swizzle[0] << 2;
+ } else if (!alu->dest.dest.is_ssa && dst_components <= 4) {
+ /* For vec4 registers, this field is a writemask. */
+ header.alu.writemask_or_two_swizzles = alu->dest.write_mask;
+ }
+
+ write_dest(ctx, &alu->dest.dest, header, alu->instr.type);
+
+ if (!alu->dest.dest.is_ssa && dst_components > 4)
+ blob_write_uint32(ctx->blob, alu->dest.write_mask);
if (header.alu.packed_src_ssa_16bit) {
for (unsigned i = 0; i < num_srcs; i++) {
}
} else {
for (unsigned i = 0; i < num_srcs; i++) {
+ unsigned src_channels = nir_ssa_alu_instr_src_components(alu, i);
+ unsigned src_components = nir_src_num_components(alu->src[i].src);
union packed_src src;
+ bool packed = src_components <= 4 && src_channels <= 4;
src.u32 = 0;
src.alu.negate = alu->src[i].negate;
src.alu.abs = alu->src[i].abs;
- src.alu.swizzle_x = alu->src[i].swizzle[0];
- src.alu.swizzle_y = alu->src[i].swizzle[1];
- src.alu.swizzle_z = alu->src[i].swizzle[2];
- src.alu.swizzle_w = alu->src[i].swizzle[3];
+
+ if (packed) {
+ src.alu.swizzle_x = alu->src[i].swizzle[0];
+ src.alu.swizzle_y = alu->src[i].swizzle[1];
+ src.alu.swizzle_z = alu->src[i].swizzle[2];
+ src.alu.swizzle_w = alu->src[i].swizzle[3];
+ }
write_src_full(ctx, &alu->src[i].src, src);
+
+ /* Store swizzles for vec8 and vec16. */
+ if (!packed) {
+ for (unsigned o = 0; o < src_channels; o += 8) {
+ unsigned value = 0;
+
+ for (unsigned j = 0; j < 8 && o + j < src_channels; j++) {
+ value |= (uint32_t)alu->src[i].swizzle[o + j] <<
+ (4 * j); /* 4 bits per swizzle */
+ }
+
+ blob_write_uint32(ctx->blob, value);
+ }
+ }
}
}
}
alu->no_signed_wrap = header.alu.no_signed_wrap;
alu->no_unsigned_wrap = header.alu.no_unsigned_wrap;
alu->dest.saturate = header.alu.saturate;
- alu->dest.write_mask = header.alu.writemask;
read_dest(ctx, &alu->dest.dest, &alu->instr, header);
+ unsigned dst_components = nir_dest_num_components(alu->dest.dest);
+
+ if (alu->dest.dest.is_ssa) {
+ alu->dest.write_mask = u_bit_consecutive(0, dst_components);
+ } else if (dst_components <= 4) {
+ alu->dest.write_mask = header.alu.writemask_or_two_swizzles;
+ } else {
+ alu->dest.write_mask = blob_read_uint32(ctx->blob);
+ }
+
if (header.alu.packed_src_ssa_16bit) {
for (unsigned i = 0; i < num_srcs; i++) {
nir_alu_src *src = &alu->src[i];
} else {
for (unsigned i = 0; i < num_srcs; i++) {
union packed_src src = read_src(ctx, &alu->src[i].src, &alu->instr);
+ unsigned src_channels = nir_ssa_alu_instr_src_components(alu, i);
+ unsigned src_components = nir_src_num_components(alu->src[i].src);
+ bool packed = src_components <= 4 && src_channels <= 4;
alu->src[i].negate = src.alu.negate;
alu->src[i].abs = src.alu.abs;
- alu->src[i].swizzle[0] = src.alu.swizzle_x;
- alu->src[i].swizzle[1] = src.alu.swizzle_y;
- alu->src[i].swizzle[2] = src.alu.swizzle_z;
- alu->src[i].swizzle[3] = src.alu.swizzle_w;
+
+ memset(&alu->src[i].swizzle, 0, sizeof(alu->src[i].swizzle));
+
+ if (packed) {
+ alu->src[i].swizzle[0] = src.alu.swizzle_x;
+ alu->src[i].swizzle[1] = src.alu.swizzle_y;
+ alu->src[i].swizzle[2] = src.alu.swizzle_z;
+ alu->src[i].swizzle[3] = src.alu.swizzle_w;
+ } else {
+ /* Load swizzles for vec8 and vec16. */
+ for (unsigned o = 0; o < src_channels; o += 8) {
+ unsigned value = blob_read_uint32(ctx->blob);
+
+ for (unsigned j = 0; j < 8 && o + j < src_channels; j++) {
+ alu->src[i].swizzle[o + j] =
+ (value >> (4 * j)) & 0xf; /* 4 bits per swizzle */
+ }
+ }
+ }
}
}
+ if (header.alu.packed_src_ssa_16bit &&
+ alu->dest.dest.is_ssa) {
+ alu->src[0].swizzle[0] = header.alu.writemask_or_two_swizzles & 0x3;
+ if (num_srcs > 1)
+ alu->src[1].swizzle[0] = header.alu.writemask_or_two_swizzles >> 2;
+ }
+
return alu;
}
header.deref_var.object_idx = var_idx;
}
- write_dest(ctx, &deref->dest, header);
+ if (deref->deref_type == nir_deref_type_array ||
+ deref->deref_type == nir_deref_type_ptr_as_array) {
+ header.deref.packed_src_ssa_16bit =
+ deref->parent.is_ssa && deref->arr.index.is_ssa &&
+ are_object_ids_16bit(ctx);
+ }
+
+ write_dest(ctx, &deref->dest, header, deref->instr.type);
switch (deref->deref_type) {
case nir_deref_type_var:
case nir_deref_type_array:
case nir_deref_type_ptr_as_array:
- write_src(ctx, &deref->parent);
- write_src(ctx, &deref->arr.index);
+ if (header.deref.packed_src_ssa_16bit) {
+ blob_write_uint16(ctx->blob,
+ write_lookup_object(ctx, deref->parent.ssa));
+ blob_write_uint16(ctx->blob,
+ write_lookup_object(ctx, deref->arr.index.ssa));
+ } else {
+ write_src(ctx, &deref->parent);
+ write_src(ctx, &deref->arr.index);
+ }
break;
case nir_deref_type_cast:
case nir_deref_type_array:
case nir_deref_type_ptr_as_array:
- read_src(ctx, &deref->parent, &deref->instr);
+ if (header.deref.packed_src_ssa_16bit) {
+ deref->parent.is_ssa = true;
+ deref->parent.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob));
+ deref->arr.index.is_ssa = true;
+ deref->arr.index.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob));
+ } else {
+ read_src(ctx, &deref->parent, &deref->instr);
+ read_src(ctx, &deref->arr.index, &deref->instr);
+ }
+
parent = nir_src_as_deref(deref->parent);
if (deref->deref_type == nir_deref_type_array)
deref->type = glsl_get_array_element(parent->type);
else
deref->type = parent->type;
- read_src(ctx, &deref->arr.index, &deref->instr);
break;
case nir_deref_type_cast:
header.intrinsic.instr_type = intrin->instr.type;
header.intrinsic.intrinsic = intrin->intrinsic;
- header.intrinsic.num_components =
- encode_num_components_in_3bits(intrin->num_components);
/* Analyze constant indices to decide how to encode them. */
if (num_indices) {
max_bits = MAX2(max_bits, max);
}
- if (max_bits * num_indices <= 6) {
- header.intrinsic.const_indices_encoding = const_indices_6bit_all_combined;
+ if (max_bits * num_indices <= 9) {
+ header.intrinsic.const_indices_encoding = const_indices_9bit_all_combined;
-      /* Pack all const indices into 6 bits. */
+      /* Pack all const indices into 9 bits. */
- unsigned bit_size = 6 / num_indices;
+ unsigned bit_size = 9 / num_indices;
for (unsigned i = 0; i < num_indices; i++) {
header.intrinsic.packed_const_indices |=
intrin->const_index[i] << (i * bit_size);
}
if (nir_intrinsic_infos[intrin->intrinsic].has_dest)
- write_dest(ctx, &intrin->dest, header);
+ write_dest(ctx, &intrin->dest, header, intrin->instr.type);
else
blob_write_uint32(ctx->blob, header.u32);
unsigned num_srcs = nir_intrinsic_infos[op].num_srcs;
unsigned num_indices = nir_intrinsic_infos[op].num_indices;
- intrin->num_components =
- decode_num_components_in_3bits(header.intrinsic.num_components);
-
if (nir_intrinsic_infos[op].has_dest)
read_dest(ctx, &intrin->dest, &intrin->instr, header);
for (unsigned i = 0; i < num_srcs; i++)
read_src(ctx, &intrin->src[i], &intrin->instr);
+   /* Vectorized intrinsics have num_components same as dst or src that has
+ * 0 components in the info. Find it.
+ */
+ if (nir_intrinsic_infos[op].has_dest &&
+ nir_intrinsic_infos[op].dest_components == 0) {
+ intrin->num_components = nir_dest_num_components(intrin->dest);
+ } else {
+ for (unsigned i = 0; i < num_srcs; i++) {
+ if (nir_intrinsic_infos[op].src_components[i] == 0) {
+ intrin->num_components = nir_src_num_components(intrin->src[i]);
+ break;
+ }
+ }
+ }
+
if (num_indices) {
switch (header.intrinsic.const_indices_encoding) {
- case const_indices_6bit_all_combined: {
- unsigned bit_size = 6 / num_indices;
+ case const_indices_9bit_all_combined: {
+ unsigned bit_size = 9 / num_indices;
unsigned bit_mask = u_bit_consecutive(0, bit_size);
for (unsigned i = 0; i < num_indices; i++) {
intrin->const_index[i] =
union packed_tex_data {
uint32_t u32;
struct {
- enum glsl_sampler_dim sampler_dim:4;
- nir_alu_type dest_type:8;
+ unsigned sampler_dim:4;
+ unsigned dest_type:8;
unsigned coord_components:3;
unsigned is_array:1;
unsigned is_shadow:1;
unsigned is_new_style_shadow:1;
unsigned component:2;
- unsigned unused:10; /* Mark unused for valgrind. */
+ unsigned texture_non_uniform:1;
+ unsigned sampler_non_uniform:1;
+ unsigned unused:8; /* Mark unused for valgrind. */
} u;
};
{
assert(tex->num_srcs < 16);
assert(tex->op < 16);
- assert(tex->texture_array_size < 1024);
union packed_instr header;
header.u32 = 0;
header.tex.instr_type = tex->instr.type;
header.tex.num_srcs = tex->num_srcs;
header.tex.op = tex->op;
- header.tex.texture_array_size = tex->texture_array_size;
- write_dest(ctx, &tex->dest, header);
+ write_dest(ctx, &tex->dest, header, tex->instr.type);
blob_write_uint32(ctx->blob, tex->texture_index);
blob_write_uint32(ctx->blob, tex->sampler_index);
.u.is_shadow = tex->is_shadow,
.u.is_new_style_shadow = tex->is_new_style_shadow,
.u.component = tex->component,
+ .u.texture_non_uniform = tex->texture_non_uniform,
+ .u.sampler_non_uniform = tex->sampler_non_uniform,
};
blob_write_uint32(ctx->blob, packed.u32);
tex->op = header.tex.op;
tex->texture_index = blob_read_uint32(ctx->blob);
- tex->texture_array_size = header.tex.texture_array_size;
tex->sampler_index = blob_read_uint32(ctx->blob);
if (tex->op == nir_texop_tg4)
blob_copy_bytes(ctx->blob, tex->tg4_offsets, sizeof(tex->tg4_offsets));
tex->is_shadow = packed.u.is_shadow;
tex->is_new_style_shadow = packed.u.is_new_style_shadow;
tex->component = packed.u.component;
+ tex->texture_non_uniform = packed.u.texture_non_uniform;
+ tex->sampler_non_uniform = packed.u.sampler_non_uniform;
for (unsigned i = 0; i < tex->num_srcs; i++) {
union packed_src src = read_src(ctx, &tex->src[i].src, &tex->instr);
* and then store enough information so that a later fixup pass can fill
* them in correctly.
*/
- write_dest(ctx, &phi->dest, header);
+ write_dest(ctx, &phi->dest, header, phi->instr.type);
nir_foreach_phi_src(src, phi) {
assert(src->src.is_ssa);
}
}
-static void
+/* Return the number of instructions read. */
+static unsigned
read_instr(read_ctx *ctx, nir_block *block)
{
STATIC_ASSERT(sizeof(union packed_instr) == 4);
switch (header.any.instr_type) {
case nir_instr_type_alu:
- instr = &read_alu(ctx, header)->instr;
- break;
+ for (unsigned i = 0; i <= header.alu.num_followup_alu_sharing_header; i++)
+ nir_instr_insert_after_block(block, &read_alu(ctx, header)->instr);
+ return header.alu.num_followup_alu_sharing_header + 1;
case nir_instr_type_deref:
instr = &read_deref(ctx, header)->instr;
break;
* are read so that we can set their sources up.
*/
read_phi(ctx, block, header);
- return;
+ return 1;
case nir_instr_type_jump:
instr = &read_jump(ctx, header)->instr;
break;
}
nir_instr_insert_after_block(block, instr);
+ return 1;
}
static void
{
write_add_object(ctx, block);
blob_write_uint32(ctx->blob, exec_list_length(&block->instr_list));
- nir_foreach_instr(instr, block)
+
+ ctx->last_instr_type = ~0;
+ ctx->last_alu_header_offset = 0;
+
+ nir_foreach_instr(instr, block) {
write_instr(ctx, instr);
+ ctx->last_instr_type = instr->type;
+ }
}
static void
read_add_object(ctx, block);
unsigned num_instrs = blob_read_uint32(ctx->blob);
- for (unsigned i = 0; i < num_instrs; i++) {
- read_instr(ctx, block);
+ for (unsigned i = 0; i < num_instrs;) {
+ i += read_instr(ctx, block);
}
}
static void
write_function_impl(write_ctx *ctx, const nir_function_impl *fi)
{
+ blob_write_uint8(ctx->blob, fi->structured);
+
write_var_list(ctx, &fi->locals);
write_reg_list(ctx, &fi->registers);
blob_write_uint32(ctx->blob, fi->reg_alloc);
nir_function_impl *fi = nir_function_impl_create_bare(ctx->nir);
fi->function = fxn;
+ fi->structured = blob_read_uint8(ctx->blob);
+
read_var_list(ctx, &fi->locals);
read_reg_list(ctx, &fi->registers);
fi->reg_alloc = blob_read_uint32(ctx->blob);
info.name = info.label = NULL;
blob_write_bytes(blob, (uint8_t *) &info, sizeof(info));
- write_var_list(&ctx, &nir->uniforms);
- write_var_list(&ctx, &nir->inputs);
- write_var_list(&ctx, &nir->outputs);
- write_var_list(&ctx, &nir->shared);
- write_var_list(&ctx, &nir->globals);
- write_var_list(&ctx, &nir->system_values);
+ write_var_list(&ctx, &nir->variables);
blob_write_uint32(blob, nir->num_inputs);
blob_write_uint32(blob, nir->num_uniforms);
ctx.nir->info = info;
- read_var_list(&ctx, &ctx.nir->uniforms);
- read_var_list(&ctx, &ctx.nir->inputs);
- read_var_list(&ctx, &ctx.nir->outputs);
- read_var_list(&ctx, &ctx.nir->shared);
- read_var_list(&ctx, &ctx.nir->globals);
- read_var_list(&ctx, &ctx.nir->system_values);
+ read_var_list(&ctx, &ctx.nir->variables);
ctx.nir->num_inputs = blob_read_uint32(blob);
ctx.nir->num_uniforms = blob_read_uint32(blob);