X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fcompiler%2Fnir%2Fnir_serialize.c;h=6688e9e8b172260ed05986b4f7052238774601e6;hb=11470fcde266aa8b864b6a114fc923b2b8e5907a;hp=54a00c81d66c29da0ab604be837ac9bacb1b0862;hpb=57372c5a42969afe6c7afd6a0389a92e3e1a5178;p=mesa.git diff --git a/src/compiler/nir/nir_serialize.c b/src/compiler/nir/nir_serialize.c index 54a00c81d66..6688e9e8b17 100644 --- a/src/compiler/nir/nir_serialize.c +++ b/src/compiler/nir/nir_serialize.c @@ -24,6 +24,10 @@ #include "nir_serialize.h" #include "nir_control_flow.h" #include "util/u_dynarray.h" +#include "util/u_math.h" + +#define NIR_SERIALIZE_FUNC_HAS_IMPL ((void *)(intptr_t)1) +#define MAX_OBJECT_IDS (1 << 20) typedef struct { size_t blob_offset; @@ -40,12 +44,24 @@ typedef struct { struct hash_table *remap_table; /* the next index to assign to a NIR in-memory object */ - uintptr_t next_idx; + uint32_t next_idx; /* Array of write_phi_fixup structs representing phi sources that need to * be resolved in the second pass. */ struct util_dynarray phi_fixups; + + /* The last serialized type. */ + const struct glsl_type *last_type; + const struct glsl_type *last_interface_type; + struct nir_variable_data last_var_data; + + /* For skipping equal ALU headers (typical after scalarization). */ + nir_instr_type last_instr_type; + uintptr_t last_alu_header_offset; + + /* Don't write optional data such as variable names. */ + bool strip; } write_ctx; typedef struct { @@ -54,10 +70,10 @@ typedef struct { struct blob_reader *blob; /* the next index to assign to a NIR in-memory object */ - uintptr_t next_idx; + uint32_t next_idx; /* The length of the index -> object table */ - uintptr_t idx_table_len; + uint32_t idx_table_len; /* map from index to deserialized pointer */ void **idx_table; @@ -65,27 +81,26 @@ typedef struct { /* List of phi sources. */ struct list_head phi_srcs; + /* The last deserialized type. */ + const struct glsl_type *last_type; + const struct glsl_type *last_interface_type; + struct nir_variable_data last_var_data; } read_ctx; static void write_add_object(write_ctx *ctx, const void *obj) { - uintptr_t index = ctx->next_idx++; - _mesa_hash_table_insert(ctx->remap_table, obj, (void *) index); + uint32_t index = ctx->next_idx++; + assert(index != MAX_OBJECT_IDS); + _mesa_hash_table_insert(ctx->remap_table, obj, (void *)(uintptr_t) index); } -static uintptr_t +static uint32_t write_lookup_object(write_ctx *ctx, const void *obj) { struct hash_entry *entry = _mesa_hash_table_search(ctx->remap_table, obj); assert(entry); - return (uintptr_t) entry->data; -} - -static void -write_object(write_ctx *ctx, const void *obj) -{ - blob_write_intptr(ctx->blob, write_lookup_object(ctx, obj)); + return (uint32_t)(uintptr_t) entry->data; } static void @@ -96,7 +111,7 @@ read_add_object(read_ctx *ctx, void *obj) } static void * -read_lookup_object(read_ctx *ctx, uintptr_t idx) +read_lookup_object(read_ctx *ctx, uint32_t idx) { assert(idx < ctx->idx_table_len); return ctx->idx_table[idx]; @@ -105,7 +120,55 @@ read_lookup_object(read_ctx *ctx, uintptr_t idx) static void * read_object(read_ctx *ctx) { - return read_lookup_object(ctx, blob_read_intptr(ctx->blob)); + return read_lookup_object(ctx, blob_read_uint32(ctx->blob)); +} + +static uint32_t +encode_bit_size_3bits(uint8_t bit_size) +{ + /* Encode values of 0, 1, 2, 4, 8, 16, 32, 64 in 3 bits. 
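+    * Zero stays zero; anything else becomes util_logbase2(bit_size) + 1,
+    * i.e. 1 -> 1, 2 -> 2, 4 -> 3, 8 -> 4, 16 -> 5, 32 -> 6, 64 -> 7.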
*/ + assert(bit_size <= 64 && util_is_power_of_two_or_zero(bit_size)); + if (bit_size) + return util_logbase2(bit_size) + 1; + return 0; +} + +static uint8_t +decode_bit_size_3bits(uint8_t bit_size) +{ + if (bit_size) + return 1 << (bit_size - 1); + return 0; +} + +#define NUM_COMPONENTS_IS_SEPARATE_7 7 + +static uint8_t +encode_num_components_in_3bits(uint8_t num_components) +{ + if (num_components <= 4) + return num_components; + if (num_components == 8) + return 5; + if (num_components == 16) + return 6; + + /* special value indicating that num_components is in the next uint32 */ + return NUM_COMPONENTS_IS_SEPARATE_7; +} + +static uint8_t +decode_num_components_in_3bits(uint8_t value) +{ + if (value <= 4) + return value; + if (value == 5) + return 8; + if (value == 6) + return 16; + + unreachable("invalid num_components encoding"); + return 0; } static void @@ -124,30 +187,149 @@ read_constant(read_ctx *ctx, nir_variable *nvar) blob_copy_bytes(ctx->blob, (uint8_t *)c->values, sizeof(c->values)); c->num_elements = blob_read_uint32(ctx->blob); - c->elements = ralloc_array(ctx->nir, nir_constant *, c->num_elements); + c->elements = ralloc_array(nvar, nir_constant *, c->num_elements); for (unsigned i = 0; i < c->num_elements; i++) c->elements[i] = read_constant(ctx, nvar); return c; } +enum var_data_encoding { + var_encode_full, + var_encode_shader_temp, + var_encode_function_temp, + var_encode_location_diff, +}; + +union packed_var { + uint32_t u32; + struct { + unsigned has_name:1; + unsigned has_constant_initializer:1; + unsigned has_pointer_initializer:1; + unsigned has_interface_type:1; + unsigned num_state_slots:7; + unsigned data_encoding:2; + unsigned type_same_as_last:1; + unsigned interface_type_same_as_last:1; + unsigned _pad:1; + unsigned num_members:16; + } u; +}; + +union packed_var_data_diff { + uint32_t u32; + struct { + int location:13; + int location_frac:3; + int driver_location:16; + } u; +}; + static void write_variable(write_ctx *ctx, const nir_variable *var) { write_add_object(ctx, var); - encode_type_to_blob(ctx->blob, var->type); - blob_write_uint32(ctx->blob, !!(var->name)); - blob_write_string(ctx->blob, var->name); - blob_write_bytes(ctx->blob, (uint8_t *) &var->data, sizeof(var->data)); - blob_write_uint32(ctx->blob, var->num_state_slots); - blob_write_bytes(ctx->blob, (uint8_t *) var->state_slots, - var->num_state_slots * sizeof(nir_state_slot)); - blob_write_uint32(ctx->blob, !!(var->constant_initializer)); + + assert(var->num_state_slots < (1 << 7)); + + STATIC_ASSERT(sizeof(union packed_var) == 4); + union packed_var flags; + flags.u32 = 0; + + flags.u.has_name = !ctx->strip && var->name; + flags.u.has_constant_initializer = !!(var->constant_initializer); + flags.u.has_pointer_initializer = !!(var->pointer_initializer); + flags.u.has_interface_type = !!(var->interface_type); + flags.u.type_same_as_last = var->type == ctx->last_type; + flags.u.interface_type_same_as_last = + var->interface_type && var->interface_type == ctx->last_interface_type; + flags.u.num_state_slots = var->num_state_slots; + flags.u.num_members = var->num_members; + + struct nir_variable_data data = var->data; + + /* When stripping, we expect that the location is no longer needed, + * which is typically after shaders are linked. + */ + if (ctx->strip && + data.mode != nir_var_shader_in && + data.mode != nir_var_shader_out) + data.location = 0; + + /* Temporary variables don't serialize var->data. 
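+    * Their mode alone is enough: read_variable() below rebuilds var->data
+    * from the mode and leaves the remaining fields zeroed.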
*/
+   if (data.mode == nir_var_shader_temp)
+      flags.u.data_encoding = var_encode_shader_temp;
+   else if (data.mode == nir_var_function_temp)
+      flags.u.data_encoding = var_encode_function_temp;
+   else {
+      struct nir_variable_data tmp = data;
+
+      tmp.location = ctx->last_var_data.location;
+      tmp.location_frac = ctx->last_var_data.location_frac;
+      tmp.driver_location = ctx->last_var_data.driver_location;
+
+      /* See if we can encode only the difference in locations from the last
+       * variable.
+       */
+      if (memcmp(&ctx->last_var_data, &tmp, sizeof(tmp)) == 0 &&
+          abs((int)data.location -
+              (int)ctx->last_var_data.location) < (1 << 12) &&
+          abs((int)data.driver_location -
+              (int)ctx->last_var_data.driver_location) < (1 << 15))
+         flags.u.data_encoding = var_encode_location_diff;
+      else
+         flags.u.data_encoding = var_encode_full;
+   }
+
+   blob_write_uint32(ctx->blob, flags.u32);
+
+   if (!flags.u.type_same_as_last) {
+      encode_type_to_blob(ctx->blob, var->type);
+      ctx->last_type = var->type;
+   }
+
+   if (var->interface_type && !flags.u.interface_type_same_as_last) {
+      encode_type_to_blob(ctx->blob, var->interface_type);
+      ctx->last_interface_type = var->interface_type;
+   }
+
+   if (flags.u.has_name)
+      blob_write_string(ctx->blob, var->name);
+
+   if (flags.u.data_encoding == var_encode_full ||
+       flags.u.data_encoding == var_encode_location_diff) {
+      if (flags.u.data_encoding == var_encode_full) {
+         blob_write_bytes(ctx->blob, &data, sizeof(data));
+      } else {
+         /* Serialize only the difference in locations from the last
+          * variable.
+          */
+         union packed_var_data_diff diff;
+
+         diff.u.location = data.location - ctx->last_var_data.location;
+         diff.u.location_frac = data.location_frac -
+                                ctx->last_var_data.location_frac;
+         diff.u.driver_location = data.driver_location -
+                                  ctx->last_var_data.driver_location;
+
+         blob_write_uint32(ctx->blob, diff.u32);
+      }
+
+      ctx->last_var_data = data;
+   }
+
+   for (unsigned i = 0; i < var->num_state_slots; i++) {
+      blob_write_bytes(ctx->blob, &var->state_slots[i],
+                       sizeof(var->state_slots[i]));
+   }
 
    if (var->constant_initializer)
       write_constant(ctx, var->constant_initializer);
-   blob_write_uint32(ctx->blob, !!(var->interface_type));
-   if (var->interface_type)
-      encode_type_to_blob(ctx->blob, var->interface_type);
+   if (var->pointer_initializer)
+      blob_write_uint32(ctx->blob,
+                        write_lookup_object(ctx, var->pointer_initializer));
+   if (var->num_members > 0) {
+      blob_write_bytes(ctx->blob, (uint8_t *) var->members,
+                       var->num_members * sizeof(*var->members));
+   }
 }
 
 static nir_variable *
@@ -156,29 +338,77 @@ read_variable(read_ctx *ctx)
    nir_variable *var = rzalloc(ctx->nir, nir_variable);
    read_add_object(ctx, var);
 
-   var->type = decode_type_from_blob(ctx->blob);
-   bool has_name = blob_read_uint32(ctx->blob);
-   if (has_name) {
+   union packed_var flags;
+   flags.u32 = blob_read_uint32(ctx->blob);
+
+   if (flags.u.type_same_as_last) {
+      var->type = ctx->last_type;
+   } else {
+      var->type = decode_type_from_blob(ctx->blob);
+      ctx->last_type = var->type;
+   }
+
+   if (flags.u.has_interface_type) {
+      if (flags.u.interface_type_same_as_last) {
+         var->interface_type = ctx->last_interface_type;
+      } else {
+         var->interface_type = decode_type_from_blob(ctx->blob);
+         ctx->last_interface_type = var->interface_type;
+      }
+   }
+
+   if (flags.u.has_name) {
       const char *name = blob_read_string(ctx->blob);
       var->name = ralloc_strdup(var, name);
    } else {
       var->name = NULL;
    }
-   blob_copy_bytes(ctx->blob, (uint8_t *) &var->data, sizeof(var->data));
-   var->num_state_slots = blob_read_uint32(ctx->blob);
-   var->state_slots = ralloc_array(var, 
nir_state_slot, var->num_state_slots); - blob_copy_bytes(ctx->blob, (uint8_t *) var->state_slots, - var->num_state_slots * sizeof(nir_state_slot)); - bool has_const_initializer = blob_read_uint32(ctx->blob); - if (has_const_initializer) + + if (flags.u.data_encoding == var_encode_shader_temp) + var->data.mode = nir_var_shader_temp; + else if (flags.u.data_encoding == var_encode_function_temp) + var->data.mode = nir_var_function_temp; + else if (flags.u.data_encoding == var_encode_full) { + blob_copy_bytes(ctx->blob, (uint8_t *) &var->data, sizeof(var->data)); + ctx->last_var_data = var->data; + } else { /* var_encode_location_diff */ + union packed_var_data_diff diff; + diff.u32 = blob_read_uint32(ctx->blob); + + var->data = ctx->last_var_data; + var->data.location += diff.u.location; + var->data.location_frac += diff.u.location_frac; + var->data.driver_location += diff.u.driver_location; + + ctx->last_var_data = var->data; + } + + var->num_state_slots = flags.u.num_state_slots; + if (var->num_state_slots != 0) { + var->state_slots = ralloc_array(var, nir_state_slot, + var->num_state_slots); + for (unsigned i = 0; i < var->num_state_slots; i++) { + blob_copy_bytes(ctx->blob, &var->state_slots[i], + sizeof(var->state_slots[i])); + } + } + if (flags.u.has_constant_initializer) var->constant_initializer = read_constant(ctx, var); else var->constant_initializer = NULL; - bool has_interface_type = blob_read_uint32(ctx->blob); - if (has_interface_type) - var->interface_type = decode_type_from_blob(ctx->blob); + + if (flags.u.has_pointer_initializer) + var->pointer_initializer = read_object(ctx); else - var->interface_type = NULL; + var->pointer_initializer = NULL; + + var->num_members = flags.u.num_members; + if (var->num_members > 0) { + var->members = ralloc_array(var, struct nir_variable_data, + var->num_members); + blob_copy_bytes(ctx->blob, (uint8_t *) var->members, + var->num_members * sizeof(*var->members)); + } return var; } @@ -211,10 +441,9 @@ write_register(write_ctx *ctx, const nir_register *reg) blob_write_uint32(ctx->blob, reg->bit_size); blob_write_uint32(ctx->blob, reg->num_array_elems); blob_write_uint32(ctx->blob, reg->index); - blob_write_uint32(ctx->blob, !!(reg->name)); - if (reg->name) + blob_write_uint32(ctx->blob, !ctx->strip && reg->name); + if (!ctx->strip && reg->name) blob_write_string(ctx->blob, reg->name); - blob_write_uint32(ctx->blob, reg->is_global << 1 | reg->is_packed); } static nir_register * @@ -233,9 +462,6 @@ read_register(read_ctx *ctx) } else { reg->name = NULL; } - unsigned flags = blob_read_uint32(ctx->blob); - reg->is_global = flags & 0x2; - reg->is_packed = flags & 0x1; list_inithead(®->uses); list_inithead(®->defs); @@ -263,8 +489,32 @@ read_reg_list(read_ctx *ctx, struct exec_list *dst) } } +union packed_src { + uint32_t u32; + struct { + unsigned is_ssa:1; /* <-- Header */ + unsigned is_indirect:1; + unsigned object_idx:20; + unsigned _footer:10; /* <-- Footer */ + } any; + struct { + unsigned _header:22; /* <-- Header */ + unsigned negate:1; /* <-- Footer */ + unsigned abs:1; + unsigned swizzle_x:2; + unsigned swizzle_y:2; + unsigned swizzle_z:2; + unsigned swizzle_w:2; + } alu; + struct { + unsigned _header:22; /* <-- Header */ + unsigned src_type:5; /* <-- Footer */ + unsigned _pad:5; + } tex; +}; + static void -write_src(write_ctx *ctx, const nir_src *src) +write_src_full(write_ctx *ctx, const nir_src *src, union packed_src header) { /* Since sources are very frequent, we try to save some space when storing * them. 
In particular, we pack whether the source is an SSA value
    * or a register, an indirect flag and a 20-bit object index into a single
    * 32-bit header (union packed_src above); write_add_object() guarantees
    * that the index never exceeds MAX_OBJECT_IDS.
    */
+   header.any.is_ssa = src->is_ssa;
    if (src->is_ssa) {
-      uintptr_t idx = write_lookup_object(ctx, src->ssa) << 2;
-      idx |= 1;
-      blob_write_intptr(ctx->blob, idx);
+      header.any.object_idx = write_lookup_object(ctx, src->ssa);
+      blob_write_uint32(ctx->blob, header.u32);
    } else {
-      uintptr_t idx = write_lookup_object(ctx, src->reg.reg) << 2;
-      if (src->reg.indirect)
-         idx |= 2;
-      blob_write_intptr(ctx->blob, idx);
+      header.any.object_idx = write_lookup_object(ctx, src->reg.reg);
+      header.any.is_indirect = !!src->reg.indirect;
+      blob_write_uint32(ctx->blob, header.u32);
       blob_write_uint32(ctx->blob, src->reg.base_offset);
       if (src->reg.indirect) {
-         write_src(ctx, src->reg.indirect);
+         union packed_src header = {0};
+         write_src_full(ctx, src->reg.indirect, header);
      }
   }
 }
 
 static void
+write_src(write_ctx *ctx, const nir_src *src)
+{
+   union packed_src header = {0};
+   write_src_full(ctx, src, header);
+}
+
+static union packed_src
 read_src(read_ctx *ctx, nir_src *src, void *mem_ctx)
 {
-   uintptr_t val = blob_read_intptr(ctx->blob);
-   uintptr_t idx = val >> 2;
-   src->is_ssa = val & 0x1;
+   STATIC_ASSERT(sizeof(union packed_src) == 4);
+   union packed_src header;
+   header.u32 = blob_read_uint32(ctx->blob);
+
+   src->is_ssa = header.any.is_ssa;
    if (src->is_ssa) {
-      src->ssa = read_lookup_object(ctx, idx);
+      src->ssa = read_lookup_object(ctx, header.any.object_idx);
    } else {
-      bool is_indirect = val & 0x2;
-      src->reg.reg = read_lookup_object(ctx, idx);
+      src->reg.reg = read_lookup_object(ctx, header.any.object_idx);
       src->reg.base_offset = blob_read_uint32(ctx->blob);
-      if (is_indirect) {
+      if (header.any.is_indirect) {
         src->reg.indirect = ralloc(mem_ctx, nir_src);
         read_src(ctx, src->reg.indirect, mem_ctx);
      } else {
         src->reg.indirect = NULL;
      }
   }
+   return header;
 }
 
+union packed_dest {
+   uint8_t u8;
+   struct {
+      uint8_t is_ssa:1;
+      uint8_t has_name:1;
+      uint8_t num_components:3;
+      uint8_t bit_size:3;
+   } ssa;
+   struct {
+      uint8_t is_ssa:1;
+      uint8_t is_indirect:1;
+      uint8_t _pad:6;
+   } reg;
+};
+
+enum intrinsic_const_indices_encoding {
+   /* Use the 9 bits of packed_const_indices to store 1-9 indices.
+    * 1 9-bit index, or 2 4-bit indices, or 3 3-bit indices, or
+    * 4 2-bit indices, or 5-9 1-bit indices.
+    *
+    * The common case for load_ubo is 0, 0, 0, which is trivially represented.
+    * The common cases for load_interpolated_input also fit here, e.g.: 7, 3
+    */
+   const_indices_9bit_all_combined,
+
+   const_indices_8bit,  /* 8 bits per element */
+   const_indices_16bit, /* 16 bits per element */
+   const_indices_32bit, /* 32 bits per element */
+};
+
+enum load_const_packing {
+   /* Constants are not packed and are stored in the following dwords. 
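+    * (two dwords per component for 64-bit values, one for 32-bit, packed
+    * uint16/uint8 for the narrower sizes; see write_load_const)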
*/
+   load_const_full,
+
+   /* packed_value contains high 19 bits, low bits are 0,
+    * good for floating-point decimals
+    */
+   load_const_scalar_hi_19bits,
+
+   /* packed_value contains low 19 bits, high bits are sign-extended */
+   load_const_scalar_lo_19bits_sext,
+};
+
+union packed_instr {
+   uint32_t u32;
+   struct {
+      unsigned instr_type:4; /* always present */
+      unsigned _pad:20;
+      unsigned dest:8;       /* always last */
+   } any;
+   struct {
+      unsigned instr_type:4;
+      unsigned exact:1;
+      unsigned no_signed_wrap:1;
+      unsigned no_unsigned_wrap:1;
+      unsigned saturate:1;
+      /* Reg: writemask; SSA: swizzles for 2 srcs */
+      unsigned writemask_or_two_swizzles:4;
+      unsigned op:9;
+      unsigned packed_src_ssa_16bit:1;
+      /* Scalarized ALUs always have the same header. */
+      unsigned num_followup_alu_sharing_header:2;
+      unsigned dest:8;
+   } alu;
+   struct {
+      unsigned instr_type:4;
+      unsigned deref_type:3;
+      unsigned cast_type_same_as_last:1;
+      unsigned mode:10; /* deref_var redefines this */
+      unsigned packed_src_ssa_16bit:1; /* deref_var redefines this */
+      unsigned _pad:5; /* deref_var redefines this */
+      unsigned dest:8;
+   } deref;
+   struct {
+      unsigned instr_type:4;
+      unsigned deref_type:3;
+      unsigned _pad:1;
+      unsigned object_idx:16; /* if 0, the object ID is a separate uint32 */
+      unsigned dest:8;
+   } deref_var;
+   struct {
+      unsigned instr_type:4;
+      unsigned intrinsic:9;
+      unsigned const_indices_encoding:2;
+      unsigned packed_const_indices:9;
+      unsigned dest:8;
+   } intrinsic;
+   struct {
+      unsigned instr_type:4;
+      unsigned last_component:4;
+      unsigned bit_size:3;
+      unsigned packing:2;       /* enum load_const_packing */
+      unsigned packed_value:19; /* meaning determined by packing */
+   } load_const;
+   struct {
+      unsigned instr_type:4;
+      unsigned last_component:4;
+      unsigned bit_size:3;
+      unsigned _pad:21;
+   } undef;
+   struct {
+      unsigned instr_type:4;
+      unsigned num_srcs:4;
+      unsigned op:4;
+      unsigned dest:8;
+      unsigned _pad:12;
+   } tex;
+   struct {
+      unsigned instr_type:4;
+      unsigned num_srcs:20;
+      unsigned dest:8;
+   } phi;
+   struct {
+      unsigned instr_type:4;
+      unsigned type:2;
+      unsigned _pad:26;
+   } jump;
+};
+
+/* Callers fill the low 24 bits of the header; write_dest() packs the
+ * destination into the high 8 bits and writes out the full uint32.
+ */
 static void
-write_dest(write_ctx *ctx, const nir_dest *dst)
+write_dest(write_ctx *ctx, const nir_dest *dst, union packed_instr header,
+           nir_instr_type instr_type)
 {
-   uint32_t val = dst->is_ssa;
+   STATIC_ASSERT(sizeof(union packed_dest) == 1);
+   union packed_dest dest;
+   dest.u8 = 0;
+
+   dest.ssa.is_ssa = dst->is_ssa;
    if (dst->is_ssa) {
-      val |= !!(dst->ssa.name) << 1;
-      val |= dst->ssa.num_components << 2;
-      val |= dst->ssa.bit_size << 5;
+      dest.ssa.has_name = !ctx->strip && dst->ssa.name;
+      dest.ssa.num_components =
+         encode_num_components_in_3bits(dst->ssa.num_components);
+      dest.ssa.bit_size = encode_bit_size_3bits(dst->ssa.bit_size);
    } else {
-      val |= !!(dst->reg.indirect) << 1;
+      dest.reg.is_indirect = !!(dst->reg.indirect);
    }
-   blob_write_uint32(ctx->blob, val);
+
+   header.any.dest = dest.u8;
+
+   /* Check if the current ALU instruction has the same header as the previous
+    * instruction that is also ALU. If so, we don't have to write
+    * the current header. This is a typical occurrence after scalarization.
+    */
+   if (instr_type == nir_instr_type_alu) {
+      bool equal_header = false;
+
+      if (ctx->last_instr_type == nir_instr_type_alu) {
+         assert(ctx->last_alu_header_offset);
+         union packed_instr *last_header =
+            (union packed_instr *)(ctx->blob->data +
+                                   ctx->last_alu_header_offset);
+
+         /* Clear the field that counts ALUs with equal headers. 
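+          * It may already be non-zero, so it must be excluded from the
+          * equality comparison below.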
*/ + union packed_instr clean_header; + clean_header.u32 = last_header->u32; + clean_header.alu.num_followup_alu_sharing_header = 0; + + /* There can be at most 4 consecutive ALU instructions + * sharing the same header. + */ + if (last_header->alu.num_followup_alu_sharing_header < 3 && + header.u32 == clean_header.u32) { + last_header->alu.num_followup_alu_sharing_header++; + equal_header = true; + } + } + + if (!equal_header) { + ctx->last_alu_header_offset = ctx->blob->size; + blob_write_uint32(ctx->blob, header.u32); + } + } else { + blob_write_uint32(ctx->blob, header.u32); + } + + if (dest.ssa.is_ssa && + dest.ssa.num_components == NUM_COMPONENTS_IS_SEPARATE_7) + blob_write_uint32(ctx->blob, dst->ssa.num_components); + if (dst->is_ssa) { write_add_object(ctx, &dst->ssa); - if (dst->ssa.name) + if (dest.ssa.has_name) blob_write_string(ctx->blob, dst->ssa.name); } else { - blob_write_intptr(ctx->blob, write_lookup_object(ctx, dst->reg.reg)); + blob_write_uint32(ctx->blob, write_lookup_object(ctx, dst->reg.reg)); blob_write_uint32(ctx->blob, dst->reg.base_offset); if (dst->reg.indirect) write_src(ctx, dst->reg.indirect); @@ -334,197 +760,498 @@ write_dest(write_ctx *ctx, const nir_dest *dst) } static void -read_dest(read_ctx *ctx, nir_dest *dst, nir_instr *instr) -{ - uint32_t val = blob_read_uint32(ctx->blob); - bool is_ssa = val & 0x1; - if (is_ssa) { - bool has_name = val & 0x2; - unsigned num_components = (val >> 2) & 0x7; - unsigned bit_size = val >> 5; - char *name = has_name ? blob_read_string(ctx->blob) : NULL; +read_dest(read_ctx *ctx, nir_dest *dst, nir_instr *instr, + union packed_instr header) +{ + union packed_dest dest; + dest.u8 = header.any.dest; + + if (dest.ssa.is_ssa) { + unsigned bit_size = decode_bit_size_3bits(dest.ssa.bit_size); + unsigned num_components; + if (dest.ssa.num_components == NUM_COMPONENTS_IS_SEPARATE_7) + num_components = blob_read_uint32(ctx->blob); + else + num_components = decode_num_components_in_3bits(dest.ssa.num_components); + char *name = dest.ssa.has_name ? blob_read_string(ctx->blob) : NULL; nir_ssa_dest_init(instr, dst, num_components, bit_size, name); read_add_object(ctx, &dst->ssa); } else { - bool is_indirect = val & 0x2; dst->reg.reg = read_object(ctx); dst->reg.base_offset = blob_read_uint32(ctx->blob); - if (is_indirect) { + if (dest.reg.is_indirect) { dst->reg.indirect = ralloc(instr, nir_src); read_src(ctx, dst->reg.indirect, instr); } } } -static void -write_deref_chain(write_ctx *ctx, const nir_deref_var *deref_var) -{ - write_object(ctx, deref_var->var); - - uint32_t len = 0; - for (const nir_deref *d = deref_var->deref.child; d; d = d->child) - len++; - blob_write_uint32(ctx->blob, len); - - for (const nir_deref *d = deref_var->deref.child; d; d = d->child) { - blob_write_uint32(ctx->blob, d->deref_type); - switch (d->deref_type) { - case nir_deref_type_array: { - const nir_deref_array *deref_array = nir_deref_as_array(d); - blob_write_uint32(ctx->blob, deref_array->deref_array_type); - blob_write_uint32(ctx->blob, deref_array->base_offset); - if (deref_array->deref_array_type == nir_deref_array_type_indirect) - write_src(ctx, &deref_array->indirect); - break; +static bool +are_object_ids_16bit(write_ctx *ctx) +{ + /* Check the highest object ID, because they are monotonic. 
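+    * next_idx is the number of IDs handed out so far and IDs are never
+    * reused, so every ID written so far fits in 16 bits if next_idx does.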
*/ + return ctx->next_idx < (1 << 16); +} + +static bool +is_alu_src_ssa_16bit(write_ctx *ctx, const nir_alu_instr *alu) +{ + unsigned num_srcs = nir_op_infos[alu->op].num_inputs; + + for (unsigned i = 0; i < num_srcs; i++) { + if (!alu->src[i].src.is_ssa || alu->src[i].abs || alu->src[i].negate) + return false; + + unsigned src_components = nir_ssa_alu_instr_src_components(alu, i); + + for (unsigned chan = 0; chan < src_components; chan++) { + /* The swizzles for src0.x and src1.x are stored + * in writemask_or_two_swizzles for SSA ALUs. + */ + if (alu->dest.dest.is_ssa && i < 2 && chan == 0 && + alu->src[i].swizzle[chan] < 4) + continue; + + if (alu->src[i].swizzle[chan] != chan) + return false; } - case nir_deref_type_struct: { - const nir_deref_struct *deref_struct = nir_deref_as_struct(d); - blob_write_uint32(ctx->blob, deref_struct->index); - break; + } + + return are_object_ids_16bit(ctx); +} + +static void +write_alu(write_ctx *ctx, const nir_alu_instr *alu) +{ + unsigned num_srcs = nir_op_infos[alu->op].num_inputs; + unsigned dst_components = nir_dest_num_components(alu->dest.dest); + + /* 9 bits for nir_op */ + STATIC_ASSERT(nir_num_opcodes <= 512); + union packed_instr header; + header.u32 = 0; + + header.alu.instr_type = alu->instr.type; + header.alu.exact = alu->exact; + header.alu.no_signed_wrap = alu->no_signed_wrap; + header.alu.no_unsigned_wrap = alu->no_unsigned_wrap; + header.alu.saturate = alu->dest.saturate; + header.alu.op = alu->op; + header.alu.packed_src_ssa_16bit = is_alu_src_ssa_16bit(ctx, alu); + + if (header.alu.packed_src_ssa_16bit && + alu->dest.dest.is_ssa) { + /* For packed srcs of SSA ALUs, this field stores the swizzles. */ + header.alu.writemask_or_two_swizzles = alu->src[0].swizzle[0]; + if (num_srcs > 1) + header.alu.writemask_or_two_swizzles |= alu->src[1].swizzle[0] << 2; + } else if (!alu->dest.dest.is_ssa && dst_components <= 4) { + /* For vec4 registers, this field is a writemask. */ + header.alu.writemask_or_two_swizzles = alu->dest.write_mask; + } + + write_dest(ctx, &alu->dest.dest, header, alu->instr.type); + + if (!alu->dest.dest.is_ssa && dst_components > 4) + blob_write_uint32(ctx->blob, alu->dest.write_mask); + + if (header.alu.packed_src_ssa_16bit) { + for (unsigned i = 0; i < num_srcs; i++) { + assert(alu->src[i].src.is_ssa); + unsigned idx = write_lookup_object(ctx, alu->src[i].src.ssa); + assert(idx < (1 << 16)); + blob_write_uint16(ctx->blob, idx); } - case nir_deref_type_var: - unreachable("Invalid deref type"); + } else { + for (unsigned i = 0; i < num_srcs; i++) { + unsigned src_channels = nir_ssa_alu_instr_src_components(alu, i); + unsigned src_components = nir_src_num_components(alu->src[i].src); + union packed_src src; + bool packed = src_components <= 4 && src_channels <= 4; + src.u32 = 0; + + src.alu.negate = alu->src[i].negate; + src.alu.abs = alu->src[i].abs; + + if (packed) { + src.alu.swizzle_x = alu->src[i].swizzle[0]; + src.alu.swizzle_y = alu->src[i].swizzle[1]; + src.alu.swizzle_z = alu->src[i].swizzle[2]; + src.alu.swizzle_w = alu->src[i].swizzle[3]; + } + + write_src_full(ctx, &alu->src[i].src, src); + + /* Store swizzles for vec8 and vec16. 
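+          * Each swizzle takes 4 bits, so each uint32 below carries up to
+          * 8 channels; a vec16 source needs two dwords.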
*/ + if (!packed) { + for (unsigned o = 0; o < src_channels; o += 8) { + unsigned value = 0; + + for (unsigned j = 0; j < 8 && o + j < src_channels; j++) { + value |= (uint32_t)alu->src[i].swizzle[o + j] << + (4 * j); /* 4 bits per swizzle */ + } + + blob_write_uint32(ctx->blob, value); + } + } } - - encode_type_to_blob(ctx->blob, d->type); } } -static nir_deref_var * -read_deref_chain(read_ctx *ctx, void *mem_ctx) +static nir_alu_instr * +read_alu(read_ctx *ctx, union packed_instr header) { - nir_variable *var = read_object(ctx); - nir_deref_var *deref_var = nir_deref_var_create(mem_ctx, var); + unsigned num_srcs = nir_op_infos[header.alu.op].num_inputs; + nir_alu_instr *alu = nir_alu_instr_create(ctx->nir, header.alu.op); - uint32_t len = blob_read_uint32(ctx->blob); + alu->exact = header.alu.exact; + alu->no_signed_wrap = header.alu.no_signed_wrap; + alu->no_unsigned_wrap = header.alu.no_unsigned_wrap; + alu->dest.saturate = header.alu.saturate; - nir_deref *tail = &deref_var->deref; - for (uint32_t i = 0; i < len; i++) { - nir_deref_type deref_type = blob_read_uint32(ctx->blob); - nir_deref *deref = NULL; - switch (deref_type) { - case nir_deref_type_array: { - nir_deref_array *deref_array = nir_deref_array_create(tail); - deref_array->deref_array_type = blob_read_uint32(ctx->blob); - deref_array->base_offset = blob_read_uint32(ctx->blob); - if (deref_array->deref_array_type == nir_deref_array_type_indirect) - read_src(ctx, &deref_array->indirect, mem_ctx); - deref = &deref_array->deref; - break; - } - case nir_deref_type_struct: { - uint32_t index = blob_read_uint32(ctx->blob); - nir_deref_struct *deref_struct = nir_deref_struct_create(tail, index); - deref = &deref_struct->deref; - break; + read_dest(ctx, &alu->dest.dest, &alu->instr, header); + + unsigned dst_components = nir_dest_num_components(alu->dest.dest); + + if (alu->dest.dest.is_ssa) { + alu->dest.write_mask = u_bit_consecutive(0, dst_components); + } else if (dst_components <= 4) { + alu->dest.write_mask = header.alu.writemask_or_two_swizzles; + } else { + alu->dest.write_mask = blob_read_uint32(ctx->blob); + } + + if (header.alu.packed_src_ssa_16bit) { + for (unsigned i = 0; i < num_srcs; i++) { + nir_alu_src *src = &alu->src[i]; + src->src.is_ssa = true; + src->src.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob)); + + memset(&src->swizzle, 0, sizeof(src->swizzle)); + + unsigned src_components = nir_ssa_alu_instr_src_components(alu, i); + + for (unsigned chan = 0; chan < src_components; chan++) + src->swizzle[chan] = chan; } - case nir_deref_type_var: - unreachable("Invalid deref type"); + } else { + for (unsigned i = 0; i < num_srcs; i++) { + union packed_src src = read_src(ctx, &alu->src[i].src, &alu->instr); + unsigned src_channels = nir_ssa_alu_instr_src_components(alu, i); + unsigned src_components = nir_src_num_components(alu->src[i].src); + bool packed = src_components <= 4 && src_channels <= 4; + + alu->src[i].negate = src.alu.negate; + alu->src[i].abs = src.alu.abs; + + memset(&alu->src[i].swizzle, 0, sizeof(alu->src[i].swizzle)); + + if (packed) { + alu->src[i].swizzle[0] = src.alu.swizzle_x; + alu->src[i].swizzle[1] = src.alu.swizzle_y; + alu->src[i].swizzle[2] = src.alu.swizzle_z; + alu->src[i].swizzle[3] = src.alu.swizzle_w; + } else { + /* Load swizzles for vec8 and vec16. 
*/ + for (unsigned o = 0; o < src_channels; o += 8) { + unsigned value = blob_read_uint32(ctx->blob); + + for (unsigned j = 0; j < 8 && o + j < src_channels; j++) { + alu->src[i].swizzle[o + j] = + (value >> (4 * j)) & 0xf; /* 4 bits per swizzle */ + } + } + } } + } - deref->type = decode_type_from_blob(ctx->blob); - - tail->child = deref; - tail = deref; + if (header.alu.packed_src_ssa_16bit && + alu->dest.dest.is_ssa) { + alu->src[0].swizzle[0] = header.alu.writemask_or_two_swizzles & 0x3; + if (num_srcs > 1) + alu->src[1].swizzle[0] = header.alu.writemask_or_two_swizzles >> 2; } - return deref_var; + return alu; } static void -write_alu(write_ctx *ctx, const nir_alu_instr *alu) +write_deref(write_ctx *ctx, const nir_deref_instr *deref) { - blob_write_uint32(ctx->blob, alu->op); - uint32_t flags = alu->exact; - flags |= alu->dest.saturate << 1; - flags |= alu->dest.write_mask << 2; - blob_write_uint32(ctx->blob, flags); + assert(deref->deref_type < 8); + assert(deref->mode < (1 << 10)); + + union packed_instr header; + header.u32 = 0; - write_dest(ctx, &alu->dest.dest); + header.deref.instr_type = deref->instr.type; + header.deref.deref_type = deref->deref_type; - for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { - write_src(ctx, &alu->src[i].src); - flags = alu->src[i].negate; - flags |= alu->src[i].abs << 1; - for (unsigned j = 0; j < 4; j++) - flags |= alu->src[i].swizzle[j] << (2 + 2 * j); - blob_write_uint32(ctx->blob, flags); + if (deref->deref_type == nir_deref_type_cast) { + header.deref.mode = deref->mode; + header.deref.cast_type_same_as_last = deref->type == ctx->last_type; + } + + unsigned var_idx = 0; + if (deref->deref_type == nir_deref_type_var) { + var_idx = write_lookup_object(ctx, deref->var); + if (var_idx && var_idx < (1 << 16)) + header.deref_var.object_idx = var_idx; + } + + if (deref->deref_type == nir_deref_type_array || + deref->deref_type == nir_deref_type_ptr_as_array) { + header.deref.packed_src_ssa_16bit = + deref->parent.is_ssa && deref->arr.index.is_ssa && + are_object_ids_16bit(ctx); + } + + write_dest(ctx, &deref->dest, header, deref->instr.type); + + switch (deref->deref_type) { + case nir_deref_type_var: + if (!header.deref_var.object_idx) + blob_write_uint32(ctx->blob, var_idx); + break; + + case nir_deref_type_struct: + write_src(ctx, &deref->parent); + blob_write_uint32(ctx->blob, deref->strct.index); + break; + + case nir_deref_type_array: + case nir_deref_type_ptr_as_array: + if (header.deref.packed_src_ssa_16bit) { + blob_write_uint16(ctx->blob, + write_lookup_object(ctx, deref->parent.ssa)); + blob_write_uint16(ctx->blob, + write_lookup_object(ctx, deref->arr.index.ssa)); + } else { + write_src(ctx, &deref->parent); + write_src(ctx, &deref->arr.index); + } + break; + + case nir_deref_type_cast: + write_src(ctx, &deref->parent); + blob_write_uint32(ctx->blob, deref->cast.ptr_stride); + if (!header.deref.cast_type_same_as_last) { + encode_type_to_blob(ctx->blob, deref->type); + ctx->last_type = deref->type; + } + break; + + case nir_deref_type_array_wildcard: + write_src(ctx, &deref->parent); + break; + + default: + unreachable("Invalid deref type"); } } -static nir_alu_instr * -read_alu(read_ctx *ctx) +static nir_deref_instr * +read_deref(read_ctx *ctx, union packed_instr header) { - nir_op op = blob_read_uint32(ctx->blob); - nir_alu_instr *alu = nir_alu_instr_create(ctx->nir, op); + nir_deref_type deref_type = header.deref.deref_type; + nir_deref_instr *deref = nir_deref_instr_create(ctx->nir, deref_type); - uint32_t flags = 
blob_read_uint32(ctx->blob); - alu->exact = flags & 1; - alu->dest.saturate = flags & 2; - alu->dest.write_mask = flags >> 2; - - read_dest(ctx, &alu->dest.dest, &alu->instr); - - for (unsigned i = 0; i < nir_op_infos[op].num_inputs; i++) { - read_src(ctx, &alu->src[i].src, &alu->instr); - flags = blob_read_uint32(ctx->blob); - alu->src[i].negate = flags & 1; - alu->src[i].abs = flags & 2; - for (unsigned j = 0; j < 4; j++) - alu->src[i].swizzle[j] = (flags >> (2 * j + 2)) & 3; + read_dest(ctx, &deref->dest, &deref->instr, header); + + nir_deref_instr *parent; + + switch (deref->deref_type) { + case nir_deref_type_var: + if (header.deref_var.object_idx) + deref->var = read_lookup_object(ctx, header.deref_var.object_idx); + else + deref->var = read_object(ctx); + + deref->type = deref->var->type; + break; + + case nir_deref_type_struct: + read_src(ctx, &deref->parent, &deref->instr); + parent = nir_src_as_deref(deref->parent); + deref->strct.index = blob_read_uint32(ctx->blob); + deref->type = glsl_get_struct_field(parent->type, deref->strct.index); + break; + + case nir_deref_type_array: + case nir_deref_type_ptr_as_array: + if (header.deref.packed_src_ssa_16bit) { + deref->parent.is_ssa = true; + deref->parent.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob)); + deref->arr.index.is_ssa = true; + deref->arr.index.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob)); + } else { + read_src(ctx, &deref->parent, &deref->instr); + read_src(ctx, &deref->arr.index, &deref->instr); + } + + parent = nir_src_as_deref(deref->parent); + if (deref->deref_type == nir_deref_type_array) + deref->type = glsl_get_array_element(parent->type); + else + deref->type = parent->type; + break; + + case nir_deref_type_cast: + read_src(ctx, &deref->parent, &deref->instr); + deref->cast.ptr_stride = blob_read_uint32(ctx->blob); + if (header.deref.cast_type_same_as_last) { + deref->type = ctx->last_type; + } else { + deref->type = decode_type_from_blob(ctx->blob); + ctx->last_type = deref->type; + } + break; + + case nir_deref_type_array_wildcard: + read_src(ctx, &deref->parent, &deref->instr); + parent = nir_src_as_deref(deref->parent); + deref->type = glsl_get_array_element(parent->type); + break; + + default: + unreachable("Invalid deref type"); } - return alu; + if (deref_type == nir_deref_type_var) { + deref->mode = deref->var->data.mode; + } else if (deref->deref_type == nir_deref_type_cast) { + deref->mode = header.deref.mode; + } else { + assert(deref->parent.is_ssa); + deref->mode = nir_instr_as_deref(deref->parent.ssa->parent_instr)->mode; + } + + return deref; } static void write_intrinsic(write_ctx *ctx, const nir_intrinsic_instr *intrin) { - blob_write_uint32(ctx->blob, intrin->intrinsic); - - unsigned num_variables = nir_intrinsic_infos[intrin->intrinsic].num_variables; + /* 9 bits for nir_intrinsic_op */ + STATIC_ASSERT(nir_num_intrinsics <= 512); unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs; unsigned num_indices = nir_intrinsic_infos[intrin->intrinsic].num_indices; + assert(intrin->intrinsic < 512); - blob_write_uint32(ctx->blob, intrin->num_components); + union packed_instr header; + header.u32 = 0; - if (nir_intrinsic_infos[intrin->intrinsic].has_dest) - write_dest(ctx, &intrin->dest); + header.intrinsic.instr_type = intrin->instr.type; + header.intrinsic.intrinsic = intrin->intrinsic; - for (unsigned i = 0; i < num_variables; i++) - write_deref_chain(ctx, intrin->variables[i]); + /* Analyze constant indices to decide how to encode them. 
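+    * E.g. load_ubo's common (0, 0, 0) needs 0 bits and fits the combined
+    * 9-bit encoding, while a single index of 1000 (10 bits wide) selects
+    * const_indices_16bit.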
*/
+   if (num_indices) {
+      unsigned max_bits = 0;
+      for (unsigned i = 0; i < num_indices; i++) {
+         unsigned max = util_last_bit(intrin->const_index[i]);
+         max_bits = MAX2(max_bits, max);
+      }
+
+      if (max_bits * num_indices <= 9) {
+         header.intrinsic.const_indices_encoding = const_indices_9bit_all_combined;
+
+         /* Pack all const indices into 9 bits. */
+         unsigned bit_size = 9 / num_indices;
+         for (unsigned i = 0; i < num_indices; i++) {
+            header.intrinsic.packed_const_indices |=
+               intrin->const_index[i] << (i * bit_size);
+         }
+      } else if (max_bits <= 8)
+         header.intrinsic.const_indices_encoding = const_indices_8bit;
+      else if (max_bits <= 16)
+         header.intrinsic.const_indices_encoding = const_indices_16bit;
+      else
+         header.intrinsic.const_indices_encoding = const_indices_32bit;
+   }
+
+   if (nir_intrinsic_infos[intrin->intrinsic].has_dest)
+      write_dest(ctx, &intrin->dest, header, intrin->instr.type);
+   else
+      blob_write_uint32(ctx->blob, header.u32);
 
    for (unsigned i = 0; i < num_srcs; i++)
       write_src(ctx, &intrin->src[i]);
 
-   for (unsigned i = 0; i < num_indices; i++)
-      blob_write_uint32(ctx->blob, intrin->const_index[i]);
+   if (num_indices) {
+      switch (header.intrinsic.const_indices_encoding) {
+      case const_indices_8bit:
+         for (unsigned i = 0; i < num_indices; i++)
+            blob_write_uint8(ctx->blob, intrin->const_index[i]);
+         break;
+      case const_indices_16bit:
+         for (unsigned i = 0; i < num_indices; i++)
+            blob_write_uint16(ctx->blob, intrin->const_index[i]);
+         break;
+      case const_indices_32bit:
+         for (unsigned i = 0; i < num_indices; i++)
+            blob_write_uint32(ctx->blob, intrin->const_index[i]);
+         break;
+      }
+   }
 }
 
 static nir_intrinsic_instr *
-read_intrinsic(read_ctx *ctx)
+read_intrinsic(read_ctx *ctx, union packed_instr header)
 {
-   nir_intrinsic_op op = blob_read_uint32(ctx->blob);
-
+   nir_intrinsic_op op = header.intrinsic.intrinsic;
    nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(ctx->nir, op);
 
-   unsigned num_variables = nir_intrinsic_infos[op].num_variables;
    unsigned num_srcs = nir_intrinsic_infos[op].num_srcs;
    unsigned num_indices = nir_intrinsic_infos[op].num_indices;
 
-   intrin->num_components = blob_read_uint32(ctx->blob);
-
    if (nir_intrinsic_infos[op].has_dest)
-      read_dest(ctx, &intrin->dest, &intrin->instr);
-
-   for (unsigned i = 0; i < num_variables; i++)
-      intrin->variables[i] = read_deref_chain(ctx, &intrin->instr);
+      read_dest(ctx, &intrin->dest, &intrin->instr, header);
 
    for (unsigned i = 0; i < num_srcs; i++)
       read_src(ctx, &intrin->src[i], &intrin->instr);
 
-   for (unsigned i = 0; i < num_indices; i++)
-      intrin->const_index[i] = blob_read_uint32(ctx->blob);
+   /* Vectorized intrinsics get their num_components from the dest, or from
+    * whichever src has 0 components in the info. Find it. 
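+    * (loads typically take it from the dest, stores from the value src)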
+ */ + if (nir_intrinsic_infos[op].has_dest && + nir_intrinsic_infos[op].dest_components == 0) { + intrin->num_components = nir_dest_num_components(intrin->dest); + } else { + for (unsigned i = 0; i < num_srcs; i++) { + if (nir_intrinsic_infos[op].src_components[i] == 0) { + intrin->num_components = nir_src_num_components(intrin->src[i]); + break; + } + } + } + + if (num_indices) { + switch (header.intrinsic.const_indices_encoding) { + case const_indices_9bit_all_combined: { + unsigned bit_size = 9 / num_indices; + unsigned bit_mask = u_bit_consecutive(0, bit_size); + for (unsigned i = 0; i < num_indices; i++) { + intrin->const_index[i] = + (header.intrinsic.packed_const_indices >> (i * bit_size)) & + bit_mask; + } + break; + } + case const_indices_8bit: + for (unsigned i = 0; i < num_indices; i++) + intrin->const_index[i] = blob_read_uint8(ctx->blob); + break; + case const_indices_16bit: + for (unsigned i = 0; i < num_indices; i++) + intrin->const_index[i] = blob_read_uint16(ctx->blob); + break; + case const_indices_32bit: + for (unsigned i = 0; i < num_indices; i++) + intrin->const_index[i] = blob_read_uint32(ctx->blob); + break; + } + } return intrin; } @@ -532,22 +1259,155 @@ read_intrinsic(read_ctx *ctx) static void write_load_const(write_ctx *ctx, const nir_load_const_instr *lc) { - uint32_t val = lc->def.num_components; - val |= lc->def.bit_size << 3; - blob_write_uint32(ctx->blob, val); - blob_write_bytes(ctx->blob, (uint8_t *) &lc->value, sizeof(lc->value)); + assert(lc->def.num_components >= 1 && lc->def.num_components <= 16); + union packed_instr header; + header.u32 = 0; + + header.load_const.instr_type = lc->instr.type; + header.load_const.last_component = lc->def.num_components - 1; + header.load_const.bit_size = encode_bit_size_3bits(lc->def.bit_size); + header.load_const.packing = load_const_full; + + /* Try to pack 1-component constants into the 19 free bits in the header. 
*/ + if (lc->def.num_components == 1) { + switch (lc->def.bit_size) { + case 64: + if ((lc->value[0].u64 & 0x1fffffffffffull) == 0) { + /* packed_value contains high 19 bits, low bits are 0 */ + header.load_const.packing = load_const_scalar_hi_19bits; + header.load_const.packed_value = lc->value[0].u64 >> 45; + } else if (((lc->value[0].i64 << 45) >> 45) == lc->value[0].i64) { + /* packed_value contains low 19 bits, high bits are sign-extended */ + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u64; + } + break; + + case 32: + if ((lc->value[0].u32 & 0x1fff) == 0) { + header.load_const.packing = load_const_scalar_hi_19bits; + header.load_const.packed_value = lc->value[0].u32 >> 13; + } else if (((lc->value[0].i32 << 13) >> 13) == lc->value[0].i32) { + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u32; + } + break; + + case 16: + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u16; + break; + case 8: + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u8; + break; + case 1: + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].b; + break; + default: + unreachable("invalid bit_size"); + } + } + + blob_write_uint32(ctx->blob, header.u32); + + if (header.load_const.packing == load_const_full) { + switch (lc->def.bit_size) { + case 64: + blob_write_bytes(ctx->blob, lc->value, + sizeof(*lc->value) * lc->def.num_components); + break; + + case 32: + for (unsigned i = 0; i < lc->def.num_components; i++) + blob_write_uint32(ctx->blob, lc->value[i].u32); + break; + + case 16: + for (unsigned i = 0; i < lc->def.num_components; i++) + blob_write_uint16(ctx->blob, lc->value[i].u16); + break; + + default: + assert(lc->def.bit_size <= 8); + for (unsigned i = 0; i < lc->def.num_components; i++) + blob_write_uint8(ctx->blob, lc->value[i].u8); + break; + } + } + write_add_object(ctx, &lc->def); } static nir_load_const_instr * -read_load_const(read_ctx *ctx) +read_load_const(read_ctx *ctx, union packed_instr header) { - uint32_t val = blob_read_uint32(ctx->blob); - nir_load_const_instr *lc = - nir_load_const_instr_create(ctx->nir, val & 0x7, val >> 3); + nir_load_const_instr_create(ctx->nir, header.load_const.last_component + 1, + decode_bit_size_3bits(header.load_const.bit_size)); + + switch (header.load_const.packing) { + case load_const_scalar_hi_19bits: + switch (lc->def.bit_size) { + case 64: + lc->value[0].u64 = (uint64_t)header.load_const.packed_value << 45; + break; + case 32: + lc->value[0].u32 = (uint64_t)header.load_const.packed_value << 13; + break; + default: + unreachable("invalid bit_size"); + } + break; + + case load_const_scalar_lo_19bits_sext: + switch (lc->def.bit_size) { + case 64: + lc->value[0].i64 = ((int64_t)header.load_const.packed_value << 45) >> 45; + break; + case 32: + lc->value[0].i32 = ((int32_t)header.load_const.packed_value << 13) >> 13; + break; + case 16: + lc->value[0].u16 = header.load_const.packed_value; + break; + case 8: + lc->value[0].u8 = header.load_const.packed_value; + break; + case 1: + lc->value[0].b = header.load_const.packed_value; + break; + default: + unreachable("invalid bit_size"); + } + break; + + case load_const_full: + switch (lc->def.bit_size) { + case 64: + blob_copy_bytes(ctx->blob, lc->value, sizeof(*lc->value) * lc->def.num_components); + break; + 
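+      /* The 64-bit case above was copied in bulk; the narrower sizes
+       * round-trip element-wise, mirroring write_load_const. */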
+ case 32: + for (unsigned i = 0; i < lc->def.num_components; i++) + lc->value[i].u32 = blob_read_uint32(ctx->blob); + break; + + case 16: + for (unsigned i = 0; i < lc->def.num_components; i++) + lc->value[i].u16 = blob_read_uint16(ctx->blob); + break; + + default: + assert(lc->def.bit_size <= 8); + for (unsigned i = 0; i < lc->def.num_components; i++) + lc->value[i].u8 = blob_read_uint8(ctx->blob); + break; + } + break; + } - blob_copy_bytes(ctx->blob, (uint8_t *) &lc->value, sizeof(lc->value)); read_add_object(ctx, &lc->def); return lc; } @@ -555,19 +1415,25 @@ read_load_const(read_ctx *ctx) static void write_ssa_undef(write_ctx *ctx, const nir_ssa_undef_instr *undef) { - uint32_t val = undef->def.num_components; - val |= undef->def.bit_size << 3; - blob_write_uint32(ctx->blob, val); + assert(undef->def.num_components >= 1 && undef->def.num_components <= 16); + + union packed_instr header; + header.u32 = 0; + + header.undef.instr_type = undef->instr.type; + header.undef.last_component = undef->def.num_components - 1; + header.undef.bit_size = encode_bit_size_3bits(undef->def.bit_size); + + blob_write_uint32(ctx->blob, header.u32); write_add_object(ctx, &undef->def); } static nir_ssa_undef_instr * -read_ssa_undef(read_ctx *ctx) +read_ssa_undef(read_ctx *ctx, union packed_instr header) { - uint32_t val = blob_read_uint32(ctx->blob); - nir_ssa_undef_instr *undef = - nir_ssa_undef_instr_create(ctx->nir, val & 0x7, val >> 3); + nir_ssa_undef_instr_create(ctx->nir, header.undef.last_component + 1, + decode_bit_size_3bits(header.undef.bit_size)); read_add_object(ctx, &undef->def); return undef; @@ -577,25 +1443,37 @@ union packed_tex_data { uint32_t u32; struct { enum glsl_sampler_dim sampler_dim:4; - nir_alu_type dest_type:8; + unsigned dest_type:8; unsigned coord_components:3; unsigned is_array:1; unsigned is_shadow:1; unsigned is_new_style_shadow:1; unsigned component:2; - unsigned has_texture_deref:1; - unsigned has_sampler_deref:1; + unsigned texture_non_uniform:1; + unsigned sampler_non_uniform:1; + unsigned unused:8; /* Mark unused for valgrind. 
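+       * Naming the padding keeps the whole u32 zero-initialized, so the
+       * value written to the blob is fully defined.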
*/ } u; }; static void write_tex(write_ctx *ctx, const nir_tex_instr *tex) { - blob_write_uint32(ctx->blob, tex->num_srcs); - blob_write_uint32(ctx->blob, tex->op); + assert(tex->num_srcs < 16); + assert(tex->op < 16); + + union packed_instr header; + header.u32 = 0; + + header.tex.instr_type = tex->instr.type; + header.tex.num_srcs = tex->num_srcs; + header.tex.op = tex->op; + + write_dest(ctx, &tex->dest, header, tex->instr.type); + blob_write_uint32(ctx->blob, tex->texture_index); - blob_write_uint32(ctx->blob, tex->texture_array_size); blob_write_uint32(ctx->blob, tex->sampler_index); + if (tex->op == nir_texop_tg4) + blob_write_bytes(ctx->blob, tex->tg4_offsets, sizeof(tex->tg4_offsets)); STATIC_ASSERT(sizeof(union packed_tex_data) == sizeof(uint32_t)); union packed_tex_data packed = { @@ -606,33 +1484,31 @@ write_tex(write_ctx *ctx, const nir_tex_instr *tex) .u.is_shadow = tex->is_shadow, .u.is_new_style_shadow = tex->is_new_style_shadow, .u.component = tex->component, - .u.has_texture_deref = tex->texture != NULL, - .u.has_sampler_deref = tex->sampler != NULL, + .u.texture_non_uniform = tex->texture_non_uniform, + .u.sampler_non_uniform = tex->sampler_non_uniform, }; blob_write_uint32(ctx->blob, packed.u32); - write_dest(ctx, &tex->dest); for (unsigned i = 0; i < tex->num_srcs; i++) { - blob_write_uint32(ctx->blob, tex->src[i].src_type); - write_src(ctx, &tex->src[i].src); + union packed_src src; + src.u32 = 0; + src.tex.src_type = tex->src[i].src_type; + write_src_full(ctx, &tex->src[i].src, src); } - - if (tex->texture) - write_deref_chain(ctx, tex->texture); - if (tex->sampler) - write_deref_chain(ctx, tex->sampler); } static nir_tex_instr * -read_tex(read_ctx *ctx) +read_tex(read_ctx *ctx, union packed_instr header) { - unsigned num_srcs = blob_read_uint32(ctx->blob); - nir_tex_instr *tex = nir_tex_instr_create(ctx->nir, num_srcs); + nir_tex_instr *tex = nir_tex_instr_create(ctx->nir, header.tex.num_srcs); - tex->op = blob_read_uint32(ctx->blob); + read_dest(ctx, &tex->dest, &tex->instr, header); + + tex->op = header.tex.op; tex->texture_index = blob_read_uint32(ctx->blob); - tex->texture_array_size = blob_read_uint32(ctx->blob); tex->sampler_index = blob_read_uint32(ctx->blob); + if (tex->op == nir_texop_tg4) + blob_copy_bytes(ctx->blob, tex->tg4_offsets, sizeof(tex->tg4_offsets)); union packed_tex_data packed; packed.u32 = blob_read_uint32(ctx->blob); @@ -643,38 +1519,38 @@ read_tex(read_ctx *ctx) tex->is_shadow = packed.u.is_shadow; tex->is_new_style_shadow = packed.u.is_new_style_shadow; tex->component = packed.u.component; + tex->texture_non_uniform = packed.u.texture_non_uniform; + tex->sampler_non_uniform = packed.u.sampler_non_uniform; - read_dest(ctx, &tex->dest, &tex->instr); for (unsigned i = 0; i < tex->num_srcs; i++) { - tex->src[i].src_type = blob_read_uint32(ctx->blob); - read_src(ctx, &tex->src[i].src, &tex->instr); + union packed_src src = read_src(ctx, &tex->src[i].src, &tex->instr); + tex->src[i].src_type = src.tex.src_type; } - tex->texture = packed.u.has_texture_deref ? - read_deref_chain(ctx, &tex->instr) : NULL; - tex->sampler = packed.u.has_sampler_deref ? - read_deref_chain(ctx, &tex->instr) : NULL; - return tex; } static void write_phi(write_ctx *ctx, const nir_phi_instr *phi) { + union packed_instr header; + header.u32 = 0; + + header.phi.instr_type = phi->instr.type; + header.phi.num_srcs = exec_list_length(&phi->srcs); + /* Phi nodes are special, since they may reference SSA definitions and - * basic blocks that don't exist yet. 
We leave two empty uintptr_t's here, + * basic blocks that don't exist yet. We leave two empty uint32_t's here, * and then store enough information so that a later fixup pass can fill * them in correctly. */ - write_dest(ctx, &phi->dest); - - blob_write_uint32(ctx->blob, exec_list_length(&phi->srcs)); + write_dest(ctx, &phi->dest, header, phi->instr.type); nir_foreach_phi_src(src, phi) { assert(src->src.is_ssa); - size_t blob_offset = blob_reserve_intptr(ctx->blob); - MAYBE_UNUSED size_t blob_offset2 = blob_reserve_intptr(ctx->blob); - assert(blob_offset + sizeof(uintptr_t) == blob_offset2); + size_t blob_offset = blob_reserve_uint32(ctx->blob); + ASSERTED size_t blob_offset2 = blob_reserve_uint32(ctx->blob); + assert(blob_offset + sizeof(uint32_t) == blob_offset2); write_phi_fixup fixup = { .blob_offset = blob_offset, .src = src->src.ssa, @@ -688,7 +1564,7 @@ static void write_fixup_phis(write_ctx *ctx) { util_dynarray_foreach(&ctx->phi_fixups, write_phi_fixup, fixup) { - uintptr_t *blob_ptr = (uintptr_t *)(ctx->blob->data + fixup->blob_offset); + uint32_t *blob_ptr = (uint32_t *)(ctx->blob->data + fixup->blob_offset); blob_ptr[0] = write_lookup_object(ctx, fixup->src); blob_ptr[1] = write_lookup_object(ctx, fixup->block); } @@ -697,13 +1573,11 @@ write_fixup_phis(write_ctx *ctx) } static nir_phi_instr * -read_phi(read_ctx *ctx, nir_block *blk) +read_phi(read_ctx *ctx, nir_block *blk, union packed_instr header) { nir_phi_instr *phi = nir_phi_instr_create(ctx->nir); - read_dest(ctx, &phi->dest, &phi->instr); - - unsigned num_srcs = blob_read_uint32(ctx->blob); + read_dest(ctx, &phi->dest, &phi->instr, header); /* For similar reasons as before, we just store the index directly into the * pointer, and let a later pass resolve the phi sources. @@ -715,12 +1589,12 @@ read_phi(read_ctx *ctx, nir_block *blk) */ nir_instr_insert_after_block(blk, &phi->instr); - for (unsigned i = 0; i < num_srcs; i++) { + for (unsigned i = 0; i < header.phi.num_srcs; i++) { nir_phi_src *src = ralloc(phi, nir_phi_src); src->src.is_ssa = true; - src->src.ssa = (nir_ssa_def *) blob_read_intptr(ctx->blob); - src->pred = (nir_block *) blob_read_intptr(ctx->blob); + src->src.ssa = (nir_ssa_def *)(uintptr_t) blob_read_uint32(ctx->blob); + src->pred = (nir_block *)(uintptr_t) blob_read_uint32(ctx->blob); /* Since we're not letting nir_insert_instr handle use/def stuff for us, * we have to set the parent_instr manually. 
It doesn't really matter @@ -751,32 +1625,37 @@ read_fixup_phis(read_ctx *ctx) list_addtail(&src->src.use_link, &src->src.ssa->uses); } - assert(list_empty(&ctx->phi_srcs)); + assert(list_is_empty(&ctx->phi_srcs)); } static void write_jump(write_ctx *ctx, const nir_jump_instr *jmp) { - blob_write_uint32(ctx->blob, jmp->type); + assert(jmp->type < 4); + + union packed_instr header; + header.u32 = 0; + + header.jump.instr_type = jmp->instr.type; + header.jump.type = jmp->type; + + blob_write_uint32(ctx->blob, header.u32); } static nir_jump_instr * -read_jump(read_ctx *ctx) +read_jump(read_ctx *ctx, union packed_instr header) { - nir_jump_type type = blob_read_uint32(ctx->blob); - nir_jump_instr *jmp = nir_jump_instr_create(ctx->nir, type); + nir_jump_instr *jmp = nir_jump_instr_create(ctx->nir, header.jump.type); return jmp; } static void write_call(write_ctx *ctx, const nir_call_instr *call) { - blob_write_intptr(ctx->blob, write_lookup_object(ctx, call->callee)); + blob_write_uint32(ctx->blob, write_lookup_object(ctx, call->callee)); for (unsigned i = 0; i < call->num_params; i++) - write_deref_chain(ctx, call->params[i]); - - write_deref_chain(ctx, call->return_deref); + write_src(ctx, &call->params[i]); } static nir_call_instr * @@ -786,9 +1665,7 @@ read_call(read_ctx *ctx) nir_call_instr *call = nir_call_instr_create(ctx->nir, callee); for (unsigned i = 0; i < call->num_params; i++) - call->params[i] = read_deref_chain(ctx, &call->instr); - - call->return_deref = read_deref_chain(ctx, &call->instr); + read_src(ctx, &call->params[i], call); return call; } @@ -796,11 +1673,16 @@ read_call(read_ctx *ctx) static void write_instr(write_ctx *ctx, const nir_instr *instr) { - blob_write_uint32(ctx->blob, instr->type); + /* We have only 4 bits for the instruction type. */ + assert(instr->type < 16); + switch (instr->type) { case nir_instr_type_alu: write_alu(ctx, nir_instr_as_alu(instr)); break; + case nir_instr_type_deref: + write_deref(ctx, nir_instr_as_deref(instr)); + break; case nir_instr_type_intrinsic: write_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break; @@ -820,6 +1702,7 @@ write_instr(write_ctx *ctx, const nir_instr *instr) write_jump(ctx, nir_instr_as_jump(instr)); break; case nir_instr_type_call: + blob_write_uint32(ctx->blob, instr->type); write_call(ctx, nir_instr_as_call(instr)); break; case nir_instr_type_parallel_copy: @@ -829,26 +1712,34 @@ write_instr(write_ctx *ctx, const nir_instr *instr) } } -static void +/* Return the number of instructions read. 
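+ * This exceeds 1 only when several scalarized ALU instructions shared a
+ * single header (see num_followup_alu_sharing_header).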
*/ +static unsigned read_instr(read_ctx *ctx, nir_block *block) { - nir_instr_type type = blob_read_uint32(ctx->blob); + STATIC_ASSERT(sizeof(union packed_instr) == 4); + union packed_instr header; + header.u32 = blob_read_uint32(ctx->blob); nir_instr *instr; - switch (type) { + + switch (header.any.instr_type) { case nir_instr_type_alu: - instr = &read_alu(ctx)->instr; + for (unsigned i = 0; i <= header.alu.num_followup_alu_sharing_header; i++) + nir_instr_insert_after_block(block, &read_alu(ctx, header)->instr); + return header.alu.num_followup_alu_sharing_header + 1; + case nir_instr_type_deref: + instr = &read_deref(ctx, header)->instr; break; case nir_instr_type_intrinsic: - instr = &read_intrinsic(ctx)->instr; + instr = &read_intrinsic(ctx, header)->instr; break; case nir_instr_type_load_const: - instr = &read_load_const(ctx)->instr; + instr = &read_load_const(ctx, header)->instr; break; case nir_instr_type_ssa_undef: - instr = &read_ssa_undef(ctx)->instr; + instr = &read_ssa_undef(ctx, header)->instr; break; case nir_instr_type_tex: - instr = &read_tex(ctx)->instr; + instr = &read_tex(ctx, header)->instr; break; case nir_instr_type_phi: /* Phi instructions are a bit of a special case when reading because we @@ -856,10 +1747,10 @@ read_instr(read_ctx *ctx, nir_block *block) * for us. Instead, we need to wait until all the blocks/instructions * are read so that we can set their sources up. */ - read_phi(ctx, block); - return; + read_phi(ctx, block, header); + return 1; case nir_instr_type_jump: - instr = &read_jump(ctx)->instr; + instr = &read_jump(ctx, header)->instr; break; case nir_instr_type_call: instr = &read_call(ctx)->instr; @@ -871,6 +1762,7 @@ read_instr(read_ctx *ctx, nir_block *block) } nir_instr_insert_after_block(block, instr); + return 1; } static void @@ -878,8 +1770,14 @@ write_block(write_ctx *ctx, const nir_block *block) { write_add_object(ctx, block); blob_write_uint32(ctx->blob, exec_list_length(&block->instr_list)); - nir_foreach_instr(instr, block) + + ctx->last_instr_type = ~0; + ctx->last_alu_header_offset = 0; + + nir_foreach_instr(instr, block) { write_instr(ctx, instr); + ctx->last_instr_type = instr->type; + } } static void @@ -894,8 +1792,8 @@ read_block(read_ctx *ctx, struct exec_list *cf_list) read_add_object(ctx, block); unsigned num_instrs = blob_read_uint32(ctx->blob); - for (unsigned i = 0; i < num_instrs; i++) { - read_instr(ctx, block); + for (unsigned i = 0; i < num_instrs;) { + i += read_instr(ctx, block); } } @@ -1007,15 +1905,6 @@ write_function_impl(write_ctx *ctx, const nir_function_impl *fi) write_reg_list(ctx, &fi->registers); blob_write_uint32(ctx->blob, fi->reg_alloc); - blob_write_uint32(ctx->blob, fi->num_params); - for (unsigned i = 0; i < fi->num_params; i++) { - write_variable(ctx, fi->params[i]); - } - - blob_write_uint32(ctx->blob, !!(fi->return_var)); - if (fi->return_var) - write_variable(ctx, fi->return_var); - write_cf_list(ctx, &fi->body); write_fixup_phis(ctx); } @@ -1030,17 +1919,6 @@ read_function_impl(read_ctx *ctx, nir_function *fxn) read_reg_list(ctx, &fi->registers); fi->reg_alloc = blob_read_uint32(ctx->blob); - fi->num_params = blob_read_uint32(ctx->blob); - for (unsigned i = 0; i < fi->num_params; i++) { - fi->params[i] = read_variable(ctx); - } - - bool has_return = blob_read_uint32(ctx->blob); - if (has_return) - fi->return_var = read_variable(ctx); - else - fi->return_var = NULL; - read_cf_list(ctx, &fi->body); read_fixup_phis(ctx); @@ -1052,7 +1930,12 @@ read_function_impl(read_ctx *ctx, nir_function *fxn) 
 static void
 write_function(write_ctx *ctx, const nir_function *fxn)
 {
-   blob_write_uint32(ctx->blob, !!(fxn->name));
+   uint32_t flags = fxn->is_entrypoint;
+   if (fxn->name)
+      flags |= 0x2;
+   if (fxn->impl)
+      flags |= 0x4;
+   blob_write_uint32(ctx->blob, flags);
    if (fxn->name)
       blob_write_string(ctx->blob, fxn->name);
 
@@ -1060,12 +1943,12 @@ write_function(write_ctx *ctx, const nir_function *fxn)
    blob_write_uint32(ctx->blob, fxn->num_params);
 
    for (unsigned i = 0; i < fxn->num_params; i++) {
-      blob_write_uint32(ctx->blob, fxn->params[i].param_type);
-      encode_type_to_blob(ctx->blob, fxn->params[i].type);
+      uint32_t val =
+         ((uint32_t)fxn->params[i].num_components) |
+         ((uint32_t)fxn->params[i].bit_size) << 8;
+      blob_write_uint32(ctx->blob, val);
    }
 
-   encode_type_to_blob(ctx->blob, fxn->return_type);
-
    /* At first glance, it looks like we should write the function_impl here.
     * However, call instructions need to be able to reference at least the
     * function and those will get processed as we write the function_impls.
@@ -1076,7 +1959,8 @@ static void
 read_function(read_ctx *ctx)
 {
-   bool has_name = blob_read_uint32(ctx->blob);
+   uint32_t flags = blob_read_uint32(ctx->blob);
+   bool has_name = flags & 0x2;
    char *name = has_name ? blob_read_string(ctx->blob) : NULL;
 
    nir_function *fxn = nir_function_create(ctx->nir, name);
 
@@ -1084,37 +1968,47 @@ read_function(read_ctx *ctx)
    read_add_object(ctx, fxn);
 
    fxn->num_params = blob_read_uint32(ctx->blob);
+   fxn->params = ralloc_array(fxn, nir_parameter, fxn->num_params);
    for (unsigned i = 0; i < fxn->num_params; i++) {
-      fxn->params[i].param_type = blob_read_uint32(ctx->blob);
-      fxn->params[i].type = decode_type_from_blob(ctx->blob);
+      uint32_t val = blob_read_uint32(ctx->blob);
+      fxn->params[i].num_components = val & 0xff;
+      fxn->params[i].bit_size = (val >> 8) & 0xff;
    }
 
-   fxn->return_type = decode_type_from_blob(ctx->blob);
+   fxn->is_entrypoint = flags & 0x1;
+   if (flags & 0x4)
+      fxn->impl = NIR_SERIALIZE_FUNC_HAS_IMPL;
 }
 
+/**
+ * Serialize NIR into a binary blob.
+ *
+ * \param strip  Don't serialize information only useful for debugging,
+ *               such as variable names, making cache hits from similar
+ *               shaders more likely.
+ */
 void
-nir_serialize(struct blob *blob, const nir_shader *nir)
+nir_serialize(struct blob *blob, const nir_shader *nir, bool strip)
 {
-   write_ctx ctx;
-   ctx.remap_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
-                                             _mesa_key_pointer_equal);
-   ctx.next_idx = 0;
+   write_ctx ctx = {0};
+   ctx.remap_table = _mesa_pointer_hash_table_create(NULL);
    ctx.blob = blob;
    ctx.nir = nir;
+   ctx.strip = strip;
    util_dynarray_init(&ctx.phi_fixups, NULL);
 
-   size_t idx_size_offset = blob_reserve_intptr(blob);
+   size_t idx_size_offset = blob_reserve_uint32(blob);
 
    struct shader_info info = nir->info;
    uint32_t strings = 0;
-   if (info.name)
+   if (!strip && info.name)
       strings |= 0x1;
-   if (info.label)
+   if (!strip && info.label)
       strings |= 0x2;
    blob_write_uint32(blob, strings);
-   if (info.name)
+   if (!strip && info.name)
       blob_write_string(blob, info.name);
-   if (info.label)
+   if (!strip && info.label)
      blob_write_string(blob, info.label);
    info.name = info.label = NULL;
    blob_write_bytes(blob, (uint8_t *) &info, sizeof(info));
 
@@ -1126,12 +2020,11 @@ nir_serialize(struct blob *blob, const nir_shader *nir)
    write_var_list(&ctx, &nir->globals);
    write_var_list(&ctx, &nir->system_values);
 
-   write_reg_list(&ctx, &nir->registers);
-   blob_write_uint32(blob, nir->reg_alloc);
    blob_write_uint32(blob, nir->num_inputs);
    blob_write_uint32(blob, nir->num_uniforms);
    blob_write_uint32(blob, nir->num_outputs);
    blob_write_uint32(blob, nir->num_shared);
+   blob_write_uint32(blob, nir->scratch_size);
 
    blob_write_uint32(blob, exec_list_length(&nir->functions));
    nir_foreach_function(fxn, nir) {
@@ -1139,10 +2032,15 @@ nir_serialize(struct blob *blob, const nir_shader *nir)
    }
 
    nir_foreach_function(fxn, nir) {
-      write_function_impl(&ctx, fxn->impl);
+      if (fxn->impl)
+         write_function_impl(&ctx, fxn->impl);
    }
 
-   *(uintptr_t *)(blob->data + idx_size_offset) = ctx.next_idx;
+   blob_write_uint32(blob, nir->constant_data_size);
+   if (nir->constant_data_size > 0)
+      blob_write_bytes(blob, nir->constant_data, nir->constant_data_size);
+
+   *(uint32_t *)(blob->data + idx_size_offset) = ctx.next_idx;
 
    _mesa_hash_table_destroy(ctx.remap_table, NULL);
    util_dynarray_fini(&ctx.phi_fixups);
@@ -1153,12 +2051,11 @@ nir_deserialize(void *mem_ctx,
                 const struct nir_shader_compiler_options *options,
                 struct blob_reader *blob)
 {
-   read_ctx ctx;
+   read_ctx ctx = {0};
    ctx.blob = blob;
    list_inithead(&ctx.phi_srcs);
-   ctx.idx_table_len = blob_read_intptr(blob);
+   ctx.idx_table_len = blob_read_uint32(blob);
    ctx.idx_table = calloc(ctx.idx_table_len, sizeof(uintptr_t));
-   ctx.next_idx = 0;
 
    uint32_t strings = blob_read_uint32(blob);
    char *name = (strings & 0x1) ? blob_read_string(blob) : NULL;
@@ -1181,40 +2078,56 @@ nir_deserialize(void *mem_ctx,
    read_var_list(&ctx, &ctx.nir->globals);
    read_var_list(&ctx, &ctx.nir->system_values);
 
-   read_reg_list(&ctx, &ctx.nir->registers);
-   ctx.nir->reg_alloc = blob_read_uint32(blob);
    ctx.nir->num_inputs = blob_read_uint32(blob);
    ctx.nir->num_uniforms = blob_read_uint32(blob);
    ctx.nir->num_outputs = blob_read_uint32(blob);
    ctx.nir->num_shared = blob_read_uint32(blob);
+   ctx.nir->scratch_size = blob_read_uint32(blob);
 
    unsigned num_functions = blob_read_uint32(blob);
    for (unsigned i = 0; i < num_functions; i++)
       read_function(&ctx);
 
-   nir_foreach_function(fxn, ctx.nir)
-      fxn->impl = read_function_impl(&ctx, fxn);
+   nir_foreach_function(fxn, ctx.nir) {
+      if (fxn->impl == NIR_SERIALIZE_FUNC_HAS_IMPL)
+         fxn->impl = read_function_impl(&ctx, fxn);
+   }
+
+   ctx.nir->constant_data_size = blob_read_uint32(blob);
+   if (ctx.nir->constant_data_size > 0) {
+      ctx.nir->constant_data =
+         ralloc_size(ctx.nir, ctx.nir->constant_data_size);
+      blob_copy_bytes(blob, ctx.nir->constant_data,
+                      ctx.nir->constant_data_size);
+   }
 
    free(ctx.idx_table);
 
    return ctx.nir;
 }
 
-nir_shader *
-nir_shader_serialize_deserialize(void *mem_ctx, nir_shader *s)
+void
+nir_shader_serialize_deserialize(nir_shader *shader)
 {
-   const struct nir_shader_compiler_options *options = s->options;
+   const struct nir_shader_compiler_options *options = shader->options;
 
    struct blob writer;
    blob_init(&writer);
-   nir_serialize(&writer, s);
-   ralloc_free(s);
+   nir_serialize(&writer, shader, false);
+
+   /* Delete all of the shader's ralloc children, but leave the nir_shader
+    * itself alone: callers still hold a pointer to it. */
+   void *dead_ctx = ralloc_context(NULL);
+   ralloc_adopt(dead_ctx, shader);
+   ralloc_free(dead_ctx);
+
+   dead_ctx = ralloc_context(NULL);
 
    struct blob_reader reader;
    blob_reader_init(&reader, writer.data, writer.size);
-   nir_shader *ns = nir_deserialize(mem_ctx, options, &reader);
+   nir_shader *copy = nir_deserialize(dead_ctx, options, &reader);
   blob_finish(&writer);
 
-   return ns;
+   nir_shader_replace(shader, copy);
+   ralloc_free(dead_ctx);
 }
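
The packed headers in this diff reward a closer look. write_jump() and read_jump() collapse what used to be one uint32 per field into a single 32-bit header; the asserts (instr->type < 16, jmp->type < 4) guard the bitfield widths. Below is a minimal stand-alone sketch of that layout. The field names any.instr_type and jump.type come from the diff itself; the exact widths beyond the 4-bit instruction type, and everything else about the mock union, are assumptions, since the real union packed_instr has many more variants than shown here.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mock of the packed header: 4 bits identify the instruction kind and
 * the jump variant keeps its jump type in the next bits, matching the
 * asserts in write_instr() and write_jump() above. */
union mock_packed_instr {
   uint32_t u32;
   struct {
      unsigned instr_type:4;
   } any;
   struct {
      unsigned instr_type:4;
      unsigned type:2;
      unsigned _pad:26;
   } jump;
};

int main(void)
{
   union mock_packed_instr header;
   header.u32 = 0;
   header.jump.instr_type = 7; /* stand-in for nir_instr_type_jump */
   header.jump.type = 1;       /* stand-in for a nir_jump_type value */

   /* The writer emits header.u32 with a single blob_write_uint32();
    * the reader dispatches on header.any.instr_type and then pulls the
    * variant-specific fields back out of the same 32 bits. */
   union mock_packed_instr decoded;
   decoded.u32 = header.u32;
   assert(decoded.any.instr_type == 7);
   assert(decoded.jump.type == 1);
   printf("packed jump header: 0x%08x\n", (unsigned)header.u32);
   return 0;
}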
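read_instr() returns a count because one ALU header can now stand for a whole run of ALU instructions (typical after scalarization): the header's num_followup_alu_sharing_header field says how many extra instructions reuse it, and read_block() advances by the returned total instead of by one. A toy consumer of the same shape, with mock data standing in for the real blob format:

#include <assert.h>
#include <stdio.h>

/* Each mock header records how many follow-up ALU instructions share
 * it; 0 means the header covers just one instruction. */
static const unsigned followups[] = {2, 0, 1}; /* 3 + 1 + 2 = 6 instrs */

static unsigned
mock_read_instr(unsigned header_idx)
{
   /* one header materializes followups + 1 instructions */
   return followups[header_idx] + 1;
}

int main(void)
{
   const unsigned num_instrs = 6;
   unsigned header_idx = 0;

   /* Mirrors the read_block() loop above: i advances by however many
    * instructions the current header expanded to. */
   for (unsigned i = 0; i < num_instrs;) {
      i += mock_read_instr(header_idx++);
      printf("instructions materialized so far: %u\n", i);
   }
   assert(header_idx == 3);
   return 0;
}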
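write_function() and read_function() likewise pack three booleans (is_entrypoint, has-name, has-impl) into one flag word, and each parameter into num_components | bit_size << 8. A round-trip check of exactly that packing, with plain structs standing in for the NIR types:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct mock_param {
   uint8_t num_components;
   uint8_t bit_size;
};

static uint32_t
pack_function_flags(bool is_entrypoint, bool has_name, bool has_impl)
{
   uint32_t flags = is_entrypoint; /* bit 0 */
   if (has_name)
      flags |= 0x2;                /* bit 1 */
   if (has_impl)
      flags |= 0x4;                /* bit 2 */
   return flags;
}

int main(void)
{
   uint32_t flags = pack_function_flags(true, false, true);
   assert((flags & 0x1) && !(flags & 0x2) && (flags & 0x4));

   /* parameter packing: low byte num_components, next byte bit_size */
   struct mock_param p = { .num_components = 4, .bit_size = 32 };
   uint32_t val = (uint32_t)p.num_components | (uint32_t)p.bit_size << 8;

   struct mock_param q;
   q.num_components = val & 0xff;
   q.bit_size = (val >> 8) & 0xff;
   assert(q.num_components == p.num_components);
   assert(q.bit_size == p.bit_size);
   return 0;
}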
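nir_serialize() cannot know the final object count when it starts writing, so it reserves four bytes with blob_reserve_uint32() and patches them through blob->data once ctx.next_idx is final. The same reserve-then-backpatch pattern over a bare byte buffer; a fixed array replaces struct blob here for brevity, so this is a sketch of the pattern rather than the blob API:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint8_t data[64];
static size_t size;

static size_t
reserve_uint32(void)
{
   size_t offset = size;
   size += sizeof(uint32_t); /* leave a 4-byte hole to patch later */
   return offset;
}

static void
write_uint32(uint32_t v)
{
   memcpy(data + size, &v, sizeof(v));
   size += sizeof(v);
}

int main(void)
{
   size_t count_offset = reserve_uint32();

   uint32_t next_idx = 0;
   for (uint32_t i = 0; i < 5; i++) {
      write_uint32(i * i);
      next_idx++;
   }

   /* backpatch the reserved slot, like the ctx.next_idx store above */
   memcpy(data + count_offset, &next_idx, sizeof(next_idx));

   uint32_t count;
   memcpy(&count, data + count_offset, sizeof(count));
   assert(count == 5);
   return 0;
}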
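One intended payoff of the new strip parameter is cache-key computation: with names and labels dropped, shaders that differ only in debug strings serialize to identical bytes. A hypothetical caller along those lines; hash_shader_for_cache is not a real Mesa function, while blob_init/blob_finish and _mesa_sha1_compute are the existing util/ helpers:

#include "nir_serialize.h"
#include "util/blob.h"
#include "util/mesa-sha1.h"

/* Hypothetical helper: serialize with strip=true so the resulting
 * bytes, and therefore the cache key, ignore debug-only strings. */
static void
hash_shader_for_cache(const nir_shader *nir, unsigned char sha1[20])
{
   struct blob blob;
   blob_init(&blob);
   nir_serialize(&blob, nir, true /* strip */);
   _mesa_sha1_compute(blob.data, blob.size, sha1);
   blob_finish(&blob);
}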
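Finally, because nir_shader_serialize_deserialize() now rebuilds the shader in place (ralloc_adopt() hands every child of the old shader to a throwaway context before nir_shader_replace() installs the deserialized copy), every pointer into the old IR is dangling once it returns. That makes it usable as a self-test that nothing holds stale IR pointers. The wrapper below is a hypothetical debug hook, not existing Mesa code; nir_validate_shader() is the stock validator:

#include "nir.h"
#include "nir_serialize.h"

/* Hypothetical debug helper: round-trip the shader through the blob
 * format after a pass, then validate.  Code that cached pointers into
 * the pre-round-trip IR will crash or fail validation immediately. */
static void
serialize_round_trip_check(nir_shader *shader)
{
   nir_shader_serialize_deserialize(shader);
   nir_validate_shader(shader, "after serialize/deserialize round trip");
}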