X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fcompiler%2Fnir%2Fnir_serialize.c;h=6688e9e8b172260ed05986b4f7052238774601e6;hb=111b0a669979cf277f31c69f501982fee004e067;hp=ee783284467c5769de0e2f37798a0825dad8ee9a;hpb=c8314678ee95b8eff211cbdd2086e3197a7ef696;p=mesa.git diff --git a/src/compiler/nir/nir_serialize.c b/src/compiler/nir/nir_serialize.c index ee783284467..6688e9e8b17 100644 --- a/src/compiler/nir/nir_serialize.c +++ b/src/compiler/nir/nir_serialize.c @@ -54,6 +54,11 @@ typedef struct { /* The last serialized type. */ const struct glsl_type *last_type; const struct glsl_type *last_interface_type; + struct nir_variable_data last_var_data; + + /* For skipping equal ALU headers (typical after scalarization). */ + nir_instr_type last_instr_type; + uintptr_t last_alu_header_offset; /* Don't write optional data such as variable names. */ bool strip; @@ -79,6 +84,7 @@ typedef struct { /* The last deserialized type. */ const struct glsl_type *last_type; const struct glsl_type *last_interface_type; + struct nir_variable_data last_var_data; } read_ctx; static void @@ -97,12 +103,6 @@ write_lookup_object(write_ctx *ctx, const void *obj) return (uint32_t)(uintptr_t) entry->data; } -static void -write_object(write_ctx *ctx, const void *obj) -{ - blob_write_uint32(ctx->blob, write_lookup_object(ctx, obj)); -} - static void read_add_object(read_ctx *ctx, void *obj) { @@ -141,6 +141,8 @@ decode_bit_size_3bits(uint8_t bit_size) return 0; } +#define NUM_COMPONENTS_IS_SEPARATE_7 7 + static uint8_t encode_num_components_in_3bits(uint8_t num_components) { @@ -151,8 +153,8 @@ encode_num_components_in_3bits(uint8_t num_components) if (num_components == 16) return 6; - unreachable("invalid number in num_components"); - return 0; + /* special value indicating that num_components is in the next uint32 */ + return NUM_COMPONENTS_IS_SEPARATE_7; } static uint8_t @@ -196,6 +198,7 @@ enum var_data_encoding { var_encode_full, var_encode_shader_temp, var_encode_function_temp, + var_encode_location_diff, }; union packed_var { @@ -203,23 +206,32 @@ union packed_var { struct { unsigned has_name:1; unsigned has_constant_initializer:1; + unsigned has_pointer_initializer:1; unsigned has_interface_type:1; unsigned num_state_slots:7; unsigned data_encoding:2; unsigned type_same_as_last:1; unsigned interface_type_same_as_last:1; - unsigned _pad:2; + unsigned _pad:1; unsigned num_members:16; } u; }; +union packed_var_data_diff { + uint32_t u32; + struct { + int location:13; + int location_frac:3; + int driver_location:16; + } u; +}; + static void write_variable(write_ctx *ctx, const nir_variable *var) { write_add_object(ctx, var); assert(var->num_state_slots < (1 << 7)); - assert(var->num_members < (1 << 16)); STATIC_ASSERT(sizeof(union packed_var) == 4); union packed_var flags; @@ -227,6 +239,7 @@ write_variable(write_ctx *ctx, const nir_variable *var) flags.u.has_name = !ctx->strip && var->name; flags.u.has_constant_initializer = !!(var->constant_initializer); + flags.u.has_pointer_initializer = !!(var->pointer_initializer); flags.u.has_interface_type = !!(var->interface_type); flags.u.type_same_as_last = var->type == ctx->last_type; flags.u.interface_type_same_as_last = @@ -234,13 +247,40 @@ write_variable(write_ctx *ctx, const nir_variable *var) flags.u.num_state_slots = var->num_state_slots; flags.u.num_members = var->num_members; + struct nir_variable_data data = var->data; + + /* When stripping, we expect that the location is no longer needed, + * which is typically after shaders are linked. 
+ */ + if (ctx->strip && + data.mode != nir_var_shader_in && + data.mode != nir_var_shader_out) + data.location = 0; + /* Temporary variables don't serialize var->data. */ - if (var->data.mode == nir_var_shader_temp) + if (data.mode == nir_var_shader_temp) flags.u.data_encoding = var_encode_shader_temp; - else if (var->data.mode == nir_var_function_temp) + else if (data.mode == nir_var_function_temp) flags.u.data_encoding = var_encode_function_temp; - else - flags.u.data_encoding = var_encode_full; + else { + struct nir_variable_data tmp = data; + + tmp.location = ctx->last_var_data.location; + tmp.location_frac = ctx->last_var_data.location_frac; + tmp.driver_location = ctx->last_var_data.driver_location; + + /* See if we can encode only the difference in locations from the last + * variable. + */ + if (memcmp(&ctx->last_var_data, &tmp, sizeof(tmp)) == 0 && + abs((int)data.location - + (int)ctx->last_var_data.location) < (1 << 12) && + abs((int)data.driver_location - + (int)ctx->last_var_data.driver_location) < (1 << 15)) + flags.u.data_encoding = var_encode_location_diff; + else + flags.u.data_encoding = var_encode_full; + } blob_write_uint32(ctx->blob, flags.u32); @@ -257,18 +297,25 @@ write_variable(write_ctx *ctx, const nir_variable *var) if (flags.u.has_name) blob_write_string(ctx->blob, var->name); - if (flags.u.data_encoding == var_encode_full) { - struct nir_variable_data data = var->data; + if (flags.u.data_encoding == var_encode_full || + flags.u.data_encoding == var_encode_location_diff) { + if (flags.u.data_encoding == var_encode_full) { + blob_write_bytes(ctx->blob, &data, sizeof(data)); + } else { + /* Serialize only the difference in locations from the last variable. + */ + union packed_var_data_diff diff; - /* When stripping, we expect that the location is no longer needed, - * which is typically after shaders are linked. 
- */ - if (ctx->strip && - data.mode != nir_var_shader_in && - data.mode != nir_var_shader_out) - data.location = 0; + diff.u.location = data.location - ctx->last_var_data.location; + diff.u.location_frac = data.location_frac - + ctx->last_var_data.location_frac; + diff.u.driver_location = data.driver_location - + ctx->last_var_data.driver_location; + + blob_write_uint32(ctx->blob, diff.u32); + } - blob_write_bytes(ctx->blob, &data, sizeof(data)); + ctx->last_var_data = data; } for (unsigned i = 0; i < var->num_state_slots; i++) { @@ -277,6 +324,8 @@ write_variable(write_ctx *ctx, const nir_variable *var) } if (var->constant_initializer) write_constant(ctx, var->constant_initializer); + if (var->pointer_initializer) + write_lookup_object(ctx, var->pointer_initializer); if (var->num_members > 0) { blob_write_bytes(ctx->blob, (uint8_t *) var->members, var->num_members * sizeof(*var->members)); @@ -319,8 +368,20 @@ read_variable(read_ctx *ctx) var->data.mode = nir_var_shader_temp; else if (flags.u.data_encoding == var_encode_function_temp) var->data.mode = nir_var_function_temp; - else + else if (flags.u.data_encoding == var_encode_full) { blob_copy_bytes(ctx->blob, (uint8_t *) &var->data, sizeof(var->data)); + ctx->last_var_data = var->data; + } else { /* var_encode_location_diff */ + union packed_var_data_diff diff; + diff.u32 = blob_read_uint32(ctx->blob); + + var->data = ctx->last_var_data; + var->data.location += diff.u.location; + var->data.location_frac += diff.u.location_frac; + var->data.driver_location += diff.u.driver_location; + + ctx->last_var_data = var->data; + } var->num_state_slots = flags.u.num_state_slots; if (var->num_state_slots != 0) { @@ -335,6 +396,12 @@ read_variable(read_ctx *ctx) var->constant_initializer = read_constant(ctx, var); else var->constant_initializer = NULL; + + if (flags.u.has_pointer_initializer) + var->pointer_initializer = read_object(ctx); + else + var->pointer_initializer = NULL; + var->num_members = flags.u.num_members; if (var->num_members > 0) { var->members = ralloc_array(var, struct nir_variable_data, @@ -516,6 +583,34 @@ union packed_dest { } reg; }; +enum intrinsic_const_indices_encoding { + /* Use the 9 bits of packed_const_indices to store 1-9 indices. + * 1 9-bit index, or 2 4-bit indices, or 3 3-bit indices, or + * 4 2-bit indices, or 5-9 1-bit indices. + * + * The common case for load_ubo is 0, 0, 0, which is trivially represented. + * The common cases for load_interpolated_input also fit here, e.g.: 7, 3 + */ + const_indices_9bit_all_combined, + + const_indices_8bit, /* 8 bits per element */ + const_indices_16bit, /* 16 bits per element */ + const_indices_32bit, /* 32 bits per element */ +}; + +enum load_const_packing { + /* Constants are not packed and are stored in following dwords. */ + load_const_full, + + /* packed_value contains high 19 bits, low bits are 0, + * good for floating-point decimals + */ + load_const_scalar_hi_19bits, + + /* packed_value contains low 19 bits, high bits are sign-extended */ + load_const_scalar_lo_19bits_sext, +}; + union packed_instr { uint32_t u32; struct { @@ -529,30 +624,43 @@ union packed_instr { unsigned no_signed_wrap:1; unsigned no_unsigned_wrap:1; unsigned saturate:1; - unsigned writemask:4; + /* Reg: writemask; SSA: swizzles for 2 srcs */ + unsigned writemask_or_two_swizzles:4; unsigned op:9; - unsigned _pad:3; + unsigned packed_src_ssa_16bit:1; + /* Scalarized ALUs always have the same header. 
 */
+      unsigned num_followup_alu_sharing_header:2;
       unsigned dest:8;
    } alu;
    struct {
       unsigned instr_type:4;
       unsigned deref_type:3;
-      unsigned mode:10;
-      unsigned _pad:7;
+      unsigned cast_type_same_as_last:1;
+      unsigned mode:10; /* deref_var redefines this */
+      unsigned packed_src_ssa_16bit:1; /* deref_var redefines this */
+      unsigned _pad:5; /* deref_var redefines this */
       unsigned dest:8;
    } deref;
+   struct {
+      unsigned instr_type:4;
+      unsigned deref_type:3;
+      unsigned _pad:1;
+      unsigned object_idx:16; /* if 0, the object ID is a separate uint32 */
+      unsigned dest:8;
+   } deref_var;
    struct {
       unsigned instr_type:4;
       unsigned intrinsic:9;
-      unsigned num_components:3;
-      unsigned _pad:8;
+      unsigned const_indices_encoding:2;
+      unsigned packed_const_indices:9;
       unsigned dest:8;
    } intrinsic;
    struct {
       unsigned instr_type:4;
       unsigned last_component:4;
       unsigned bit_size:3;
-      unsigned _pad:21;
+      unsigned packing:2; /* enum load_const_packing */
+      unsigned packed_value:19; /* meaning determined by packing */
    } load_const;
    struct {
       unsigned instr_type:4;
@@ -564,8 +672,8 @@ union packed_instr {
       unsigned instr_type:4;
       unsigned num_srcs:4;
       unsigned op:4;
-      unsigned texture_array_size:12;
       unsigned dest:8;
+      unsigned _pad:12;
    } tex;
    struct {
       unsigned instr_type:4;
@@ -581,7 +689,8 @@ union packed_instr {
 
 /* Write "lo24" as low 24 bits in the first uint32. */
 static void
-write_dest(write_ctx *ctx, const nir_dest *dst, union packed_instr header)
+write_dest(write_ctx *ctx, const nir_dest *dst, union packed_instr header,
+           nir_instr_type instr_type)
 {
    STATIC_ASSERT(sizeof(union packed_dest) == 1);
    union packed_dest dest;
@@ -596,9 +705,47 @@ write_dest(write_ctx *ctx, const nir_dest *dst, union packed_instr header)
    } else {
       dest.reg.is_indirect = !!(dst->reg.indirect);
    }
-
    header.any.dest = dest.u8;
-   blob_write_uint32(ctx->blob, header.u32);
+
+   /* Check if the current ALU instruction has the same header as the previous
+    * instruction that is also ALU. If it is, we don't have to write
+    * the current header. This is a typical occurrence after scalarization.
+    */
+   if (instr_type == nir_instr_type_alu) {
+      bool equal_header = false;
+
+      if (ctx->last_instr_type == nir_instr_type_alu) {
+         assert(ctx->last_alu_header_offset);
+         union packed_instr *last_header =
+            (union packed_instr *)(ctx->blob->data +
+                                   ctx->last_alu_header_offset);
+
+         /* Clear the field that counts ALUs with equal headers. */
+         union packed_instr clean_header;
+         clean_header.u32 = last_header->u32;
+         clean_header.alu.num_followup_alu_sharing_header = 0;
+
+         /* There can be at most 4 consecutive ALU instructions
+          * sharing the same header.
+ */ + if (last_header->alu.num_followup_alu_sharing_header < 3 && + header.u32 == clean_header.u32) { + last_header->alu.num_followup_alu_sharing_header++; + equal_header = true; + } + } + + if (!equal_header) { + ctx->last_alu_header_offset = ctx->blob->size; + blob_write_uint32(ctx->blob, header.u32); + } + } else { + blob_write_uint32(ctx->blob, header.u32); + } + + if (dest.ssa.is_ssa && + dest.ssa.num_components == NUM_COMPONENTS_IS_SEPARATE_7) + blob_write_uint32(ctx->blob, dst->ssa.num_components); if (dst->is_ssa) { write_add_object(ctx, &dst->ssa); @@ -621,8 +768,11 @@ read_dest(read_ctx *ctx, nir_dest *dst, nir_instr *instr, if (dest.ssa.is_ssa) { unsigned bit_size = decode_bit_size_3bits(dest.ssa.bit_size); - unsigned num_components = - decode_num_components_in_3bits(dest.ssa.num_components); + unsigned num_components; + if (dest.ssa.num_components == NUM_COMPONENTS_IS_SEPARATE_7) + num_components = blob_read_uint32(ctx->blob); + else + num_components = decode_num_components_in_3bits(dest.ssa.num_components); char *name = dest.ssa.has_name ? blob_read_string(ctx->blob) : NULL; nir_ssa_dest_init(instr, dst, num_components, bit_size, name); read_add_object(ctx, &dst->ssa); @@ -636,9 +786,46 @@ read_dest(read_ctx *ctx, nir_dest *dst, nir_instr *instr, } } +static bool +are_object_ids_16bit(write_ctx *ctx) +{ + /* Check the highest object ID, because they are monotonic. */ + return ctx->next_idx < (1 << 16); +} + +static bool +is_alu_src_ssa_16bit(write_ctx *ctx, const nir_alu_instr *alu) +{ + unsigned num_srcs = nir_op_infos[alu->op].num_inputs; + + for (unsigned i = 0; i < num_srcs; i++) { + if (!alu->src[i].src.is_ssa || alu->src[i].abs || alu->src[i].negate) + return false; + + unsigned src_components = nir_ssa_alu_instr_src_components(alu, i); + + for (unsigned chan = 0; chan < src_components; chan++) { + /* The swizzles for src0.x and src1.x are stored + * in writemask_or_two_swizzles for SSA ALUs. + */ + if (alu->dest.dest.is_ssa && i < 2 && chan == 0 && + alu->src[i].swizzle[chan] < 4) + continue; + + if (alu->src[i].swizzle[chan] != chan) + return false; + } + } + + return are_object_ids_16bit(ctx); +} + static void write_alu(write_ctx *ctx, const nir_alu_instr *alu) { + unsigned num_srcs = nir_op_infos[alu->op].num_inputs; + unsigned dst_components = nir_dest_num_components(alu->dest.dest); + /* 9 bits for nir_op */ STATIC_ASSERT(nir_num_opcodes <= 512); union packed_instr header; @@ -649,48 +836,141 @@ write_alu(write_ctx *ctx, const nir_alu_instr *alu) header.alu.no_signed_wrap = alu->no_signed_wrap; header.alu.no_unsigned_wrap = alu->no_unsigned_wrap; header.alu.saturate = alu->dest.saturate; - header.alu.writemask = alu->dest.write_mask; header.alu.op = alu->op; + header.alu.packed_src_ssa_16bit = is_alu_src_ssa_16bit(ctx, alu); + + if (header.alu.packed_src_ssa_16bit && + alu->dest.dest.is_ssa) { + /* For packed srcs of SSA ALUs, this field stores the swizzles. */ + header.alu.writemask_or_two_swizzles = alu->src[0].swizzle[0]; + if (num_srcs > 1) + header.alu.writemask_or_two_swizzles |= alu->src[1].swizzle[0] << 2; + } else if (!alu->dest.dest.is_ssa && dst_components <= 4) { + /* For vec4 registers, this field is a writemask. 
*/ + header.alu.writemask_or_two_swizzles = alu->dest.write_mask; + } - write_dest(ctx, &alu->dest.dest, header); + write_dest(ctx, &alu->dest.dest, header, alu->instr.type); - for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { - union packed_src src; - src.u32 = 0; + if (!alu->dest.dest.is_ssa && dst_components > 4) + blob_write_uint32(ctx->blob, alu->dest.write_mask); - src.alu.negate = alu->src[i].negate; - src.alu.abs = alu->src[i].abs; - src.alu.swizzle_x = alu->src[i].swizzle[0]; - src.alu.swizzle_y = alu->src[i].swizzle[1]; - src.alu.swizzle_z = alu->src[i].swizzle[2]; - src.alu.swizzle_w = alu->src[i].swizzle[3]; - - write_src_full(ctx, &alu->src[i].src, src); + if (header.alu.packed_src_ssa_16bit) { + for (unsigned i = 0; i < num_srcs; i++) { + assert(alu->src[i].src.is_ssa); + unsigned idx = write_lookup_object(ctx, alu->src[i].src.ssa); + assert(idx < (1 << 16)); + blob_write_uint16(ctx->blob, idx); + } + } else { + for (unsigned i = 0; i < num_srcs; i++) { + unsigned src_channels = nir_ssa_alu_instr_src_components(alu, i); + unsigned src_components = nir_src_num_components(alu->src[i].src); + union packed_src src; + bool packed = src_components <= 4 && src_channels <= 4; + src.u32 = 0; + + src.alu.negate = alu->src[i].negate; + src.alu.abs = alu->src[i].abs; + + if (packed) { + src.alu.swizzle_x = alu->src[i].swizzle[0]; + src.alu.swizzle_y = alu->src[i].swizzle[1]; + src.alu.swizzle_z = alu->src[i].swizzle[2]; + src.alu.swizzle_w = alu->src[i].swizzle[3]; + } + + write_src_full(ctx, &alu->src[i].src, src); + + /* Store swizzles for vec8 and vec16. */ + if (!packed) { + for (unsigned o = 0; o < src_channels; o += 8) { + unsigned value = 0; + + for (unsigned j = 0; j < 8 && o + j < src_channels; j++) { + value |= (uint32_t)alu->src[i].swizzle[o + j] << + (4 * j); /* 4 bits per swizzle */ + } + + blob_write_uint32(ctx->blob, value); + } + } + } } } static nir_alu_instr * read_alu(read_ctx *ctx, union packed_instr header) { + unsigned num_srcs = nir_op_infos[header.alu.op].num_inputs; nir_alu_instr *alu = nir_alu_instr_create(ctx->nir, header.alu.op); alu->exact = header.alu.exact; alu->no_signed_wrap = header.alu.no_signed_wrap; alu->no_unsigned_wrap = header.alu.no_unsigned_wrap; alu->dest.saturate = header.alu.saturate; - alu->dest.write_mask = header.alu.writemask; read_dest(ctx, &alu->dest.dest, &alu->instr, header); - for (unsigned i = 0; i < nir_op_infos[header.alu.op].num_inputs; i++) { - union packed_src src = read_src(ctx, &alu->src[i].src, &alu->instr); + unsigned dst_components = nir_dest_num_components(alu->dest.dest); - alu->src[i].negate = src.alu.negate; - alu->src[i].abs = src.alu.abs; - alu->src[i].swizzle[0] = src.alu.swizzle_x; - alu->src[i].swizzle[1] = src.alu.swizzle_y; - alu->src[i].swizzle[2] = src.alu.swizzle_z; - alu->src[i].swizzle[3] = src.alu.swizzle_w; + if (alu->dest.dest.is_ssa) { + alu->dest.write_mask = u_bit_consecutive(0, dst_components); + } else if (dst_components <= 4) { + alu->dest.write_mask = header.alu.writemask_or_two_swizzles; + } else { + alu->dest.write_mask = blob_read_uint32(ctx->blob); + } + + if (header.alu.packed_src_ssa_16bit) { + for (unsigned i = 0; i < num_srcs; i++) { + nir_alu_src *src = &alu->src[i]; + src->src.is_ssa = true; + src->src.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob)); + + memset(&src->swizzle, 0, sizeof(src->swizzle)); + + unsigned src_components = nir_ssa_alu_instr_src_components(alu, i); + + for (unsigned chan = 0; chan < src_components; chan++) + src->swizzle[chan] = chan; + } 
+ } else { + for (unsigned i = 0; i < num_srcs; i++) { + union packed_src src = read_src(ctx, &alu->src[i].src, &alu->instr); + unsigned src_channels = nir_ssa_alu_instr_src_components(alu, i); + unsigned src_components = nir_src_num_components(alu->src[i].src); + bool packed = src_components <= 4 && src_channels <= 4; + + alu->src[i].negate = src.alu.negate; + alu->src[i].abs = src.alu.abs; + + memset(&alu->src[i].swizzle, 0, sizeof(alu->src[i].swizzle)); + + if (packed) { + alu->src[i].swizzle[0] = src.alu.swizzle_x; + alu->src[i].swizzle[1] = src.alu.swizzle_y; + alu->src[i].swizzle[2] = src.alu.swizzle_z; + alu->src[i].swizzle[3] = src.alu.swizzle_w; + } else { + /* Load swizzles for vec8 and vec16. */ + for (unsigned o = 0; o < src_channels; o += 8) { + unsigned value = blob_read_uint32(ctx->blob); + + for (unsigned j = 0; j < 8 && o + j < src_channels; j++) { + alu->src[i].swizzle[o + j] = + (value >> (4 * j)) & 0xf; /* 4 bits per swizzle */ + } + } + } + } + } + + if (header.alu.packed_src_ssa_16bit && + alu->dest.dest.is_ssa) { + alu->src[0].swizzle[0] = header.alu.writemask_or_two_swizzles & 0x3; + if (num_srcs > 1) + alu->src[1].swizzle[0] = header.alu.writemask_or_two_swizzles >> 2; } return alu; @@ -707,34 +987,63 @@ write_deref(write_ctx *ctx, const nir_deref_instr *deref) header.deref.instr_type = deref->instr.type; header.deref.deref_type = deref->deref_type; - header.deref.mode = deref->mode; - write_dest(ctx, &deref->dest, header); - encode_type_to_blob(ctx->blob, deref->type); + if (deref->deref_type == nir_deref_type_cast) { + header.deref.mode = deref->mode; + header.deref.cast_type_same_as_last = deref->type == ctx->last_type; + } + unsigned var_idx = 0; if (deref->deref_type == nir_deref_type_var) { - write_object(ctx, deref->var); - return; + var_idx = write_lookup_object(ctx, deref->var); + if (var_idx && var_idx < (1 << 16)) + header.deref_var.object_idx = var_idx; + } + + if (deref->deref_type == nir_deref_type_array || + deref->deref_type == nir_deref_type_ptr_as_array) { + header.deref.packed_src_ssa_16bit = + deref->parent.is_ssa && deref->arr.index.is_ssa && + are_object_ids_16bit(ctx); } - write_src(ctx, &deref->parent); + write_dest(ctx, &deref->dest, header, deref->instr.type); switch (deref->deref_type) { + case nir_deref_type_var: + if (!header.deref_var.object_idx) + blob_write_uint32(ctx->blob, var_idx); + break; + case nir_deref_type_struct: + write_src(ctx, &deref->parent); blob_write_uint32(ctx->blob, deref->strct.index); break; case nir_deref_type_array: case nir_deref_type_ptr_as_array: - write_src(ctx, &deref->arr.index); + if (header.deref.packed_src_ssa_16bit) { + blob_write_uint16(ctx->blob, + write_lookup_object(ctx, deref->parent.ssa)); + blob_write_uint16(ctx->blob, + write_lookup_object(ctx, deref->arr.index.ssa)); + } else { + write_src(ctx, &deref->parent); + write_src(ctx, &deref->arr.index); + } break; case nir_deref_type_cast: + write_src(ctx, &deref->parent); blob_write_uint32(ctx->blob, deref->cast.ptr_stride); + if (!header.deref.cast_type_same_as_last) { + encode_type_to_blob(ctx->blob, deref->type); + ctx->last_type = deref->type; + } break; case nir_deref_type_array_wildcard: - /* Nothing to do */ + write_src(ctx, &deref->parent); break; default: @@ -750,38 +1059,74 @@ read_deref(read_ctx *ctx, union packed_instr header) read_dest(ctx, &deref->dest, &deref->instr, header); - deref->mode = header.deref.mode; - deref->type = decode_type_from_blob(ctx->blob); + nir_deref_instr *parent; - if (deref_type == nir_deref_type_var) { - 
deref->var = read_object(ctx);
-      return deref;
-   }
+   switch (deref->deref_type) {
+   case nir_deref_type_var:
+      if (header.deref_var.object_idx)
+         deref->var = read_lookup_object(ctx, header.deref_var.object_idx);
+      else
+         deref->var = read_object(ctx);
 
-   read_src(ctx, &deref->parent, &deref->instr);
+      deref->type = deref->var->type;
+      break;
 
-   switch (deref->deref_type) {
    case nir_deref_type_struct:
+      read_src(ctx, &deref->parent, &deref->instr);
+      parent = nir_src_as_deref(deref->parent);
       deref->strct.index = blob_read_uint32(ctx->blob);
+      deref->type = glsl_get_struct_field(parent->type, deref->strct.index);
       break;
 
    case nir_deref_type_array:
    case nir_deref_type_ptr_as_array:
-      read_src(ctx, &deref->arr.index, &deref->instr);
+      if (header.deref.packed_src_ssa_16bit) {
+         deref->parent.is_ssa = true;
+         deref->parent.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob));
+         deref->arr.index.is_ssa = true;
+         deref->arr.index.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob));
+      } else {
+         read_src(ctx, &deref->parent, &deref->instr);
+         read_src(ctx, &deref->arr.index, &deref->instr);
+      }
+
+      parent = nir_src_as_deref(deref->parent);
+      if (deref->deref_type == nir_deref_type_array)
+         deref->type = glsl_get_array_element(parent->type);
+      else
+         deref->type = parent->type;
       break;
 
    case nir_deref_type_cast:
+      read_src(ctx, &deref->parent, &deref->instr);
       deref->cast.ptr_stride = blob_read_uint32(ctx->blob);
+      if (header.deref.cast_type_same_as_last) {
+         deref->type = ctx->last_type;
+      } else {
+         deref->type = decode_type_from_blob(ctx->blob);
+         ctx->last_type = deref->type;
+      }
       break;
 
    case nir_deref_type_array_wildcard:
-      /* Nothing to do */
+      read_src(ctx, &deref->parent, &deref->instr);
+      parent = nir_src_as_deref(deref->parent);
+      deref->type = glsl_get_array_element(parent->type);
       break;
 
    default:
       unreachable("Invalid deref type");
    }
 
+   if (deref_type == nir_deref_type_var) {
+      deref->mode = deref->var->data.mode;
+   } else if (deref->deref_type == nir_deref_type_cast) {
+      deref->mode = header.deref.mode;
+   } else {
+      assert(deref->parent.is_ssa);
+      deref->mode = nir_instr_as_deref(deref->parent.ssa->parent_instr)->mode;
+   }
+
    return deref;
 }
 
@@ -799,19 +1144,56 @@ write_intrinsic(write_ctx *ctx, const nir_intrinsic_instr *intrin)
 
    header.intrinsic.instr_type = intrin->instr.type;
    header.intrinsic.intrinsic = intrin->intrinsic;
-   header.intrinsic.num_components =
-      encode_num_components_in_3bits(intrin->num_components);
+
+   /* Analyze constant indices to decide how to encode them. */
+   if (num_indices) {
+      unsigned max_bits = 0;
+      for (unsigned i = 0; i < num_indices; i++) {
+         unsigned max = util_last_bit(intrin->const_index[i]);
+         max_bits = MAX2(max_bits, max);
+      }
+
+      if (max_bits * num_indices <= 9) {
+         header.intrinsic.const_indices_encoding = const_indices_9bit_all_combined;
+
+         /* Pack all const indices into 9 bits. */
+         unsigned bit_size = 9 / num_indices;
+         for (unsigned i = 0; i < num_indices; i++) {
+            header.intrinsic.packed_const_indices |=
+               intrin->const_index[i] << (i * bit_size);
+         }
+      } else if (max_bits <= 8)
+         header.intrinsic.const_indices_encoding = const_indices_8bit;
+      else if (max_bits <= 16)
+         header.intrinsic.const_indices_encoding = const_indices_16bit;
+      else
+         header.intrinsic.const_indices_encoding = const_indices_32bit;
+   }
 
    if (nir_intrinsic_infos[intrin->intrinsic].has_dest)
-      write_dest(ctx, &intrin->dest, header);
+      write_dest(ctx, &intrin->dest, header, intrin->instr.type);
    else
       blob_write_uint32(ctx->blob, header.u32);
 
   for (unsigned i = 0; i < num_srcs; i++)
      write_src(ctx, &intrin->src[i]);
 
-   for (unsigned i = 0; i < num_indices; i++)
-      blob_write_uint32(ctx->blob, intrin->const_index[i]);
+   if (num_indices) {
+      switch (header.intrinsic.const_indices_encoding) {
+      case const_indices_8bit:
+         for (unsigned i = 0; i < num_indices; i++)
+            blob_write_uint8(ctx->blob, intrin->const_index[i]);
+         break;
+      case const_indices_16bit:
+         for (unsigned i = 0; i < num_indices; i++)
+            blob_write_uint16(ctx->blob, intrin->const_index[i]);
+         break;
+      case const_indices_32bit:
+         for (unsigned i = 0; i < num_indices; i++)
+            blob_write_uint32(ctx->blob, intrin->const_index[i]);
+         break;
+      }
+   }
 }
 
 static nir_intrinsic_instr *
@@ -823,17 +1205,53 @@ read_intrinsic(read_ctx *ctx, union packed_instr header)
    unsigned num_srcs = nir_intrinsic_infos[op].num_srcs;
    unsigned num_indices = nir_intrinsic_infos[op].num_indices;
 
-   intrin->num_components =
-      decode_num_components_in_3bits(header.intrinsic.num_components);
-
    if (nir_intrinsic_infos[op].has_dest)
       read_dest(ctx, &intrin->dest, &intrin->instr, header);
 
   for (unsigned i = 0; i < num_srcs; i++)
      read_src(ctx, &intrin->src[i], &intrin->instr);
 
-   for (unsigned i = 0; i < num_indices; i++)
-      intrin->const_index[i] = blob_read_uint32(ctx->blob);
+   /* Vectorized intrinsics have the same num_components as the dest or
+    * the src that has 0 components in the info. Find it.
+ */ + if (nir_intrinsic_infos[op].has_dest && + nir_intrinsic_infos[op].dest_components == 0) { + intrin->num_components = nir_dest_num_components(intrin->dest); + } else { + for (unsigned i = 0; i < num_srcs; i++) { + if (nir_intrinsic_infos[op].src_components[i] == 0) { + intrin->num_components = nir_src_num_components(intrin->src[i]); + break; + } + } + } + + if (num_indices) { + switch (header.intrinsic.const_indices_encoding) { + case const_indices_9bit_all_combined: { + unsigned bit_size = 9 / num_indices; + unsigned bit_mask = u_bit_consecutive(0, bit_size); + for (unsigned i = 0; i < num_indices; i++) { + intrin->const_index[i] = + (header.intrinsic.packed_const_indices >> (i * bit_size)) & + bit_mask; + } + break; + } + case const_indices_8bit: + for (unsigned i = 0; i < num_indices; i++) + intrin->const_index[i] = blob_read_uint8(ctx->blob); + break; + case const_indices_16bit: + for (unsigned i = 0; i < num_indices; i++) + intrin->const_index[i] = blob_read_uint16(ctx->blob); + break; + case const_indices_32bit: + for (unsigned i = 0; i < num_indices; i++) + intrin->const_index[i] = blob_read_uint32(ctx->blob); + break; + } + } return intrin; } @@ -848,9 +1266,77 @@ write_load_const(write_ctx *ctx, const nir_load_const_instr *lc) header.load_const.instr_type = lc->instr.type; header.load_const.last_component = lc->def.num_components - 1; header.load_const.bit_size = encode_bit_size_3bits(lc->def.bit_size); + header.load_const.packing = load_const_full; + + /* Try to pack 1-component constants into the 19 free bits in the header. */ + if (lc->def.num_components == 1) { + switch (lc->def.bit_size) { + case 64: + if ((lc->value[0].u64 & 0x1fffffffffffull) == 0) { + /* packed_value contains high 19 bits, low bits are 0 */ + header.load_const.packing = load_const_scalar_hi_19bits; + header.load_const.packed_value = lc->value[0].u64 >> 45; + } else if (((lc->value[0].i64 << 45) >> 45) == lc->value[0].i64) { + /* packed_value contains low 19 bits, high bits are sign-extended */ + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u64; + } + break; + + case 32: + if ((lc->value[0].u32 & 0x1fff) == 0) { + header.load_const.packing = load_const_scalar_hi_19bits; + header.load_const.packed_value = lc->value[0].u32 >> 13; + } else if (((lc->value[0].i32 << 13) >> 13) == lc->value[0].i32) { + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u32; + } + break; + + case 16: + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u16; + break; + case 8: + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u8; + break; + case 1: + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].b; + break; + default: + unreachable("invalid bit_size"); + } + } blob_write_uint32(ctx->blob, header.u32); - blob_write_bytes(ctx->blob, lc->value, sizeof(*lc->value) * lc->def.num_components); + + if (header.load_const.packing == load_const_full) { + switch (lc->def.bit_size) { + case 64: + blob_write_bytes(ctx->blob, lc->value, + sizeof(*lc->value) * lc->def.num_components); + break; + + case 32: + for (unsigned i = 0; i < lc->def.num_components; i++) + blob_write_uint32(ctx->blob, lc->value[i].u32); + break; + + case 16: + for (unsigned i = 0; i < lc->def.num_components; i++) + blob_write_uint16(ctx->blob, lc->value[i].u16); + 
break; + + default: + assert(lc->def.bit_size <= 8); + for (unsigned i = 0; i < lc->def.num_components; i++) + blob_write_uint8(ctx->blob, lc->value[i].u8); + break; + } + } + write_add_object(ctx, &lc->def); } @@ -861,7 +1347,67 @@ read_load_const(read_ctx *ctx, union packed_instr header) nir_load_const_instr_create(ctx->nir, header.load_const.last_component + 1, decode_bit_size_3bits(header.load_const.bit_size)); - blob_copy_bytes(ctx->blob, lc->value, sizeof(*lc->value) * lc->def.num_components); + switch (header.load_const.packing) { + case load_const_scalar_hi_19bits: + switch (lc->def.bit_size) { + case 64: + lc->value[0].u64 = (uint64_t)header.load_const.packed_value << 45; + break; + case 32: + lc->value[0].u32 = (uint64_t)header.load_const.packed_value << 13; + break; + default: + unreachable("invalid bit_size"); + } + break; + + case load_const_scalar_lo_19bits_sext: + switch (lc->def.bit_size) { + case 64: + lc->value[0].i64 = ((int64_t)header.load_const.packed_value << 45) >> 45; + break; + case 32: + lc->value[0].i32 = ((int32_t)header.load_const.packed_value << 13) >> 13; + break; + case 16: + lc->value[0].u16 = header.load_const.packed_value; + break; + case 8: + lc->value[0].u8 = header.load_const.packed_value; + break; + case 1: + lc->value[0].b = header.load_const.packed_value; + break; + default: + unreachable("invalid bit_size"); + } + break; + + case load_const_full: + switch (lc->def.bit_size) { + case 64: + blob_copy_bytes(ctx->blob, lc->value, sizeof(*lc->value) * lc->def.num_components); + break; + + case 32: + for (unsigned i = 0; i < lc->def.num_components; i++) + lc->value[i].u32 = blob_read_uint32(ctx->blob); + break; + + case 16: + for (unsigned i = 0; i < lc->def.num_components; i++) + lc->value[i].u16 = blob_read_uint16(ctx->blob); + break; + + default: + assert(lc->def.bit_size <= 8); + for (unsigned i = 0; i < lc->def.num_components; i++) + lc->value[i].u8 = blob_read_uint8(ctx->blob); + break; + } + break; + } + read_add_object(ctx, &lc->def); return lc; } @@ -897,13 +1443,15 @@ union packed_tex_data { uint32_t u32; struct { enum glsl_sampler_dim sampler_dim:4; - nir_alu_type dest_type:8; + unsigned dest_type:8; unsigned coord_components:3; unsigned is_array:1; unsigned is_shadow:1; unsigned is_new_style_shadow:1; unsigned component:2; - unsigned unused:10; /* Mark unused for valgrind. */ + unsigned texture_non_uniform:1; + unsigned sampler_non_uniform:1; + unsigned unused:8; /* Mark unused for valgrind. 
*/ } u; }; @@ -912,7 +1460,6 @@ write_tex(write_ctx *ctx, const nir_tex_instr *tex) { assert(tex->num_srcs < 16); assert(tex->op < 16); - assert(tex->texture_array_size < 1024); union packed_instr header; header.u32 = 0; @@ -920,9 +1467,8 @@ write_tex(write_ctx *ctx, const nir_tex_instr *tex) header.tex.instr_type = tex->instr.type; header.tex.num_srcs = tex->num_srcs; header.tex.op = tex->op; - header.tex.texture_array_size = tex->texture_array_size; - write_dest(ctx, &tex->dest, header); + write_dest(ctx, &tex->dest, header, tex->instr.type); blob_write_uint32(ctx->blob, tex->texture_index); blob_write_uint32(ctx->blob, tex->sampler_index); @@ -938,6 +1484,8 @@ write_tex(write_ctx *ctx, const nir_tex_instr *tex) .u.is_shadow = tex->is_shadow, .u.is_new_style_shadow = tex->is_new_style_shadow, .u.component = tex->component, + .u.texture_non_uniform = tex->texture_non_uniform, + .u.sampler_non_uniform = tex->sampler_non_uniform, }; blob_write_uint32(ctx->blob, packed.u32); @@ -958,7 +1506,6 @@ read_tex(read_ctx *ctx, union packed_instr header) tex->op = header.tex.op; tex->texture_index = blob_read_uint32(ctx->blob); - tex->texture_array_size = header.tex.texture_array_size; tex->sampler_index = blob_read_uint32(ctx->blob); if (tex->op == nir_texop_tg4) blob_copy_bytes(ctx->blob, tex->tg4_offsets, sizeof(tex->tg4_offsets)); @@ -972,6 +1519,8 @@ read_tex(read_ctx *ctx, union packed_instr header) tex->is_shadow = packed.u.is_shadow; tex->is_new_style_shadow = packed.u.is_new_style_shadow; tex->component = packed.u.component; + tex->texture_non_uniform = packed.u.texture_non_uniform; + tex->sampler_non_uniform = packed.u.sampler_non_uniform; for (unsigned i = 0; i < tex->num_srcs; i++) { union packed_src src = read_src(ctx, &tex->src[i].src, &tex->instr); @@ -995,7 +1544,7 @@ write_phi(write_ctx *ctx, const nir_phi_instr *phi) * and then store enough information so that a later fixup pass can fill * them in correctly. */ - write_dest(ctx, &phi->dest, header); + write_dest(ctx, &phi->dest, header, phi->instr.type); nir_foreach_phi_src(src, phi) { assert(src->src.is_ssa); @@ -1163,7 +1712,8 @@ write_instr(write_ctx *ctx, const nir_instr *instr) } } -static void +/* Return the number of instructions read. */ +static unsigned read_instr(read_ctx *ctx, nir_block *block) { STATIC_ASSERT(sizeof(union packed_instr) == 4); @@ -1173,8 +1723,9 @@ read_instr(read_ctx *ctx, nir_block *block) switch (header.any.instr_type) { case nir_instr_type_alu: - instr = &read_alu(ctx, header)->instr; - break; + for (unsigned i = 0; i <= header.alu.num_followup_alu_sharing_header; i++) + nir_instr_insert_after_block(block, &read_alu(ctx, header)->instr); + return header.alu.num_followup_alu_sharing_header + 1; case nir_instr_type_deref: instr = &read_deref(ctx, header)->instr; break; @@ -1197,7 +1748,7 @@ read_instr(read_ctx *ctx, nir_block *block) * are read so that we can set their sources up. 
*/ read_phi(ctx, block, header); - return; + return 1; case nir_instr_type_jump: instr = &read_jump(ctx, header)->instr; break; @@ -1211,6 +1762,7 @@ read_instr(read_ctx *ctx, nir_block *block) } nir_instr_insert_after_block(block, instr); + return 1; } static void @@ -1218,8 +1770,14 @@ write_block(write_ctx *ctx, const nir_block *block) { write_add_object(ctx, block); blob_write_uint32(ctx->blob, exec_list_length(&block->instr_list)); - nir_foreach_instr(instr, block) + + ctx->last_instr_type = ~0; + ctx->last_alu_header_offset = 0; + + nir_foreach_instr(instr, block) { write_instr(ctx, instr); + ctx->last_instr_type = instr->type; + } } static void @@ -1234,8 +1792,8 @@ read_block(read_ctx *ctx, struct exec_list *cf_list) read_add_object(ctx, block); unsigned num_instrs = blob_read_uint32(ctx->blob); - for (unsigned i = 0; i < num_instrs; i++) { - read_instr(ctx, block); + for (unsigned i = 0; i < num_instrs;) { + i += read_instr(ctx, block); } }
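
The var_encode_location_diff path in the patch replaces the full nir_variable_data blob with a single dword of signed deltas (13 bits for location, 3 for location_frac, 16 for driver_location), which is why write_variable() guards with abs(...) < (1 << 12) and < (1 << 15): those are the ranges a 13-bit and a 16-bit signed bitfield can hold. The following is a minimal standalone round trip of the same idea; it is plain C with hypothetical names, not code from nir_serialize.c, and like the patch it relies on the compiler treating plain-int bitfields as signed.

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical mirror of union packed_var_data_diff from the patch. */
    union var_data_diff {
       uint32_t u32;
       struct {
          int location:13;        /* holds deltas in [-4096, 4095] */
          int location_frac:3;    /* holds deltas in [-4, 3] */
          int driver_location:16; /* holds deltas in [-32768, 32767] */
       } u;
    };

    int
    main(void)
    {
       /* Encode: store only the deltas from the previously written variable. */
       int last_location = 17, last_frac = 1, last_driver_location = 256;
       int location = 20, frac = 0, driver_location = 260;

       union var_data_diff diff;
       diff.u32 = 0;
       diff.u.location = location - last_location;
       diff.u.location_frac = frac - last_frac;
       diff.u.driver_location = driver_location - last_driver_location;

       /* Decode: apply the deltas to the previously read variable, as
        * read_variable() does for var_encode_location_diff.
        */
       assert(last_location + diff.u.location == location);
       assert(last_frac + diff.u.location_frac == frac);
       assert(last_driver_location + diff.u.driver_location == driver_location);
       return 0;
    }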
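The const_indices_9bit_all_combined encoding gives each of the N constant indices 9 / N bits inside the 9-bit packed_const_indices header field; it applies whenever the widest index times the index count needs at most 9 bits. A standalone sketch of that pack/unpack, again with hypothetical helper names rather than the patch's actual functions:

    #include <assert.h>
    #include <stdint.h>

    /* Pack "num" constant indices into 9 bits, each getting 9 / num bits,
     * as const_indices_9bit_all_combined does in write_intrinsic().
     */
    static uint32_t
    pack_indices_9bit(const uint32_t *idx, unsigned num)
    {
       unsigned bit_size = 9 / num;
       uint32_t packed = 0;

       for (unsigned i = 0; i < num; i++) {
          assert(idx[i] < (1u << bit_size)); /* caller checked max_bits * num <= 9 */
          packed |= idx[i] << (i * bit_size);
       }
       return packed;
    }

    /* The matching decode, as in read_intrinsic(). */
    static void
    unpack_indices_9bit(uint32_t packed, uint32_t *idx, unsigned num)
    {
       unsigned bit_size = 9 / num;
       uint32_t mask = (1u << bit_size) - 1;

       for (unsigned i = 0; i < num; i++)
          idx[i] = (packed >> (i * bit_size)) & mask;
    }

    int
    main(void)
    {
       /* The load_interpolated_input example from the patch: indices 7, 3
        * with two indices get 4 bits each.
        */
       uint32_t in[2] = {7, 3}, out[2];

       unpack_indices_9bit(pack_indices_9bit(in, 2), out, 2);
       assert(out[0] == 7 && out[1] == 3);
       return 0;
    }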
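The 19-bit load_const packing keeps a 1-component constant inline in the header when either its low bits are all zero (hi_19bits, which catches common float constants such as 1.0f = 0x3f800000) or the value sign-extends from 19 bits (lo_19bits_sext, which catches small integers and always fits 16-bit and narrower values). Below is a standalone sketch of the 32-bit round trip under the same rules; the names are hypothetical and, like the patch, it assumes arithmetic right shift of signed integers.

    #include <assert.h>
    #include <stdint.h>

    enum packing { PACK_FULL, PACK_HI_19BITS, PACK_LO_19BITS_SEXT };

    /* Decide how a 32-bit scalar fits into 19 header bits, mirroring
     * write_load_const() for bit_size == 32.
     */
    static enum packing
    pack_scalar32(uint32_t v, uint32_t *packed)
    {
       if ((v & 0x1fff) == 0) {
          *packed = v >> 13;     /* keep the high 19 bits, low 13 are zero */
          return PACK_HI_19BITS;
       }
       if (((int32_t)(v << 13) >> 13) == (int32_t)v) {
          *packed = v & 0x7ffff; /* value sign-extends from 19 bits */
          return PACK_LO_19BITS_SEXT;
       }
       return PACK_FULL;         /* needs a full dword after the header */
    }

    /* The matching decode, mirroring read_load_const(). */
    static uint32_t
    unpack_scalar32(enum packing p, uint32_t packed)
    {
       if (p == PACK_HI_19BITS)
          return packed << 13;
       return (uint32_t)((int32_t)(packed << 13) >> 13);
    }

    int
    main(void)
    {
       uint32_t packed;

       assert(pack_scalar32(0x3f800000, &packed) == PACK_HI_19BITS); /* 1.0f */
       assert(unpack_scalar32(PACK_HI_19BITS, packed) == 0x3f800000);

       assert(pack_scalar32((uint32_t)-5, &packed) == PACK_LO_19BITS_SEXT);
       assert(unpack_scalar32(PACK_LO_19BITS_SEXT, packed) == (uint32_t)-5);
       return 0;
    }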