const struct glsl_type *last_interface_type;
struct nir_variable_data last_var_data;
+ /* For skipping equal ALU headers (typical after scalarization). */
+ nir_instr_type last_instr_type;
+ uintptr_t last_alu_header_offset;
+
/* Don't write optional data such as variable names. */
bool strip;
} write_ctx;
return 0;
}
+#define NUM_COMPONENTS_IS_SEPARATE_7 7
+
static uint8_t
encode_num_components_in_3bits(uint8_t num_components)
{
if (num_components == 16)
return 6;
- unreachable("invalid number in num_components");
- return 0;
+ /* special value indicating that num_components is in the next uint32 */
+ return NUM_COMPONENTS_IS_SEPARATE_7;
}
static uint8_t
struct {
unsigned has_name:1;
unsigned has_constant_initializer:1;
+ unsigned has_pointer_initializer:1;
unsigned has_interface_type:1;
unsigned num_state_slots:7;
unsigned data_encoding:2;
unsigned type_same_as_last:1;
unsigned interface_type_same_as_last:1;
- unsigned _pad:2;
+ unsigned _pad:1;
unsigned num_members:16;
} u;
};
write_add_object(ctx, var);
assert(var->num_state_slots < (1 << 7));
- assert(var->num_members < (1 << 16));
STATIC_ASSERT(sizeof(union packed_var) == 4);
union packed_var flags;
flags.u.has_name = !ctx->strip && var->name;
flags.u.has_constant_initializer = !!(var->constant_initializer);
+ flags.u.has_pointer_initializer = !!(var->pointer_initializer);
flags.u.has_interface_type = !!(var->interface_type);
flags.u.type_same_as_last = var->type == ctx->last_type;
flags.u.interface_type_same_as_last =
}
if (var->constant_initializer)
write_constant(ctx, var->constant_initializer);
+ if (var->pointer_initializer)
+ write_lookup_object(ctx, var->pointer_initializer);
if (var->num_members > 0) {
blob_write_bytes(ctx->blob, (uint8_t *) var->members,
var->num_members * sizeof(*var->members));
var->constant_initializer = read_constant(ctx, var);
else
var->constant_initializer = NULL;
+
+ if (flags.u.has_pointer_initializer)
+ var->pointer_initializer = read_object(ctx);
+ else
+ var->pointer_initializer = NULL;
+
var->num_members = flags.u.num_members;
if (var->num_members > 0) {
var->members = ralloc_array(var, struct nir_variable_data,
};
enum intrinsic_const_indices_encoding {
- /* Use the 6 bits of packed_const_indices to store 1-6 indices.
- * 1 6-bit index, or 2 3-bit indices, or 3 2-bit indices, or
- * 4-6 1-bit indices.
+ /* Use the 9 bits of packed_const_indices to store 1-9 indices.
+ * 1 9-bit index, or 2 4-bit indices, or 3 3-bit indices, or
+ * 4 2-bit indices, or 5-9 1-bit indices.
*
* The common case for load_ubo is 0, 0, 0, which is trivially represented.
* The common cases for load_interpolated_input also fit here, e.g.: 7, 3
*/
- const_indices_6bit_all_combined,
+ const_indices_9bit_all_combined,
const_indices_8bit, /* 8 bits per element */
const_indices_16bit, /* 16 bits per element */
unsigned no_signed_wrap:1;
unsigned no_unsigned_wrap:1;
unsigned saturate:1;
- unsigned writemask:4;
+ /* Reg: writemask; SSA: swizzles for 2 srcs */
+ unsigned writemask_or_two_swizzles:4;
unsigned op:9;
unsigned packed_src_ssa_16bit:1;
- unsigned _pad:2;
+ /* Scalarized ALUs always have the same header. */
+ unsigned num_followup_alu_sharing_header:2;
unsigned dest:8;
} alu;
struct {
unsigned deref_type:3;
unsigned cast_type_same_as_last:1;
unsigned mode:10; /* deref_var redefines this */
- unsigned _pad:6; /* deref_var redefines this */
+ unsigned packed_src_ssa_16bit:1; /* deref_var redefines this */
+ unsigned _pad:5; /* deref_var redefines this */
unsigned dest:8;
} deref;
struct {
struct {
unsigned instr_type:4;
unsigned intrinsic:9;
- unsigned num_components:3;
unsigned const_indices_encoding:2;
- unsigned packed_const_indices:6;
+ unsigned packed_const_indices:9;
unsigned dest:8;
} intrinsic;
struct {
unsigned instr_type:4;
unsigned num_srcs:4;
unsigned op:4;
- unsigned texture_array_size:12;
unsigned dest:8;
+ unsigned _pad:12;
} tex;
struct {
unsigned instr_type:4;
/* Write "lo24" as low 24 bits in the first uint32. */
static void
-write_dest(write_ctx *ctx, const nir_dest *dst, union packed_instr header)
+write_dest(write_ctx *ctx, const nir_dest *dst, union packed_instr header,
+ nir_instr_type instr_type)
{
STATIC_ASSERT(sizeof(union packed_dest) == 1);
union packed_dest dest;
} else {
dest.reg.is_indirect = !!(dst->reg.indirect);
}
-
header.any.dest = dest.u8;
- blob_write_uint32(ctx->blob, header.u32);
+
+ /* Check if the current ALU instruction has the same header as the previous
+ * instruction that is also ALU. If it is, we don't have to write
+   * the current header. This is a typical occurrence after scalarization.
+ */
+ if (instr_type == nir_instr_type_alu) {
+ bool equal_header = false;
+
+ if (ctx->last_instr_type == nir_instr_type_alu) {
+ assert(ctx->last_alu_header_offset);
+ union packed_instr *last_header =
+ (union packed_instr *)(ctx->blob->data +
+ ctx->last_alu_header_offset);
+
+ /* Clear the field that counts ALUs with equal headers. */
+ union packed_instr clean_header;
+ clean_header.u32 = last_header->u32;
+ clean_header.alu.num_followup_alu_sharing_header = 0;
+
+ /* There can be at most 4 consecutive ALU instructions
+ * sharing the same header.
+ */
+ if (last_header->alu.num_followup_alu_sharing_header < 3 &&
+ header.u32 == clean_header.u32) {
+ last_header->alu.num_followup_alu_sharing_header++;
+ equal_header = true;
+ }
+ }
+
+ if (!equal_header) {
+ ctx->last_alu_header_offset = ctx->blob->size;
+ blob_write_uint32(ctx->blob, header.u32);
+ }
+ } else {
+ blob_write_uint32(ctx->blob, header.u32);
+ }
+
+ if (dest.ssa.is_ssa &&
+ dest.ssa.num_components == NUM_COMPONENTS_IS_SEPARATE_7)
+ blob_write_uint32(ctx->blob, dst->ssa.num_components);
if (dst->is_ssa) {
write_add_object(ctx, &dst->ssa);
if (dest.ssa.is_ssa) {
unsigned bit_size = decode_bit_size_3bits(dest.ssa.bit_size);
- unsigned num_components =
- decode_num_components_in_3bits(dest.ssa.num_components);
+ unsigned num_components;
+ if (dest.ssa.num_components == NUM_COMPONENTS_IS_SEPARATE_7)
+ num_components = blob_read_uint32(ctx->blob);
+ else
+ num_components = decode_num_components_in_3bits(dest.ssa.num_components);
char *name = dest.ssa.has_name ? blob_read_string(ctx->blob) : NULL;
nir_ssa_dest_init(instr, dst, num_components, bit_size, name);
read_add_object(ctx, &dst->ssa);
unsigned src_components = nir_ssa_alu_instr_src_components(alu, i);
for (unsigned chan = 0; chan < src_components; chan++) {
+ /* The swizzles for src0.x and src1.x are stored
+ * in writemask_or_two_swizzles for SSA ALUs.
+ */
+ if (alu->dest.dest.is_ssa && i < 2 && chan == 0 &&
+ alu->src[i].swizzle[chan] < 4)
+ continue;
+
if (alu->src[i].swizzle[chan] != chan)
return false;
}
write_alu(write_ctx *ctx, const nir_alu_instr *alu)
{
unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
+ unsigned dst_components = nir_dest_num_components(alu->dest.dest);
+
/* 9 bits for nir_op */
STATIC_ASSERT(nir_num_opcodes <= 512);
union packed_instr header;
header.alu.no_signed_wrap = alu->no_signed_wrap;
header.alu.no_unsigned_wrap = alu->no_unsigned_wrap;
header.alu.saturate = alu->dest.saturate;
- header.alu.writemask = alu->dest.write_mask;
header.alu.op = alu->op;
header.alu.packed_src_ssa_16bit = is_alu_src_ssa_16bit(ctx, alu);
- write_dest(ctx, &alu->dest.dest, header);
+ if (header.alu.packed_src_ssa_16bit &&
+ alu->dest.dest.is_ssa) {
+ /* For packed srcs of SSA ALUs, this field stores the swizzles. */
+ header.alu.writemask_or_two_swizzles = alu->src[0].swizzle[0];
+ if (num_srcs > 1)
+ header.alu.writemask_or_two_swizzles |= alu->src[1].swizzle[0] << 2;
+ } else if (!alu->dest.dest.is_ssa && dst_components <= 4) {
+ /* For vec4 registers, this field is a writemask. */
+ header.alu.writemask_or_two_swizzles = alu->dest.write_mask;
+ }
+
+ write_dest(ctx, &alu->dest.dest, header, alu->instr.type);
+
+ if (!alu->dest.dest.is_ssa && dst_components > 4)
+ blob_write_uint32(ctx->blob, alu->dest.write_mask);
if (header.alu.packed_src_ssa_16bit) {
for (unsigned i = 0; i < num_srcs; i++) {
}
} else {
for (unsigned i = 0; i < num_srcs; i++) {
+ unsigned src_channels = nir_ssa_alu_instr_src_components(alu, i);
+ unsigned src_components = nir_src_num_components(alu->src[i].src);
union packed_src src;
+ bool packed = src_components <= 4 && src_channels <= 4;
src.u32 = 0;
src.alu.negate = alu->src[i].negate;
src.alu.abs = alu->src[i].abs;
- src.alu.swizzle_x = alu->src[i].swizzle[0];
- src.alu.swizzle_y = alu->src[i].swizzle[1];
- src.alu.swizzle_z = alu->src[i].swizzle[2];
- src.alu.swizzle_w = alu->src[i].swizzle[3];
+
+ if (packed) {
+ src.alu.swizzle_x = alu->src[i].swizzle[0];
+ src.alu.swizzle_y = alu->src[i].swizzle[1];
+ src.alu.swizzle_z = alu->src[i].swizzle[2];
+ src.alu.swizzle_w = alu->src[i].swizzle[3];
+ }
write_src_full(ctx, &alu->src[i].src, src);
+
+ /* Store swizzles for vec8 and vec16. */
+ if (!packed) {
+ for (unsigned o = 0; o < src_channels; o += 8) {
+ unsigned value = 0;
+
+ for (unsigned j = 0; j < 8 && o + j < src_channels; j++) {
+ value |= (uint32_t)alu->src[i].swizzle[o + j] <<
+ (4 * j); /* 4 bits per swizzle */
+ }
+
+ blob_write_uint32(ctx->blob, value);
+ }
+ }
}
}
}
alu->no_signed_wrap = header.alu.no_signed_wrap;
alu->no_unsigned_wrap = header.alu.no_unsigned_wrap;
alu->dest.saturate = header.alu.saturate;
- alu->dest.write_mask = header.alu.writemask;
read_dest(ctx, &alu->dest.dest, &alu->instr, header);
+ unsigned dst_components = nir_dest_num_components(alu->dest.dest);
+
+ if (alu->dest.dest.is_ssa) {
+ alu->dest.write_mask = u_bit_consecutive(0, dst_components);
+ } else if (dst_components <= 4) {
+ alu->dest.write_mask = header.alu.writemask_or_two_swizzles;
+ } else {
+ alu->dest.write_mask = blob_read_uint32(ctx->blob);
+ }
+
if (header.alu.packed_src_ssa_16bit) {
for (unsigned i = 0; i < num_srcs; i++) {
nir_alu_src *src = &alu->src[i];
} else {
for (unsigned i = 0; i < num_srcs; i++) {
union packed_src src = read_src(ctx, &alu->src[i].src, &alu->instr);
+ unsigned src_channels = nir_ssa_alu_instr_src_components(alu, i);
+ unsigned src_components = nir_src_num_components(alu->src[i].src);
+ bool packed = src_components <= 4 && src_channels <= 4;
alu->src[i].negate = src.alu.negate;
alu->src[i].abs = src.alu.abs;
- alu->src[i].swizzle[0] = src.alu.swizzle_x;
- alu->src[i].swizzle[1] = src.alu.swizzle_y;
- alu->src[i].swizzle[2] = src.alu.swizzle_z;
- alu->src[i].swizzle[3] = src.alu.swizzle_w;
+
+ memset(&alu->src[i].swizzle, 0, sizeof(alu->src[i].swizzle));
+
+ if (packed) {
+ alu->src[i].swizzle[0] = src.alu.swizzle_x;
+ alu->src[i].swizzle[1] = src.alu.swizzle_y;
+ alu->src[i].swizzle[2] = src.alu.swizzle_z;
+ alu->src[i].swizzle[3] = src.alu.swizzle_w;
+ } else {
+ /* Load swizzles for vec8 and vec16. */
+ for (unsigned o = 0; o < src_channels; o += 8) {
+ unsigned value = blob_read_uint32(ctx->blob);
+
+ for (unsigned j = 0; j < 8 && o + j < src_channels; j++) {
+ alu->src[i].swizzle[o + j] =
+ (value >> (4 * j)) & 0xf; /* 4 bits per swizzle */
+ }
+ }
+ }
}
}
+ if (header.alu.packed_src_ssa_16bit &&
+ alu->dest.dest.is_ssa) {
+ alu->src[0].swizzle[0] = header.alu.writemask_or_two_swizzles & 0x3;
+ if (num_srcs > 1)
+ alu->src[1].swizzle[0] = header.alu.writemask_or_two_swizzles >> 2;
+ }
+
return alu;
}
header.deref_var.object_idx = var_idx;
}
- write_dest(ctx, &deref->dest, header);
+ if (deref->deref_type == nir_deref_type_array ||
+ deref->deref_type == nir_deref_type_ptr_as_array) {
+ header.deref.packed_src_ssa_16bit =
+ deref->parent.is_ssa && deref->arr.index.is_ssa &&
+ are_object_ids_16bit(ctx);
+ }
+
+ write_dest(ctx, &deref->dest, header, deref->instr.type);
switch (deref->deref_type) {
case nir_deref_type_var:
case nir_deref_type_array:
case nir_deref_type_ptr_as_array:
- write_src(ctx, &deref->parent);
- write_src(ctx, &deref->arr.index);
+ if (header.deref.packed_src_ssa_16bit) {
+ blob_write_uint16(ctx->blob,
+ write_lookup_object(ctx, deref->parent.ssa));
+ blob_write_uint16(ctx->blob,
+ write_lookup_object(ctx, deref->arr.index.ssa));
+ } else {
+ write_src(ctx, &deref->parent);
+ write_src(ctx, &deref->arr.index);
+ }
break;
case nir_deref_type_cast:
case nir_deref_type_array:
case nir_deref_type_ptr_as_array:
- read_src(ctx, &deref->parent, &deref->instr);
+ if (header.deref.packed_src_ssa_16bit) {
+ deref->parent.is_ssa = true;
+ deref->parent.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob));
+ deref->arr.index.is_ssa = true;
+ deref->arr.index.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob));
+ } else {
+ read_src(ctx, &deref->parent, &deref->instr);
+ read_src(ctx, &deref->arr.index, &deref->instr);
+ }
+
parent = nir_src_as_deref(deref->parent);
if (deref->deref_type == nir_deref_type_array)
deref->type = glsl_get_array_element(parent->type);
else
deref->type = parent->type;
- read_src(ctx, &deref->arr.index, &deref->instr);
break;
case nir_deref_type_cast:
header.intrinsic.instr_type = intrin->instr.type;
header.intrinsic.intrinsic = intrin->intrinsic;
- header.intrinsic.num_components =
- encode_num_components_in_3bits(intrin->num_components);
/* Analyze constant indices to decide how to encode them. */
if (num_indices) {
max_bits = MAX2(max_bits, max);
}
- if (max_bits * num_indices <= 6) {
- header.intrinsic.const_indices_encoding = const_indices_6bit_all_combined;
+ if (max_bits * num_indices <= 9) {
+ header.intrinsic.const_indices_encoding = const_indices_9bit_all_combined;
-      /* Pack all const indices into 6 bits. */
+      /* Pack all const indices into 9 bits. */
- unsigned bit_size = 6 / num_indices;
+ unsigned bit_size = 9 / num_indices;
for (unsigned i = 0; i < num_indices; i++) {
header.intrinsic.packed_const_indices |=
intrin->const_index[i] << (i * bit_size);
}
if (nir_intrinsic_infos[intrin->intrinsic].has_dest)
- write_dest(ctx, &intrin->dest, header);
+ write_dest(ctx, &intrin->dest, header, intrin->instr.type);
else
blob_write_uint32(ctx->blob, header.u32);
unsigned num_srcs = nir_intrinsic_infos[op].num_srcs;
unsigned num_indices = nir_intrinsic_infos[op].num_indices;
- intrin->num_components =
- decode_num_components_in_3bits(header.intrinsic.num_components);
-
if (nir_intrinsic_infos[op].has_dest)
read_dest(ctx, &intrin->dest, &intrin->instr, header);
for (unsigned i = 0; i < num_srcs; i++)
read_src(ctx, &intrin->src[i], &intrin->instr);
+   /* Vectorized intrinsics have num_components same as dst or src that has
+ * 0 components in the info. Find it.
+ */
+ if (nir_intrinsic_infos[op].has_dest &&
+ nir_intrinsic_infos[op].dest_components == 0) {
+ intrin->num_components = nir_dest_num_components(intrin->dest);
+ } else {
+ for (unsigned i = 0; i < num_srcs; i++) {
+ if (nir_intrinsic_infos[op].src_components[i] == 0) {
+ intrin->num_components = nir_src_num_components(intrin->src[i]);
+ break;
+ }
+ }
+ }
+
if (num_indices) {
switch (header.intrinsic.const_indices_encoding) {
- case const_indices_6bit_all_combined: {
- unsigned bit_size = 6 / num_indices;
+ case const_indices_9bit_all_combined: {
+ unsigned bit_size = 9 / num_indices;
unsigned bit_mask = u_bit_consecutive(0, bit_size);
for (unsigned i = 0; i < num_indices; i++) {
intrin->const_index[i] =
union packed_tex_data {
uint32_t u32;
struct {
- enum glsl_sampler_dim sampler_dim:4;
- nir_alu_type dest_type:8;
+ unsigned sampler_dim:4;
+ unsigned dest_type:8;
unsigned coord_components:3;
unsigned is_array:1;
unsigned is_shadow:1;
unsigned is_new_style_shadow:1;
unsigned component:2;
- unsigned unused:10; /* Mark unused for valgrind. */
+ unsigned texture_non_uniform:1;
+ unsigned sampler_non_uniform:1;
+ unsigned unused:8; /* Mark unused for valgrind. */
} u;
};
{
assert(tex->num_srcs < 16);
assert(tex->op < 16);
- assert(tex->texture_array_size < 1024);
union packed_instr header;
header.u32 = 0;
header.tex.instr_type = tex->instr.type;
header.tex.num_srcs = tex->num_srcs;
header.tex.op = tex->op;
- header.tex.texture_array_size = tex->texture_array_size;
- write_dest(ctx, &tex->dest, header);
+ write_dest(ctx, &tex->dest, header, tex->instr.type);
blob_write_uint32(ctx->blob, tex->texture_index);
blob_write_uint32(ctx->blob, tex->sampler_index);
.u.is_shadow = tex->is_shadow,
.u.is_new_style_shadow = tex->is_new_style_shadow,
.u.component = tex->component,
+ .u.texture_non_uniform = tex->texture_non_uniform,
+ .u.sampler_non_uniform = tex->sampler_non_uniform,
};
blob_write_uint32(ctx->blob, packed.u32);
tex->op = header.tex.op;
tex->texture_index = blob_read_uint32(ctx->blob);
- tex->texture_array_size = header.tex.texture_array_size;
tex->sampler_index = blob_read_uint32(ctx->blob);
if (tex->op == nir_texop_tg4)
blob_copy_bytes(ctx->blob, tex->tg4_offsets, sizeof(tex->tg4_offsets));
tex->is_shadow = packed.u.is_shadow;
tex->is_new_style_shadow = packed.u.is_new_style_shadow;
tex->component = packed.u.component;
+ tex->texture_non_uniform = packed.u.texture_non_uniform;
+ tex->sampler_non_uniform = packed.u.sampler_non_uniform;
for (unsigned i = 0; i < tex->num_srcs; i++) {
union packed_src src = read_src(ctx, &tex->src[i].src, &tex->instr);
* and then store enough information so that a later fixup pass can fill
* them in correctly.
*/
- write_dest(ctx, &phi->dest, header);
+ write_dest(ctx, &phi->dest, header, phi->instr.type);
nir_foreach_phi_src(src, phi) {
assert(src->src.is_ssa);
}
}
-static void
+/* Return the number of instructions read. */
+static unsigned
read_instr(read_ctx *ctx, nir_block *block)
{
STATIC_ASSERT(sizeof(union packed_instr) == 4);
switch (header.any.instr_type) {
case nir_instr_type_alu:
- instr = &read_alu(ctx, header)->instr;
- break;
+ for (unsigned i = 0; i <= header.alu.num_followup_alu_sharing_header; i++)
+ nir_instr_insert_after_block(block, &read_alu(ctx, header)->instr);
+ return header.alu.num_followup_alu_sharing_header + 1;
case nir_instr_type_deref:
instr = &read_deref(ctx, header)->instr;
break;
* are read so that we can set their sources up.
*/
read_phi(ctx, block, header);
- return;
+ return 1;
case nir_instr_type_jump:
instr = &read_jump(ctx, header)->instr;
break;
}
nir_instr_insert_after_block(block, instr);
+ return 1;
}
static void
{
write_add_object(ctx, block);
blob_write_uint32(ctx->blob, exec_list_length(&block->instr_list));
- nir_foreach_instr(instr, block)
+
+ ctx->last_instr_type = ~0;
+ ctx->last_alu_header_offset = 0;
+
+ nir_foreach_instr(instr, block) {
write_instr(ctx, instr);
+ ctx->last_instr_type = instr->type;
+ }
}
static void
read_add_object(ctx, block);
unsigned num_instrs = blob_read_uint32(ctx->blob);
- for (unsigned i = 0; i < num_instrs; i++) {
- read_instr(ctx, block);
+ for (unsigned i = 0; i < num_instrs;) {
+ i += read_instr(ctx, block);
}
}
static void
write_function_impl(write_ctx *ctx, const nir_function_impl *fi)
{
+ blob_write_uint8(ctx->blob, fi->structured);
+
write_var_list(ctx, &fi->locals);
write_reg_list(ctx, &fi->registers);
blob_write_uint32(ctx->blob, fi->reg_alloc);
nir_function_impl *fi = nir_function_impl_create_bare(ctx->nir);
fi->function = fxn;
+ fi->structured = blob_read_uint8(ctx->blob);
+
read_var_list(ctx, &fi->locals);
read_reg_list(ctx, &fi->registers);
fi->reg_alloc = blob_read_uint32(ctx->blob);
info.name = info.label = NULL;
blob_write_bytes(blob, (uint8_t *) &info, sizeof(info));
- write_var_list(&ctx, &nir->uniforms);
- write_var_list(&ctx, &nir->inputs);
- write_var_list(&ctx, &nir->outputs);
- write_var_list(&ctx, &nir->shared);
- write_var_list(&ctx, &nir->globals);
- write_var_list(&ctx, &nir->system_values);
+ write_var_list(&ctx, &nir->variables);
blob_write_uint32(blob, nir->num_inputs);
blob_write_uint32(blob, nir->num_uniforms);
ctx.nir->info = info;
- read_var_list(&ctx, &ctx.nir->uniforms);
- read_var_list(&ctx, &ctx.nir->inputs);
- read_var_list(&ctx, &ctx.nir->outputs);
- read_var_list(&ctx, &ctx.nir->shared);
- read_var_list(&ctx, &ctx.nir->globals);
- read_var_list(&ctx, &ctx.nir->system_values);
+ read_var_list(&ctx, &ctx.nir->variables);
ctx.nir->num_inputs = blob_read_uint32(blob);
ctx.nir->num_uniforms = blob_read_uint32(blob);