if (ins->src[src] >= l->node_count)
return;
- bool vector = (bi_class_props[ins->type] & BI_VECTOR);
+ bool vector = (bi_class_props[ins->type] & BI_VECTOR) && src == 0;
unsigned offset = 0;
if (vector) {
unsigned size = nir_alu_type_get_type_size(T);
unsigned bytes = (MAX2(size, 8) / 8);
unsigned comps_per_reg = 4 / bytes;
- unsigned components = bi_get_component_count(ins);
+ unsigned components = bi_get_component_count(ins, src);
for (unsigned i = 0; i < components; ++i) {
unsigned off = ins->swizzle[src][i] / comps_per_reg;
address.src_types[2] = nir_type_uint32;
address.src_types[3] = nir_intrinsic_type(instr);
address.dest = bi_make_temp(ctx);
- address.dest_type = nir_type_uint64;
- address.writemask = (1 << 8) - 1;
+ address.dest_type = nir_type_uint32;
+ address.writemask = (1 << 12) - 1;
bi_instruction st = {
.type = BI_STORE_VAR,
.src = {
- address.dest,
- bir_src_index(&instr->src[0])
+ bir_src_index(&instr->src[0]),
+ address.dest, address.dest, address.dest,
},
.src_types = {
- nir_type_uint64,
- nir_type_uint32
+ nir_type_uint32,
+ nir_type_uint32, nir_type_uint32, nir_type_uint32,
},
.swizzle = {
- { 0 },
- { 0, 1, 2, 3 }
+ { 0, 1, 2, 3 },
+ { 0 }, { 1 }, { 2}
}
};
}
unsigned
-bi_get_component_count(bi_instruction *ins)
+bi_get_component_count(bi_instruction *ins, unsigned src)
{
if (bi_class_props[ins->type] & BI_VECTOR) {
- return 4;
+ return (src == 0) ? 4 : 1;
} else {
/* Stores imply VECTOR */
assert(ins->dest_type);
bi_bytemask_of_read_components(bi_instruction *ins, unsigned node)
{
uint16_t mask = 0x0;
- unsigned component_count = bi_get_component_count(ins);
bi_foreach_src(ins, s) {
if (ins->src[s] != node) continue;
+ unsigned component_count = bi_get_component_count(ins, s);
nir_alu_type T = ins->src_types[s];
unsigned size = nir_alu_type_get_type_size(T);
unsigned bytes = (MAX2(size, 8) / 8);
* the end of a clause. Implies ADD */
#define BI_SCHED_HI_LATENCY (1 << 7)
-/* Intrinsic is vectorized and should read 4 components regardless of writemask */
+/* Intrinsic is vectorized and should read 4 components in the first source
+ * regardless of writemask */
#define BI_VECTOR (1 << 8)
/* Use a data register for src0/dest respectively, bypassing the usual
bool bi_is_src_swizzled(bi_instruction *ins, unsigned s);
bool bi_has_arg(bi_instruction *ins, unsigned arg);
uint16_t bi_from_bytemask(uint16_t bytemask, unsigned bytes);
-unsigned bi_get_component_count(bi_instruction *ins);
+unsigned bi_get_component_count(bi_instruction *ins, unsigned s);
unsigned bi_load32_components(bi_instruction *ins);
uint16_t bi_bytemask_of_read_components(bi_instruction *ins, unsigned node);