From fad158a0e01f4c28851477e6d1eb5c8fd67e226b Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sun, 10 Jan 2016 14:10:08 -0500 Subject: [PATCH] freedreno/ir3: array rework Signed-off-by: Rob Clark --- .../drivers/freedreno/freedreno_screen.c | 9 +- src/gallium/drivers/freedreno/ir3/ir3.c | 35 +- src/gallium/drivers/freedreno/ir3/ir3.h | 91 +++-- .../drivers/freedreno/ir3/ir3_compiler_nir.c | 333 ++++-------------- src/gallium/drivers/freedreno/ir3/ir3_cp.c | 29 +- src/gallium/drivers/freedreno/ir3/ir3_depth.c | 4 + src/gallium/drivers/freedreno/ir3/ir3_print.c | 34 +- src/gallium/drivers/freedreno/ir3/ir3_ra.c | 190 +++++++--- src/gallium/drivers/freedreno/ir3/ir3_sched.c | 3 + 9 files changed, 365 insertions(+), 363 deletions(-) diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index a75b04b327a..6562924dae1 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -400,9 +400,16 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 1; case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + /* Technically this should be the same as for TEMP/CONST, since + * everything is just normal registers. This is just temporary + * hack until load_input/store_output handle arrays in a similar + * way as load_var/store_var.. + */ + return 0; case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: - return 1; + /* a2xx compiler doesn't handle indirect: */ + return is_ir3(screen) ? 1 : 0; case PIPE_SHADER_CAP_SUBROUTINES: case PIPE_SHADER_CAP_DOUBLES: case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index b24825cff85..be415d8e5fe 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -81,6 +81,7 @@ struct ir3 * ir3_create(struct ir3_compiler *compiler, shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout); list_inithead(&shader->block_list); + list_inithead(&shader->array_list); return shader; } @@ -121,18 +122,19 @@ static uint32_t reg(struct ir3_register *reg, struct ir3_info *info, val.iim_val = reg->iim_val; } else { unsigned components; + int16_t max; if (reg->flags & IR3_REG_RELATIV) { components = reg->size; - val.dummy10 = reg->offset; + val.dummy10 = reg->array.offset; + max = (reg->array.offset + repeat + components - 1) >> 2; } else { components = util_last_bit(reg->wrmask); val.comp = reg->num & 0x3; val.num = reg->num >> 2; + max = (reg->num + repeat + components - 1) >> 2; } - int16_t max = (reg->num + repeat + components - 1) >> 2; - if (reg->flags & IR3_REG_CONST) { info->max_const = MAX2(info->max_const, max); } else if (val.num == 63) { @@ -233,7 +235,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr, iassert((instr->regs_count == 2) || (instr->regs_count == 3)); if (src1->flags & IR3_REG_RELATIV) { - iassert(src1->num < (1 << 10)); + iassert(src1->array.offset < (1 << 10)); cat2->rel1.src1 = reg(src1, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); @@ -260,7 +262,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr, !((src1->flags ^ src2->flags) & IR3_REG_HALF)); if (src2->flags & IR3_REG_RELATIV) { - iassert(src2->num < (1 << 10)); + iassert(src2->array.offset < (1 << 10)); cat2->rel2.src2 = reg(src2, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST 
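/* A hypothetical standalone form of the footprint calculation used by
 * the reg() hunk above (illustrative only): for relative operands the
 * highest touched vec4 register is now derived from array.offset,
 * otherwise from reg->num.  "components" is reg->size for relative
 * operands and util_last_bit(reg->wrmask) otherwise, as in reg() itself.
 */
static inline int16_t max_vec4_touched(const struct ir3_register *reg,
		unsigned repeat, unsigned components)
{
	if (reg->flags & IR3_REG_RELATIV)
		return (reg->array.offset + repeat + components - 1) >> 2;
	return (reg->num + repeat + components - 1) >> 2;
}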
| IR3_REG_R | IR3_REG_HALF | absneg); @@ -333,7 +335,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr, iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF)); if (src1->flags & IR3_REG_RELATIV) { - iassert(src1->num < (1 << 10)); + iassert(src1->array.offset < (1 << 10)); cat3->rel1.src1 = reg(src1, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); @@ -361,7 +363,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr, if (src3->flags & IR3_REG_RELATIV) { - iassert(src3->num < (1 << 10)); + iassert(src3->array.offset < (1 << 10)); cat3->rel2.src3 = reg(src3, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); @@ -404,7 +406,7 @@ static int emit_cat4(struct ir3_instruction *instr, void *ptr, iassert(instr->regs_count == 2); if (src->flags & IR3_REG_RELATIV) { - iassert(src->num < (1 << 10)); + iassert(src->array.offset < (1 << 10)); cat4->rel.src = reg(src, info, instr->repeat, IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG | IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF); @@ -737,6 +739,14 @@ struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, return reg; } +struct ir3_register * ir3_reg_clone(struct ir3 *shader, + struct ir3_register *reg) +{ + struct ir3_register *new_reg = reg_create(shader, 0, 0); + *new_reg = *reg; + return new_reg; +} + void ir3_instr_set_address(struct ir3_instruction *instr, struct ir3_instruction *addr) @@ -777,3 +787,12 @@ ir3_count_instructions(struct ir3 *ir) } return cnt; } + +struct ir3_array * +ir3_lookup_array(struct ir3 *ir, unsigned id) +{ + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) + if (arr->id == id) + return arr; + return NULL; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 62d14a0ae37..1e5a1e9ee8b 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -83,7 +83,8 @@ struct ir3_register { * before register assignment is done: */ IR3_REG_SSA = 0x2000, /* 'instr' is ptr to assigning instr */ - IR3_REG_PHI_SRC= 0x4000, /* phi src, regs[0]->instr points to phi */ + IR3_REG_ARRAY = 0x4000, + IR3_REG_PHI_SRC= 0x8000, /* phi src, regs[0]->instr points to phi */ } flags; union { @@ -97,11 +98,18 @@ struct ir3_register { uint32_t uim_val; float fim_val; /* relative: */ - int offset; + struct { + uint16_t id; + uint16_t offset; + } array; }; - /* for IR3_REG_SSA, src registers contain ptr back to - * assigning instruction. + /* For IR3_REG_SSA, src registers contain ptr back to assigning + * instruction. + * + * For IR3_REG_ARRAY, the pointer is back to the last dependent + * array access (although the net effect is the same, it points + * back to a previous instruction that we depend on). */ struct ir3_instruction *instr; @@ -221,9 +229,6 @@ struct ir3_instruction { struct { int off; /* component/offset */ } fo; - struct { - int aid; - } fi; struct { /* used to temporarily hold reference to nir_phi_instr * until we resolve the phi srcs @@ -293,19 +298,6 @@ struct ir3_instruction { */ struct ir3_instruction *address; - /* in case of a instruction with relative dst instruction, we need to - * capture the dependency on the fanin for the previous values of - * the array elements. Since we don't know at compile time actually - * which array elements are written, this serves to preserve the - * unconditional write to array elements prior to the conditional - * write. - * - * TODO only cat1 can do indirect write.. 
we could maybe move this - * into instr->cat1.fanin (but would require the frontend to insert - * the extra mov) - */ - struct ir3_instruction *fanin; - /* Entry in ir3_block's instruction list: */ struct list_head node; @@ -379,10 +371,39 @@ struct ir3 { /* List of blocks: */ struct list_head block_list; + /* List of ir3_array's: */ + struct list_head array_list; + unsigned heap_idx; struct ir3_heap_chunk *chunk; }; +typedef struct nir_variable nir_variable; + +struct ir3_array { + struct list_head node; + unsigned length; + unsigned id; + + nir_variable *var; + + /* We track the last write and last access (read or write) to + * setup dependencies on instructions that read or write the + * array. Reads can be re-ordered wrt. other reads, but should + * not be re-ordered wrt. to writes. Writes cannot be reordered + * wrt. any other access to the array. + * + * So array reads depend on last write, and array writes depend + * on the last access. + */ + struct ir3_instruction *last_write, *last_access; + + /* extra stuff used in RA pass: */ + unsigned base; +}; + +struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id); + typedef struct nir_block nir_block; struct ir3_block { @@ -430,6 +451,8 @@ const char *ir3_instr_name(struct ir3_instruction *instr); struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags); +struct ir3_register * ir3_reg_clone(struct ir3 *shader, + struct ir3_register *reg); void ir3_instr_set_address(struct ir3_instruction *instr, struct ir3_instruction *addr); @@ -510,6 +533,9 @@ static inline bool is_same_type_mov(struct ir3_instruction *instr) if (dst->num == regid(REG_A0, 0)) return false; + if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY)) + return false; + if ((instr->category == 1) && (instr->cat1.src_type == instr->cat1.dst_type)) return true; @@ -623,8 +649,10 @@ static inline bool writes_pred(struct ir3_instruction *instr) /* TODO better name */ static inline struct ir3_instruction *ssa(struct ir3_register *reg) { - if (reg->flags & IR3_REG_SSA) + if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) { + debug_assert(!(reg->instr && (reg->instr->flags & IR3_INSTR_UNUSED))); return reg->instr; + } return NULL; } @@ -813,8 +841,6 @@ static inline unsigned ir3_cat3_absneg(opc_t opc) static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) { - if (instr->fanin) - return instr->regs_count + 2; if (instr->address) return instr->regs_count + 1; return instr->regs_count; @@ -822,8 +848,6 @@ static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n) { - if (n == (instr->regs_count + 1)) - return instr->fanin; if (n == (instr->regs_count + 0)) return instr->address; return ssa(instr->regs[n]); @@ -834,8 +858,8 @@ static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr /* iterator for an instruction's SSA sources (instr), also returns src #: */ #define foreach_ssa_src_n(__srcinst, __n, __instr) \ if ((__instr)->regs_count) \ - for (unsigned __cnt = __ssa_src_cnt(__instr) - 1, __n = 0; __n < __cnt; __n++) \ - if ((__srcinst = __ssa_src_n(__instr, __n + 1))) + for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \ + if ((__srcinst = __ssa_src_n(__instr, __n))) /* iterator for an instruction's SSA sources (instr): */ #define foreach_ssa_src(__srcinst, __instr) \ @@ -878,7 +902,15 @@ ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type) struct ir3_instruction 
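/* Hypothetical helpers sketching the ordering rule documented in
 * struct ir3_array above; the real updates live in create_var_load()
 * and create_var_store() in ir3_compiler_nir.c further down.  Reads
 * depend on the last write, writes depend on the last access of
 * either kind:
 */
static void array_mark_read(struct ir3_array *arr, struct ir3_instruction *mov)
{
	mov->regs[1]->instr = arr->last_write;     /* read-after-write dependency */
	arr->last_access = mov;
}

static void array_mark_write(struct ir3_array *arr, struct ir3_instruction *mov)
{
	mov->regs[0]->instr = arr->last_access;    /* write-after-read/write dependency */
	arr->last_write = arr->last_access = mov;
}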
*instr = ir3_instr_create(block, 1, 0); ir3_reg_create(instr, 0, 0); /* dst */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + if (src->regs[0]->flags & IR3_REG_ARRAY) { + struct ir3_register *src_reg = + ir3_reg_create(instr, 0, IR3_REG_ARRAY); + src_reg->array = src->regs[0]->array; + src_reg->instr = src; + } else { + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + } + debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV)); instr->cat1.src_type = type; instr->cat1.dst_type = type; return instr; @@ -894,6 +926,7 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src, ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; instr->cat1.src_type = src_type; instr->cat1.dst_type = dst_type; + debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY)); return instr; } @@ -1083,7 +1116,7 @@ typedef uint8_t regmask_t[2 * MAX_REG / 8]; static inline unsigned regmask_idx(struct ir3_register *reg) { - unsigned num = reg->num; + unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num; debug_assert(num < MAX_REG); if (reg->flags & IR3_REG_HALF) num += MAX_REG; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index e5d39097267..bd0ee89a1ec 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -74,8 +74,6 @@ struct ir3_compile { /* mapping from nir_register to defining instruction: */ struct hash_table *def_ht; - /* mapping from nir_variable to ir3_array: */ - struct hash_table *var_ht; unsigned num_arrays; /* a common pattern for indirect addressing is to request the @@ -142,8 +140,6 @@ compile_init(struct ir3_compiler *compiler, ctx->so = so; ctx->def_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); - ctx->var_ht = _mesa_hash_table_create(ctx, - _mesa_hash_pointer, _mesa_key_pointer_equal); ctx->block_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); @@ -220,206 +216,26 @@ compile_free(struct ir3_compile *ctx) ralloc_free(ctx); } -/* global per-array information: */ -struct ir3_array { - unsigned length, aid; -}; - -/* per-block array state: */ -struct ir3_array_value { - /* TODO drop length/aid, and just have ptr back to ir3_array */ - unsigned length, aid; - /* initial array element values are phi's, other than for the - * entry block. The phi src's get added later in a resolve step - * after we have visited all the blocks, to account for back - * edges in the cfg. - */ - struct ir3_instruction **phis; - /* current array element values (as block is processed). When - * the array phi's are resolved, it will contain the array state - * at exit of block, so successor blocks can use it to add their - * phi srcs. - */ - struct ir3_instruction *arr[]; -}; - -/* track array assignments per basic block. When an array is read - * outside of the same basic block, we can use NIR's dominance-frontier - * information to figure out where phi nodes are needed. 
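/* Worked example for the new declare_var() above, assuming a local
 * "vec4 a[4]" as the first array declared in the shader:
 *
 *   arr->length = glsl_get_length(type) * 4 = 16   (flattened scalar slots)
 *   arr->id     = 1
 *   arr->var    = the nir_variable for "a"
 *
 * and the array is linked on ir->array_list, so later passes can find
 * it again with ir3_lookup_array(ir, 1).
 */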
- */ -struct ir3_nir_block_data { - unsigned foo; - /* indexed by array-id (aid): */ - struct ir3_array_value *arrs[]; -}; - -static struct ir3_nir_block_data * -get_block_data(struct ir3_compile *ctx, struct ir3_block *block) -{ - if (!block->data) { - struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) + - ((ctx->num_arrays + 1) * sizeof(bd->arrs[0]))); - block->data = bd; - } - return block->data; -} - static void declare_var(struct ir3_compile *ctx, nir_variable *var) { unsigned length = glsl_get_length(var->type) * 4; /* always vec4, at least with ttn */ struct ir3_array *arr = ralloc(ctx, struct ir3_array); + arr->id = ++ctx->num_arrays; arr->length = length; - arr->aid = ++ctx->num_arrays; - _mesa_hash_table_insert(ctx->var_ht, var, arr); + arr->var = var; + list_addtail(&arr->node, &ctx->ir->array_list); } -static nir_block * -nir_block_pred(nir_block *block) -{ - assert(block->predecessors->entries < 2); - if (block->predecessors->entries == 0) - return NULL; - return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key; -} - -static struct ir3_array_value * +static struct ir3_array * get_var(struct ir3_compile *ctx, nir_variable *var) { - struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var); - struct ir3_block *block = ctx->block; - struct ir3_nir_block_data *bd = get_block_data(ctx, block); - struct ir3_array *arr = entry->data; - - if (!bd->arrs[arr->aid]) { - struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) + - (arr->length * sizeof(av->arr[0]))); - struct ir3_array_value *defn = NULL; - nir_block *pred_block; - - av->length = arr->length; - av->aid = arr->aid; - - /* For loops, we have to consider that we have not visited some - * of the blocks who should feed into the phi (ie. back-edges in - * the cfg).. for example: - * - * loop { - * block { load_var; ... } - * if then block {} else block {} - * block { store_var; ... } - * if then block {} else block {} - * block {...} - * } - * - * We can skip the phi if we can chase the block predecessors - * until finding the block previously defining the array without - * crossing a block that has more than one predecessor. - * - * Otherwise create phi's and resolve them as a post-pass after - * all the blocks have been visited (to handle back-edges). - */ - - for (pred_block = block->nblock; - pred_block && (pred_block->predecessors->entries < 2) && !defn; - pred_block = nir_block_pred(pred_block)) { - struct ir3_block *pblock = get_block(ctx, pred_block); - struct ir3_nir_block_data *pbd = pblock->data; - if (!pbd) - continue; - defn = pbd->arrs[arr->aid]; - } - - if (defn) { - /* only one possible definer: */ - for (unsigned i = 0; i < arr->length; i++) - av->arr[i] = defn->arr[i]; - } else if (pred_block) { - /* not the first block, and multiple potential definers: */ - av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0])); - - for (unsigned i = 0; i < arr->length; i++) { - struct ir3_instruction *phi; - - phi = ir3_instr_create2(block, -1, OPC_META_PHI, - 1 + ctx->impl->num_blocks); - ir3_reg_create(phi, 0, 0); /* dst */ - - /* phi's should go at head of block: */ - list_delinit(&phi->node); - list_add(&phi->node, &block->instr_list); - - av->phis[i] = av->arr[i] = phi; - } - } else { - /* Some shaders end up reading array elements without - * first writing.. 
so initialize things to prevent null - * instr ptrs later: - */ - for (unsigned i = 0; i < arr->length; i++) - av->arr[i] = create_immed(block, 0); - } - - bd->arrs[arr->aid] = av; - } - - return bd->arrs[arr->aid]; -} - -static void -add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock, - struct ir3_array_value *av, BITSET_WORD *visited) -{ - struct ir3_block *block; - struct ir3_nir_block_data *bd; - - if (BITSET_TEST(visited, nblock->index)) - return; - - BITSET_SET(visited, nblock->index); - - block = get_block(ctx, nblock); - bd = block->data; - - if (bd && bd->arrs[av->aid]) { - struct ir3_array_value *dav = bd->arrs[av->aid]; - for (unsigned i = 0; i < av->length; i++) { - ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr = - dav->arr[i]; - } - } else { - /* didn't find defn, recurse predecessors: */ - struct set_entry *entry; - set_foreach(nblock->predecessors, entry) { - add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); - } - } -} - -static void -resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block) -{ - struct ir3_nir_block_data *bd = block->data; - unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks); - - if (!bd) - return; - - /* TODO use nir dom_frontier to help us with this? */ - - for (unsigned i = 1; i <= ctx->num_arrays; i++) { - struct ir3_array_value *av = bd->arrs[i]; - BITSET_WORD visited[bitset_words]; - struct set_entry *entry; - - if (!(av && av->phis)) - continue; - - memset(visited, 0, sizeof(visited)); - set_foreach(block->nblock->predecessors, entry) { - add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); - } + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + if (arr->var == var) + return arr; } + compile_error(ctx, "bogus var: %s\n", var->name); + return NULL; } /* allocate a n element value array (to be populated by caller) and @@ -437,6 +253,7 @@ __get_dst(struct ir3_compile *ctx, void *key, unsigned n) static struct ir3_instruction ** get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n) { + compile_assert(ctx, dst->is_ssa); if (dst->is_ssa) { return __get_dst(ctx, &dst->ssa, n); } else { @@ -454,6 +271,7 @@ static struct ir3_instruction ** get_src(struct ir3_compile *ctx, nir_src *src) { struct hash_entry *entry; + compile_assert(ctx, src->is_ssa); if (src->is_ssa) { entry = _mesa_hash_table_search(ctx->def_ht, src->ssa); } else { @@ -568,7 +386,7 @@ create_uniform_indirect(struct ir3_compile *ctx, unsigned n, mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; ir3_reg_create(mov, 0, 0); - ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV); + ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n; ir3_instr_set_address(mov, address); @@ -607,17 +425,45 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n, src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV); src->instr = collect; src->size = arrsz; - src->offset = n; + src->array.offset = n; ir3_instr_set_address(mov, address); return mov; } +/* relative (indirect) if address!=NULL */ +static struct ir3_instruction * +create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, unsigned n, + struct ir3_instruction *address) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *mov; + struct ir3_register *src; + + mov = ir3_instr_create(block, 1, 0); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + ir3_reg_create(mov, 0, 0); + src = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + src->instr = 
arr->last_write; + src->size = arr->length; + src->array.id = arr->id; + src->array.offset = n; + + if (address) + ir3_instr_set_address(mov, address); + + arr->last_access = mov; + + return mov; +} + +/* relative (indirect) if address!=NULL */ static struct ir3_instruction * -create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n, - struct ir3_instruction *src, struct ir3_instruction *address, - struct ir3_instruction *collect) +create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, unsigned n, + struct ir3_instruction *src, struct ir3_instruction *address) { struct ir3_block *block = ctx->block; struct ir3_instruction *mov; @@ -626,14 +472,18 @@ create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n, mov = ir3_instr_create(block, 1, 0); mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; - dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV); - dst->size = arrsz; - dst->offset = n; + dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + dst->instr = arr->last_access; + dst->size = arr->length; + dst->array.id = arr->id; + dst->array.offset = n; ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src; - mov->fanin = collect; ir3_instr_set_address(mov, address); + arr->last_write = arr->last_access = mov; + return mov; } @@ -1198,7 +1048,7 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array_value *arr = get_var(ctx, dvar->var); + struct ir3_array *arr = get_var(ctx, dvar->var); compile_assert(ctx, dvar->deref.child && (dvar->deref.child->deref_type == nir_deref_type_array)); @@ -1209,19 +1059,17 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, for (int i = 0; i < intr->num_components; i++) { unsigned n = darr->base_offset * 4 + i; compile_assert(ctx, n < arr->length); - dst[i] = arr->arr[n]; + dst[i] = create_var_load(ctx, arr, n, NULL); } break; case nir_deref_array_type_indirect: { /* for indirect, we need to collect all the array elements: */ - struct ir3_instruction *collect = - create_collect(ctx->block, arr->arr, arr->length); struct ir3_instruction *addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]); for (int i = 0; i < intr->num_components; i++) { unsigned n = darr->base_offset * 4 + i; compile_assert(ctx, n < arr->length); - dst[i] = create_indirect_load(ctx, arr->length, n, addr, collect); + dst[i] = create_var_load(ctx, arr, n, addr); } break; } @@ -1238,8 +1086,9 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array_value *arr = get_var(ctx, dvar->var); - struct ir3_instruction **src; + struct ir3_array *arr = get_var(ctx, dvar->var); + struct ir3_instruction *addr, **src; + unsigned wrmask = intr->const_index[0]; compile_assert(ctx, dvar->deref.child && (dvar->deref.child->deref_type == nir_deref_type_array)); @@ -1248,66 +1097,24 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) switch (darr->deref_array_type) { case nir_deref_array_type_direct: - /* direct access does not require anything special: */ - for (int i = 0; i < intr->num_components; i++) { - /* ttn doesn't generate partial writemasks */ - assert(intr->const_index[0] == - (1 << intr->num_components) - 1); - - unsigned n = darr->base_offset * 4 + i; - compile_assert(ctx, n < 
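/* Worked example for the load path above, assuming a direct
 * "vec4 v = a[2];" (darr->base_offset == 2, num_components == 4): the
 * loop emits four scalar array reads of flattened elements 8..11 with
 * no address register:
 *
 *   dst[0] = create_var_load(ctx, arr, 8, NULL);
 *   ...
 *   dst[3] = create_var_load(ctx, arr, 11, NULL);
 *
 * For an indirect "vec4 v = a[i];" the same calls also pass the a0.x
 * writer returned by get_addr(), which marks each mov's source as
 * IR3_REG_RELATIV in addition to IR3_REG_ARRAY.
 */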
arr->length); - arr->arr[n] = src[i]; - } + addr = NULL; break; - case nir_deref_array_type_indirect: { - /* for indirect, create indirect-store and fan that out: */ - struct ir3_instruction *collect = - create_collect(ctx->block, arr->arr, arr->length); - struct ir3_instruction *addr = - get_addr(ctx, get_src(ctx, &darr->indirect)[0]); - for (int i = 0; i < intr->num_components; i++) { - /* ttn doesn't generate partial writemasks */ - assert(intr->const_index[0] == - (1 << intr->num_components) - 1); - - struct ir3_instruction *store; - unsigned n = darr->base_offset * 4 + i; - compile_assert(ctx, n < arr->length); - - store = create_indirect_store(ctx, arr->length, - n, src[i], addr, collect); - - store->fanin->fi.aid = arr->aid; - - /* TODO: probably split this out to be used for - * store_output_indirect? or move this into - * create_indirect_store()? - */ - for (int j = i; j < arr->length; j += intr->num_components) { - struct ir3_instruction *split; - - split = ir3_instr_create(ctx->block, -1, OPC_META_FO); - split->fo.off = j; - ir3_reg_create(split, 0, 0); - ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store; - - arr->arr[j] = split; - } - } - /* fixup fanout/split neighbors: */ - for (int i = 0; i < arr->length; i++) { - arr->arr[i]->cp.right = (i < (arr->length - 1)) ? - arr->arr[i+1] : NULL; - arr->arr[i]->cp.left = (i > 0) ? - arr->arr[i-1] : NULL; - } + case nir_deref_array_type_indirect: + addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]); break; - } default: compile_error(ctx, "Unhandled store deref type: %u\n", darr->deref_array_type); break; } + + for (int i = 0; i < intr->num_components; i++) { + if (!(wrmask & (1 << i))) + continue; + unsigned n = darr->base_offset * 4 + i; + compile_assert(ctx, n < arr->length); + create_var_store(ctx, arr, n, src[i], addr); + } } static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot, @@ -1835,8 +1642,6 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block) ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; } } - - resolve_array_phis(ctx, block); } static void diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index a6e69d2416f..0d88e7bc3ab 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -202,6 +202,7 @@ static void combine_flags(unsigned *dstflags, unsigned srcflags) *dstflags |= srcflags & IR3_REG_CONST; *dstflags |= srcflags & IR3_REG_IMMED; *dstflags |= srcflags & IR3_REG_RELATIV; + *dstflags |= srcflags & IR3_REG_ARRAY; } /* the "plain" MAD's (ie. 
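/* Worked example for the store loop above, assuming a partial write
 * "a[1].xz = ..." (darr->base_offset == 1, wrmask == 0x5): components
 * 1 and 3 fail the wrmask test, so only two stores are emitted, for
 * flattened elements 4 and 6; each create_var_store() call updates
 * arr->last_write/last_access so that later reads are ordered behind
 * it.
 */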
the ones that don't shift first src prior to @@ -233,6 +234,10 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) combine_flags(&new_flags, src_reg->flags); if (valid_flags(instr, n, new_flags)) { + if (new_flags & IR3_REG_ARRAY) { + debug_assert(!(reg->flags & IR3_REG_ARRAY)); + reg->array = src_reg->array; + } reg->flags = new_flags; reg->instr = ssa(src_reg); } @@ -283,6 +288,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) conflicts(instr->address, reg->instr->address)) return; + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; instr->regs[n+1] = src_reg; @@ -294,6 +300,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) if ((src_reg->flags & IR3_REG_RELATIV) && !conflicts(instr->address, reg->instr->address)) { + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; instr->regs[n+1] = src_reg; ir3_instr_set_address(instr, reg->instr->address); @@ -329,6 +336,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n) /* other than category 1 (mov) we can only encode up to 10 bits: */ if ((instr->category == 1) || !(iim_val & ~0x3ff)) { new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT); + src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; src_reg->iim_val = iim_val; instr->regs[n+1] = src_reg; @@ -349,9 +357,11 @@ eliminate_output_mov(struct ir3_instruction *instr) { if (is_eligible_mov(instr, false)) { struct ir3_register *reg = instr->regs[1]; - struct ir3_instruction *src_instr = ssa(reg); - debug_assert(src_instr); - return src_instr; + if (!(reg->flags & IR3_REG_ARRAY)) { + struct ir3_instruction *src_instr = ssa(reg); + debug_assert(src_instr); + return src_instr; + } } return instr; } @@ -379,9 +389,22 @@ instr_cp(struct ir3_instruction *instr) continue; instr_cp(src); + + /* TODO non-indirect access we could figure out which register + * we actually want and allow cp.. 
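/* The clone-before-modify pattern introduced above, for reference:
 * when copy-propagation wants to reuse the source instruction's
 * register with adjusted flags, it now copies the register first so
 * the adjustments land on a private ir3_register rather than on the
 * defining instruction's own operand:
 *
 *   src_reg = ir3_reg_clone(instr->block->shader, src_reg);
 *   src_reg->flags = new_flags;
 *   instr->regs[n+1] = src_reg;
 */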
+ */ + if (reg->flags & IR3_REG_ARRAY) + continue; + reg_cp(instr, reg, n); } + if (instr->regs[0]->flags & IR3_REG_ARRAY) { + struct ir3_instruction *src = ssa(instr->regs[0]); + if (src) + instr_cp(src); + } + if (instr->address) { instr_cp(instr->address); ir3_instr_set_address(instr, eliminate_output_mov(instr->address)); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c index 4bbc0458790..3354cbd23fa 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -118,6 +118,10 @@ ir3_instr_depth(struct ir3_instruction *instr) /* visit child to compute it's depth: */ ir3_instr_depth(src); + /* for array writes, no need to delay on previous write: */ + if (i == 0) + continue; + sd = ir3_delayslots(src, instr, i) + src->depth; instr->depth = MAX2(instr->depth, sd); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c index a84e7989cf8..ec832f5d72a 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -94,7 +94,7 @@ static void print_instr_name(struct ir3_instruction *instr) } } -static void print_reg_name(struct ir3_register *reg, bool followssa) +static void print_reg_name(struct ir3_register *reg) { if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) && (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))) @@ -106,20 +106,29 @@ static void print_reg_name(struct ir3_register *reg, bool followssa) if (reg->flags & IR3_REG_IMMED) { printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); - } else if (reg->flags & IR3_REG_SSA) { - printf("_"); - if (followssa) { - printf("["); + } else if (reg->flags & IR3_REG_ARRAY) { + printf("arr[id=%u, offset=%u, size=%u", reg->array.id, + reg->array.offset, reg->size); + /* for ARRAY we could have null src, for example first write + * instruction.. + */ + if (reg->instr) { + printf(", _["); print_instr_name(reg->instr); printf("]"); } + printf("]"); + } else if (reg->flags & IR3_REG_SSA) { + printf("_["); + print_instr_name(reg->instr); + printf("]"); } else if (reg->flags & IR3_REG_RELATIV) { if (reg->flags & IR3_REG_HALF) printf("h"); if (reg->flags & IR3_REG_CONST) - printf("c", reg->num); + printf("c", reg->array.offset); else - printf("\x1b[0;31mr\x1b[0m (%u)", reg->num, reg->size); + printf("\x1b[0;31mr\x1b[0m (%u)", reg->array.offset, reg->size); } else { if (reg->flags & IR3_REG_HALF) printf("h"); @@ -158,7 +167,7 @@ print_instr(struct ir3_instruction *instr, int lvl) for (i = 0; i < instr->regs_count; i++) { struct ir3_register *reg = instr->regs[i]; printf(i ? 
", " : " "); - print_reg_name(reg, !!i); + print_reg_name(reg); } if (instr->address) { @@ -168,13 +177,6 @@ print_instr(struct ir3_instruction *instr, int lvl) printf("]"); } - if (instr->fanin) { - printf(", fanin=_"); - printf("["); - print_instr_name(instr->fanin); - printf("]"); - } - if (instr->cp.left) { printf(", left=_"); printf("["); @@ -192,8 +194,6 @@ print_instr(struct ir3_instruction *instr, int lvl) if (is_meta(instr)) { if (instr->opc == OPC_META_FO) { printf(", off=%d", instr->fo.off); - } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) { - printf(", aid=%d", instr->fi.aid); } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c index 88ca95acbbf..3c42f8e1592 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -68,25 +68,24 @@ * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after * register assignment. But for us that is horrible from a scheduling * standpoint. Instead what we do is use idea of 'definer' instruction. - * Ie. the first instruction (lowest ip) to write to the array is the + * Ie. the first instruction (lowest ip) to write to the variable is the * one we consider from use/def perspective when building interference - * graph. (Other instructions which write other array elements just - * define the variable some more.) + * graph. (Other instructions which write other variable components + * just define the variable some more.) + * + * Arrays of arbitrary size are handled via pre-coloring a consecutive + * sequence of registers. Additional scalar (single component) reg + * names are allocated starting at ctx->class_base[total_class_count] + * (see arr->base), which are pre-colored. In the use/def graph direct + * access is treated as a single element use/def, and indirect access + * is treated as use or def of all array elements. (Only the first + * def is tracked, in case of multiple indirect writes, etc.) */ static const unsigned class_sizes[] = { 1, 2, 3, 4, 4 + 4, /* txd + 1d/2d */ 4 + 6, /* txd + 3d */ - /* temporary: until we can assign arrays, create classes so we - * can round up array to fit. 
NOTE with tgsi arrays should - * really all be multiples of four: - */ - 4 * 4, - 4 * 8, - 4 * 16, - 4 * 32, - }; #define class_count ARRAY_SIZE(class_sizes) @@ -265,8 +264,9 @@ struct ir3_ra_ctx { struct ir3_ra_reg_set *set; struct ra_graph *g; unsigned alloc_count; - unsigned class_alloc_count[total_class_count]; - unsigned class_base[total_class_count]; + /* one per class, plus one slot for arrays: */ + unsigned class_alloc_count[total_class_count + 1]; + unsigned class_base[total_class_count + 1]; unsigned instr_cnt; unsigned *def, *use; /* def/use table */ struct ir3_ra_instr_data *instrd; @@ -329,9 +329,6 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; struct ir3_instruction *d = NULL; - if (instr->fanin) - return get_definer(ctx, instr->fanin, sz, off); - if (id->defn) { *sz = id->sz; *off = id->off; @@ -485,10 +482,13 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) /* couple special cases: */ if (writes_addr(instr) || writes_pred(instr)) { id->cls = -1; - continue; + } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { + id->cls = total_class_count; + id->defn = instr; + } else { + id->defn = get_definer(ctx, instr, &id->sz, &id->off); + id->cls = size_to_class(id->sz, is_half(id->defn)); } - id->defn = get_definer(ctx, instr, &id->sz, &id->off); - id->cls = size_to_class(id->sz, is_half(id->defn)); } } @@ -518,8 +518,6 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) /* arrays which don't fit in one of the pre-defined class * sizes are pre-colored: - * - * TODO but we still need to allocate names for them, don't we?? */ if (id->cls >= 0) { instr->name = ctx->class_alloc_count[id->cls]++; @@ -531,7 +529,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) static void ra_init(struct ir3_ra_ctx *ctx) { - unsigned n; + unsigned n, base; ir3_clear_mark(ctx->ir); n = ir3_count_instructions(ctx->ir); @@ -550,11 +548,20 @@ ra_init(struct ir3_ra_ctx *ctx) * actual ra name is class_base[cls] + instr->name; */ ctx->class_base[0] = 0; - for (unsigned i = 1; i < total_class_count; i++) { + for (unsigned i = 1; i <= total_class_count; i++) { ctx->class_base[i] = ctx->class_base[i-1] + ctx->class_alloc_count[i-1]; } + /* and vreg names for array elements: */ + base = ctx->class_base[total_class_count]; + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + arr->base = base; + ctx->class_alloc_count[total_class_count] += arr->length; + base += arr->length; + } + ctx->alloc_count += ctx->class_alloc_count[total_class_count]; + ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); ralloc_steal(ctx->g, ctx->instrd); ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); @@ -566,6 +573,7 @@ __ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) { unsigned name; debug_assert(cls >= 0); + debug_assert(cls < total_class_count); /* we shouldn't get arrays here.. 
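/* How the RA name space is laid out by ra_init() below (numbers are
 * illustrative, assuming two 16-element arrays):
 *
 *   0 .. class_base[total_class_count]-1           per-class instruction names
 *   arr1->base = class_base[total_class_count]     16 scalar names
 *   arr2->base = arr1->base + 16                   16 more scalar names
 *
 * Every array element is a single-component node, e.g.:
 *
 *   unsigned name = arr->base + reg->array.offset;
 *   ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
 *
 * Its live range starts at the first def() and is extended by each
 * use(); direct accesses touch one such name, indirect accesses
 * def()/use() all of the array's names, and ra_alloc() later
 * pre-colors them to consecutive registers.
 */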
*/ name = ctx->class_base[cls] + defn->name; debug_assert(name < ctx->alloc_count); return name; @@ -590,6 +598,22 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) struct ir3_ra_block_data *bd; unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + void def(unsigned name, struct ir3_instruction *instr) + { + /* defined on first write: */ + if (!ctx->def[name]) + ctx->def[name] = instr->ip; + ctx->use[name] = instr->ip; + BITSET_SET(bd->def, name); + } + + void use(unsigned name, struct ir3_instruction *instr) + { + ctx->use[name] = MAX2(ctx->use[name], instr->ip); + if (!BITSET_TEST(bd->def, name)) + BITSET_SET(bd->use, name); + } + bd = rzalloc(ctx->g, struct ir3_ra_block_data); bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); @@ -601,6 +625,7 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { struct ir3_instruction *src; + struct ir3_register *reg; if (instr->regs_count == 0) continue; @@ -632,17 +657,45 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (writes_gpr(instr)) { struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_register *dst = instr->regs[0]; - if (id->defn == instr) { - unsigned name = ra_name(ctx, id); + if (dst->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, dst->array.id); + unsigned i; + + debug_assert(!(dst->flags & IR3_REG_PHI_SRC)); + + /* set the node class now.. in case we don't encounter + * this array dst again. From register_alloc algo's + * perspective, these are all single/scalar regs: + */ + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + ra_set_node_class(ctx->g, name, ctx->set->classes[0]); + } + + /* indirect write is treated like a write to all array + * elements, since we don't know which one is actually + * written: + */ + if (dst->flags & IR3_REG_RELATIV) { + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + def(name, instr); + } + } else { + unsigned name = arr->base + dst->array.offset; + def(name, instr); + } - ctx->def[name] = id->defn->ip; - ctx->use[name] = id->defn->ip; + } else if (id->defn == instr) { + unsigned name = ra_name(ctx, id); /* since we are in SSA at this point: */ debug_assert(!BITSET_TEST(bd->use, name)); - BITSET_SET(bd->def, name); + def(name, id->defn); if (is_half(id->defn)) { ra_set_node_class(ctx->g, name, @@ -672,12 +725,28 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) } } - foreach_ssa_src(src, instr) { - if (writes_gpr(src)) { + foreach_src(reg, instr) { + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + /* indirect read is treated like a read fromall array + * elements, since we don't know which one is actually + * read: + */ + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + use(name, instr); + } + } else { + unsigned name = arr->base + reg->array.offset; + use(name, instr); + debug_assert(reg->array.offset < arr->length); + } + } else if ((src = ssa(reg)) && writes_gpr(src)) { unsigned name = ra_name(ctx, &ctx->instrd[src->ip]); - ctx->use[name] = MAX2(ctx->use[name], instr->ip); - if (!BITSET_TEST(bd->def, name)) - BITSET_SET(bd->use, name); + use(name, instr); } } } @@ -830,18 +899,36 @@ static void fixup_half_instr_src(struct ir3_instruction *instr) } } +/* NOTE: instr could 
be NULL for IR3_REG_ARRAY case, for the first + * array access(es) which do not have any previous access to depend + * on from scheduling point of view + */ static void reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, struct ir3_instruction *instr) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - if (id->defn) { + struct ir3_ra_instr_data *id; + + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + unsigned name = arr->base + reg->array.offset; + unsigned r = ra_get_node_reg(ctx->g, name); + unsigned num = ctx->set->ra_reg_to_gpr[r]; + + if (reg->flags & IR3_REG_RELATIV) { + reg->array.offset = num; + } else { + reg->num = num; + } + + reg->flags &= ~IR3_REG_ARRAY; + } else if ((id = &ctx->instrd[instr->ip]) && id->defn) { unsigned name = ra_name(ctx, id); unsigned r = ra_get_node_reg(ctx->g, name); unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; - if (reg->flags & IR3_REG_RELATIV) - num += reg->offset; + debug_assert(!(reg->flags & IR3_REG_RELATIV)); reg->num = num; reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC); @@ -868,9 +955,9 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) foreach_src_n(reg, n, instr) { struct ir3_instruction *src = reg->instr; - if (!src) + /* Note: reg->instr could be null for IR3_REG_ARRAY */ + if (!(src || (reg->flags & IR3_REG_ARRAY))) continue; - reg_assign(ctx, instr->regs[n+1], src); if (instr->regs[n+1]->flags & IR3_REG_HALF) fixup_half_instr_src(instr); @@ -881,6 +968,8 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) static int ra_alloc(struct ir3_ra_ctx *ctx) { + unsigned n = 0; + /* frag shader inputs get pre-assigned, since we have some * constraints/unknowns about setup for some of these regs: */ @@ -898,7 +987,8 @@ ra_alloc(struct ir3_ra_ctx *ctx) i += 4; } - for (j = 0; i < ir->ninputs; i++) { + j = 0; + for (; i < ir->ninputs; i++) { struct ir3_instruction *instr = ir->inputs[i]; if (instr) { struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; @@ -914,6 +1004,24 @@ ra_alloc(struct ir3_ra_ctx *ctx) } } } + n = j; + } + + /* pre-assign array elements: + * TODO we could be a bit more clever if we knew which arrays didn't + * fully (partially?) conflict with each other.. + */ + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + unsigned i; + for (i = 0; i < arr->length; i++) { + unsigned name, reg; + + name = arr->base + i; + reg = ctx->set->gpr_to_ra_reg[0][n++]; + + ra_set_node_reg(ctx->g, name, reg); + + } } if (!ra_allocate(ctx->g)) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c index 6aaa16edbfe..8f640febc5d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -187,6 +187,9 @@ delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) foreach_ssa_src_n(src, i, instr) { unsigned d; + /* for array writes, no need to delay on previous write: */ + if (i == 0) + continue; if (src->block != instr->block) continue; d = delay_calc_srcn(ctx, src, instr, i); -- 2.30.2
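An end-to-end sketch of what the frontend now emits, as a hypothetical
snippet built only from the helpers added above in ir3_compiler_nir.c
(ctx, var, x, i_val and j_val are assumed to be in scope), for
"a[i] = x; y = a[j];" with dynamic indices:

	struct ir3_array *arr = get_var(ctx, var);
	struct ir3_instruction *ai = get_addr(ctx, i_val);  /* a0.x writer for i */
	struct ir3_instruction *aj = get_addr(ctx, j_val);  /* a0.x writer for j */

	create_var_store(ctx, arr, 0, x, ai);   /* relative write; dst depends on last_access */
	y = create_var_load(ctx, arr, 0, aj);   /* relative read; src depends on last_write */

The array-dst back-pointer to the previous access is only an ordering
(false) dependency rather than a data source, which is why the
ir3_depth.c and ir3_sched.c hunks above skip the i == 0 source when
accumulating delay slots.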