X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Ffreedreno%2Fir3%2Fir3_compiler_nir.c;h=34f1d9009c481856808074d451a8ed0c75f98e64;hb=d00a239b288c5df7003f30ac80d156e70f27acf7;hp=22885ff85f3539ba705fb66ee4ba906a37b8e3a4;hpb=65b2ae510bb07b75f583ecedfd59766621e1cb43;p=mesa.git diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 22885ff85f3..34f1d9009c4 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -32,11 +32,6 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "tgsi/tgsi_lowering.h" -#include "tgsi/tgsi_strings.h" - -#include "nir/tgsi_to_nir.h" -#include "glsl/shader_enums.h" #include "freedreno_util.h" @@ -51,7 +46,6 @@ struct ir3_compile { struct ir3_compiler *compiler; - const struct tgsi_token *tokens; struct nir_shader *s; struct ir3 *ir; @@ -80,8 +74,6 @@ struct ir3_compile { /* mapping from nir_register to defining instruction: */ struct hash_table *def_ht; - /* mapping from nir_variable to ir3_array: */ - struct hash_table *var_ht; unsigned num_arrays; /* a common pattern for indirect addressing is to request the @@ -97,9 +89,6 @@ struct ir3_compile { */ struct hash_table *block_ht; - /* for calculating input/output positions/linkages: */ - unsigned next_inloc; - /* a4xx (at least patchlevel 0) cannot seem to flat-interpolate * so we need to use ldlv.u32 to load the varying directly: */ @@ -114,12 +103,15 @@ struct ir3_compile { */ bool unminify_coords; - /* for looking up which system value is which */ - unsigned sysval_semantics[8]; + /* on a4xx, for array textures we need to add 0.5 to the array + * index coordinate: + */ + bool array_index_add_half; + + /* on a4xx, bitmask of samplers which need astc+srgb workaround: */ + unsigned astc_srgb; - /* list of kill instructions: */ - struct ir3_instruction *kill[16]; - unsigned int kill_count; + unsigned max_texture_index; /* set if we encounter something we can't handle yet, so we * can bail cleanly and fallback to TGSI compiler f/e @@ -131,143 +123,89 @@ struct ir3_compile { static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val); static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock); -static struct nir_shader *to_nir(const struct tgsi_token *tokens) -{ - struct nir_shader_compiler_options options = { - .lower_fpow = true, - .lower_fsat = true, - .lower_scmp = true, - .lower_flrp = true, - .native_integers = true, - }; - bool progress; - - struct nir_shader *s = tgsi_to_nir(tokens, &options); - - if (fd_mesa_debug & FD_DBG_OPTMSGS) { - debug_printf("----------------------\n"); - nir_print_shader(s, stdout); - debug_printf("----------------------\n"); - } - - nir_opt_global_to_local(s); - nir_convert_to_ssa(s); - nir_lower_idiv(s); - - do { - progress = false; - - nir_lower_vars_to_ssa(s); - nir_lower_alu_to_scalar(s); - nir_lower_phis_to_scalar(s); - - progress |= nir_copy_prop(s); - progress |= nir_opt_dce(s); - progress |= nir_opt_cse(s); - progress |= ir3_nir_lower_if_else(s); - progress |= nir_opt_algebraic(s); - progress |= nir_opt_constant_folding(s); - - } while (progress); - - nir_remove_dead_variables(s); - nir_validate_shader(s); - - if (fd_mesa_debug & FD_DBG_OPTMSGS) { - debug_printf("----------------------\n"); - nir_print_shader(s, stdout); - debug_printf("----------------------\n"); - } - - return s; -} - -/* TODO nir doesn't lower 
everything for us yet, but ideally it would: */ -static const struct tgsi_token * -lower_tgsi(struct ir3_compile *ctx, const struct tgsi_token *tokens, - struct ir3_shader_variant *so) -{ - struct tgsi_shader_info info; - struct tgsi_lowering_config lconfig = { - .color_two_side = so->key.color_two_side, - .lower_FRC = true, - }; - - switch (so->type) { - case SHADER_FRAGMENT: - case SHADER_COMPUTE: - lconfig.saturate_s = so->key.fsaturate_s; - lconfig.saturate_t = so->key.fsaturate_t; - lconfig.saturate_r = so->key.fsaturate_r; - break; - case SHADER_VERTEX: - lconfig.saturate_s = so->key.vsaturate_s; - lconfig.saturate_t = so->key.vsaturate_t; - lconfig.saturate_r = so->key.vsaturate_r; - break; - } - - if (ctx->compiler->gpu_id >= 400) { - /* a4xx seems to have *no* sam.p */ - lconfig.lower_TXP = ~0; /* lower all txp */ - } else { - /* a3xx just needs to avoid sam.p for 3d tex */ - lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D); - } - - return tgsi_transform_lowering(&lconfig, tokens, &info); -} static struct ir3_compile * compile_init(struct ir3_compiler *compiler, - struct ir3_shader_variant *so, - const struct tgsi_token *tokens) + struct ir3_shader_variant *so) { struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile); - const struct tgsi_token *lowered_tokens; if (compiler->gpu_id >= 400) { /* need special handling for "flat" */ ctx->flat_bypass = true; ctx->levels_add_one = false; ctx->unminify_coords = false; + ctx->array_index_add_half = true; + + if (so->type == SHADER_VERTEX) + ctx->astc_srgb = so->key.vastc_srgb; + else if (so->type == SHADER_FRAGMENT) + ctx->astc_srgb = so->key.fastc_srgb; + } else { /* no special handling for "flat" */ ctx->flat_bypass = false; ctx->levels_add_one = true; ctx->unminify_coords = true; + ctx->array_index_add_half = false; } ctx->compiler = compiler; ctx->ir = so->ir; ctx->so = so; - ctx->next_inloc = 8; ctx->def_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); - ctx->var_ht = _mesa_hash_table_create(ctx, - _mesa_hash_pointer, _mesa_key_pointer_equal); - ctx->addr_ht = _mesa_hash_table_create(ctx, - _mesa_hash_pointer, _mesa_key_pointer_equal); ctx->block_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); - lowered_tokens = lower_tgsi(ctx, tokens, so); - if (!lowered_tokens) - lowered_tokens = tokens; - ctx->s = to_nir(lowered_tokens); + /* TODO: maybe generate some sort of bitmask of what key + * lowers vs what shader has (ie. no need to lower + * texture clamp lowering if no texture sample instrs).. + * although should be done further up the stack to avoid + * creating duplicate variants.. 
+ */ - if (lowered_tokens != tokens) - free((void *)lowered_tokens); + if (ir3_key_lowers_nir(&so->key)) { + nir_shader *s = nir_shader_clone(ctx, so->shader->nir); + ctx->s = ir3_optimize_nir(so->shader, s, &so->key); + } else { + /* fast-path for shader key that lowers nothing in NIR: */ + ctx->s = so->shader->nir; + } + + if (fd_mesa_debug & FD_DBG_DISASM) { + DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}", + so->shader->id, so->id, so->type, + so->key.binning_pass, so->key.color_two_side, + so->key.half_precision); + nir_print_shader(ctx->s, stdout); + } so->first_driver_param = so->first_immediate = ctx->s->num_uniforms; - /* one (vec4) slot for vertex id base: */ - if (so->type == SHADER_VERTEX) - so->first_immediate++; + /* Layout of constant registers: + * + * num_uniform * vec4 - user consts + * 4 * vec4 - UBO addresses + * if (vertex shader) { + * N * vec4 - driver params (IR3_DP_*) + * 1 * vec4 - stream-out addresses + * } + * + * TODO this could be made more dynamic, to at least skip sections + * that we don't need.. + */ /* reserve 4 (vec4) slots for ubo base addresses: */ so->first_immediate += 4; + if (so->type == SHADER_VERTEX) { + /* driver params (see ir3_driver_param): */ + so->first_immediate += IR3_DP_COUNT/4; /* convert to vec4 */ + /* one (vec4) slot for stream-output base addresses: */ + so->first_immediate++; + } + return ctx; } @@ -293,206 +231,26 @@ compile_free(struct ir3_compile *ctx) ralloc_free(ctx); } -/* global per-array information: */ -struct ir3_array { - unsigned length, aid; -}; - -/* per-block array state: */ -struct ir3_array_value { - /* TODO drop length/aid, and just have ptr back to ir3_array */ - unsigned length, aid; - /* initial array element values are phi's, other than for the - * entry block. The phi src's get added later in a resolve step - * after we have visited all the blocks, to account for back - * edges in the cfg. - */ - struct ir3_instruction **phis; - /* current array element values (as block is processed). When - * the array phi's are resolved, it will contain the array state - * at exit of block, so successor blocks can use it to add their - * phi srcs. - */ - struct ir3_instruction *arr[]; -}; - -/* track array assignments per basic block. When an array is read - * outside of the same basic block, we can use NIR's dominance-frontier - * information to figure out where phi nodes are needed. 
- */ -struct ir3_nir_block_data { - unsigned foo; - /* indexed by array-id (aid): */ - struct ir3_array_value *arrs[]; -}; - -static struct ir3_nir_block_data * -get_block_data(struct ir3_compile *ctx, struct ir3_block *block) -{ - if (!block->bd) { - struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) + - ((ctx->num_arrays + 1) * sizeof(bd->arrs[0]))); - block->bd = bd; - } - return block->bd; -} - static void declare_var(struct ir3_compile *ctx, nir_variable *var) { unsigned length = glsl_get_length(var->type) * 4; /* always vec4, at least with ttn */ struct ir3_array *arr = ralloc(ctx, struct ir3_array); + arr->id = ++ctx->num_arrays; arr->length = length; - arr->aid = ++ctx->num_arrays; - _mesa_hash_table_insert(ctx->var_ht, var, arr); -} - -static nir_block * -nir_block_pred(nir_block *block) -{ - assert(block->predecessors->entries < 2); - if (block->predecessors->entries == 0) - return NULL; - return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key; + arr->var = var; + list_addtail(&arr->node, &ctx->ir->array_list); } -static struct ir3_array_value * +static struct ir3_array * get_var(struct ir3_compile *ctx, nir_variable *var) { - struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var); - struct ir3_block *block = ctx->block; - struct ir3_nir_block_data *bd = get_block_data(ctx, block); - struct ir3_array *arr = entry->data; - - if (!bd->arrs[arr->aid]) { - struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) + - (arr->length * sizeof(av->arr[0]))); - struct ir3_array_value *defn = NULL; - nir_block *pred_block; - - av->length = arr->length; - av->aid = arr->aid; - - /* For loops, we have to consider that we have not visited some - * of the blocks who should feed into the phi (ie. back-edges in - * the cfg).. for example: - * - * loop { - * block { load_var; ... } - * if then block {} else block {} - * block { store_var; ... } - * if then block {} else block {} - * block {...} - * } - * - * We can skip the phi if we can chase the block predecessors - * until finding the block previously defining the array without - * crossing a block that has more than one predecessor. - * - * Otherwise create phi's and resolve them as a post-pass after - * all the blocks have been visited (to handle back-edges). - */ - - for (pred_block = block->nblock; - pred_block && (pred_block->predecessors->entries < 2) && !defn; - pred_block = nir_block_pred(pred_block)) { - struct ir3_block *pblock = get_block(ctx, pred_block); - struct ir3_nir_block_data *pbd = pblock->bd; - if (!pbd) - continue; - defn = pbd->arrs[arr->aid]; - } - - if (defn) { - /* only one possible definer: */ - for (unsigned i = 0; i < arr->length; i++) - av->arr[i] = defn->arr[i]; - } else if (pred_block) { - /* not the first block, and multiple potential definers: */ - av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0])); - - for (unsigned i = 0; i < arr->length; i++) { - struct ir3_instruction *phi; - - phi = ir3_instr_create2(block, -1, OPC_META_PHI, - 1 + ctx->impl->num_blocks); - ir3_reg_create(phi, 0, 0); /* dst */ - - /* phi's should go at head of block: */ - list_delinit(&phi->node); - list_add(&phi->node, &block->instr_list); - - av->phis[i] = av->arr[i] = phi; - } - } else { - /* Some shaders end up reading array elements without - * first writing.. 
so initialize things to prevent null - * instr ptrs later: - */ - for (unsigned i = 0; i < arr->length; i++) - av->arr[i] = create_immed(block, 0); - } - - bd->arrs[arr->aid] = av; - } - - return bd->arrs[arr->aid]; -} - -static void -add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock, - struct ir3_array_value *av, BITSET_WORD *visited) -{ - struct ir3_block *block; - struct ir3_nir_block_data *bd; - - if (BITSET_TEST(visited, nblock->index)) - return; - - BITSET_SET(visited, nblock->index); - - block = get_block(ctx, nblock); - bd = block->bd; - - if (bd && bd->arrs[av->aid]) { - struct ir3_array_value *dav = bd->arrs[av->aid]; - for (unsigned i = 0; i < av->length; i++) { - ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr = - dav->arr[i]; - } - } else { - /* didn't find defn, recurse predecessors: */ - struct set_entry *entry; - set_foreach(nblock->predecessors, entry) { - add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); - } - } -} - -static void -resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block) -{ - struct ir3_nir_block_data *bd = block->bd; - unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks); - - if (!bd) - return; - - /* TODO use nir dom_frontier to help us with this? */ - - for (unsigned i = 1; i <= ctx->num_arrays; i++) { - struct ir3_array_value *av = bd->arrs[i]; - BITSET_WORD visited[bitset_words]; - struct set_entry *entry; - - if (!(av && av->phis)) - continue; - - memset(visited, 0, sizeof(visited)); - set_foreach(block->nblock->predecessors, entry) { - add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); - } + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + if (arr->var == var) + return arr; } + compile_error(ctx, "bogus var: %s\n", var->name); + return NULL; } /* allocate a n element value array (to be populated by caller) and @@ -510,6 +268,7 @@ __get_dst(struct ir3_compile *ctx, void *key, unsigned n) static struct ir3_instruction ** get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n) { + compile_assert(ctx, dst->is_ssa); if (dst->is_ssa) { return __get_dst(ctx, &dst->ssa, n); } else { @@ -527,6 +286,7 @@ static struct ir3_instruction ** get_src(struct ir3_compile *ctx, nir_src *src) { struct hash_entry *entry; + compile_assert(ctx, src->is_ssa); if (src->is_ssa) { entry = _mesa_hash_table_search(ctx->def_ht, src->ssa); } else { @@ -541,7 +301,7 @@ create_immed(struct ir3_block *block, uint32_t val) { struct ir3_instruction *mov; - mov = ir3_instr_create(block, 1, 0); + mov = ir3_instr_create(block, OPC_MOV); mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; ir3_reg_create(mov, 0, 0); @@ -583,12 +343,17 @@ static struct ir3_instruction * get_addr(struct ir3_compile *ctx, struct ir3_instruction *src) { struct ir3_instruction *addr; - struct hash_entry *entry; - entry = _mesa_hash_table_search(ctx->addr_ht, src); - if (entry) - return entry->data; - /* TODO do we need to cache per block? */ + if (!ctx->addr_ht) { + ctx->addr_ht = _mesa_hash_table_create(ctx, + _mesa_hash_pointer, _mesa_key_pointer_equal); + } else { + struct hash_entry *entry; + entry = _mesa_hash_table_search(ctx->addr_ht, src); + if (entry) + return entry->data; + } + addr = create_addr(ctx->block, src); _mesa_hash_table_insert(ctx->addr_ht, src, addr); @@ -616,7 +381,7 @@ create_uniform(struct ir3_compile *ctx, unsigned n) { struct ir3_instruction *mov; - mov = ir3_instr_create(ctx->block, 1, 0); + mov = ir3_instr_create(ctx->block, OPC_MOV); /* TODO get types right? 
*/ mov->cat1.src_type = TYPE_F32; mov->cat1.dst_type = TYPE_F32; @@ -627,16 +392,16 @@ create_uniform(struct ir3_compile *ctx, unsigned n) } static struct ir3_instruction * -create_uniform_indirect(struct ir3_compile *ctx, unsigned n, +create_uniform_indirect(struct ir3_compile *ctx, int n, struct ir3_instruction *address) { struct ir3_instruction *mov; - mov = ir3_instr_create(ctx->block, 1, 0); + mov = ir3_instr_create(ctx->block, OPC_MOV); mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; ir3_reg_create(mov, 0, 0); - ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV); + ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n; ir3_instr_set_address(mov, address); @@ -652,7 +417,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr, if (arrsz == 0) return NULL; - collect = ir3_instr_create2(block, -1, OPC_META_FI, 1 + arrsz); + collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz); ir3_reg_create(collect, 0, 0); /* dst */ for (unsigned i = 0; i < arrsz; i++) ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i]; @@ -661,71 +426,101 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr, } static struct ir3_instruction * -create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n, +create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n, struct ir3_instruction *address, struct ir3_instruction *collect) { struct ir3_block *block = ctx->block; struct ir3_instruction *mov; struct ir3_register *src; - mov = ir3_instr_create(block, 1, 0); + mov = ir3_instr_create(block, OPC_MOV); mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; ir3_reg_create(mov, 0, 0); src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV); src->instr = collect; src->size = arrsz; - src->offset = n; + src->array.offset = n; ir3_instr_set_address(mov, address); return mov; } +/* relative (indirect) if address!=NULL */ +static struct ir3_instruction * +create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n, + struct ir3_instruction *address) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *mov; + struct ir3_register *src; + + mov = ir3_instr_create(block, OPC_MOV); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + ir3_reg_create(mov, 0, 0); + src = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + src->instr = arr->last_write; + src->size = arr->length; + src->array.id = arr->id; + src->array.offset = n; + + if (address) + ir3_instr_set_address(mov, address); + + arr->last_access = mov; + + return mov; +} + +/* relative (indirect) if address!=NULL */ static struct ir3_instruction * -create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n, - struct ir3_instruction *src, struct ir3_instruction *address, - struct ir3_instruction *collect) +create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n, + struct ir3_instruction *src, struct ir3_instruction *address) { struct ir3_block *block = ctx->block; struct ir3_instruction *mov; struct ir3_register *dst; - mov = ir3_instr_create(block, 1, 0); + mov = ir3_instr_create(block, OPC_MOV); mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; - dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV); - dst->size = arrsz; - dst->offset = n; + dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + dst->instr = arr->last_access; + dst->size = arr->length; + dst->array.id = arr->id; + dst->array.offset = n; ir3_reg_create(mov, 0, 
IR3_REG_SSA)->instr = src; - mov->fanin = collect; ir3_instr_set_address(mov, address); + arr->last_write = arr->last_access = mov; + return mov; } static struct ir3_instruction * -create_input(struct ir3_block *block, struct ir3_instruction *instr, - unsigned n) +create_input(struct ir3_block *block, unsigned n) { struct ir3_instruction *in; - in = ir3_instr_create(block, -1, OPC_META_INPUT); + in = ir3_instr_create(block, OPC_META_INPUT); in->inout.block = block; ir3_reg_create(in, n, 0); - if (instr) - ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr; return in; } static struct ir3_instruction * -create_frag_input(struct ir3_compile *ctx, unsigned n, bool use_ldlv) +create_frag_input(struct ir3_compile *ctx, bool use_ldlv) { struct ir3_block *block = ctx->block; struct ir3_instruction *instr; - struct ir3_instruction *inloc = create_immed(block, n); + /* actual inloc is assigned and fixed up later: */ + struct ir3_instruction *inloc = create_immed(block, 0); if (use_ldlv) { instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0); @@ -747,7 +542,7 @@ create_frag_coord(struct ir3_compile *ctx, unsigned comp) compile_assert(ctx, !ctx->frag_coord[comp]); - ctx->frag_coord[comp] = create_input(ctx->block, NULL, 0); + ctx->frag_coord[comp] = create_input(ctx->block, 0); switch (comp) { case 0: /* .x */ @@ -776,6 +571,10 @@ create_frag_coord(struct ir3_compile *ctx, unsigned comp) } } +/* NOTE: this creates the "TGSI" style fragface (ie. input slot + * VARYING_SLOT_FACE). For NIR style nir_intrinsic_load_front_face + * we can just use the value from hw directly (since it is boolean) + */ static struct ir3_instruction * create_frag_face(struct ir3_compile *ctx, unsigned comp) { @@ -786,7 +585,7 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp) case 0: /* .x */ compile_assert(ctx, !ctx->frag_face); - ctx->frag_face = create_input(block, NULL, 0); + ctx->frag_face = create_input(block, 0); ctx->frag_face->regs[0]->flags |= IR3_REG_HALF; /* for faceness, we always get -1 or 0 (int).. 
but TGSI expects @@ -814,20 +613,29 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp) } } +static struct ir3_instruction * +create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp) +{ + /* first four vec4 sysval's reserved for UBOs: */ + /* NOTE: dp is in scalar, but there can be >4 dp components: */ + unsigned n = ctx->so->first_driver_param + IR3_DRIVER_PARAM_OFF; + unsigned r = regid(n + dp / 4, dp % 4); + return create_uniform(ctx, r); +} + /* helper for instructions that produce multiple consecutive scalar * outputs which need to have a split/fanout meta instruction inserted */ static void split_dest(struct ir3_block *block, struct ir3_instruction **dst, - struct ir3_instruction *src, unsigned n) + struct ir3_instruction *src, unsigned base, unsigned n) { struct ir3_instruction *prev = NULL; for (int i = 0, j = 0; i < n; i++) { - struct ir3_instruction *split = - ir3_instr_create(block, -1, OPC_META_FO); + struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO); ir3_reg_create(split, 0, IR3_REG_SSA); ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src; - split->fo.off = i; + split->fo.off = i + base; if (prev) { split->cp.left = prev; @@ -837,7 +645,7 @@ split_dest(struct ir3_block *block, struct ir3_instruction **dst, } prev = split; - if (src->regs[0]->wrmask & (1 << i)) + if (src->regs[0]->wrmask & (1 << (i + base))) dst[j++] = split; } } @@ -1166,6 +974,33 @@ emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu) dst[0] = ir3_SEL_B32(b, src[1], 0, ir3_b2n(b, src[0]), 0, src[2], 0); break; + case nir_op_bit_count: + dst[0] = ir3_CBITS_B(b, src[0], 0); + break; + case nir_op_ifind_msb: { + struct ir3_instruction *cmp; + dst[0] = ir3_CLZ_S(b, src[0], 0); + cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0); + cmp->cat2.condition = IR3_COND_GE; + dst[0] = ir3_SEL_B32(b, + ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0, + cmp, 0, dst[0], 0); + break; + } + case nir_op_ufind_msb: + dst[0] = ir3_CLZ_B(b, src[0], 0); + dst[0] = ir3_SEL_B32(b, + ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0, + src[0], 0, dst[0], 0); + break; + case nir_op_find_lsb: + dst[0] = ir3_BFREV_B(b, src[0], 0); + dst[0] = ir3_CLZ_B(b, dst[0], 0); + break; + case nir_op_bitfield_reverse: + dst[0] = ir3_BFREV_B(b, src[0], 0); + break; + default: compile_error(ctx, "Unhandled ALU op: %s\n", nir_op_infos[alu->op].name); @@ -1180,9 +1015,10 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr, { struct ir3_block *b = ctx->block; struct ir3_instruction *addr, *src0, *src1; + nir_const_value *const_offset; /* UBO addresses are the first driver params: */ - unsigned ubo = regid(ctx->so->first_driver_param, 0); - unsigned off = intr->const_index[0]; + unsigned ubo = regid(ctx->so->first_driver_param + IR3_UBOS_OFF, 0); + int off = 0; /* First src is ubo index, which could either be an immed or not: */ src0 = get_src(ctx, &intr->src[0])[0]; @@ -1193,7 +1029,10 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr, addr = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0)); } - if (intr->intrinsic == nir_intrinsic_load_ubo_indirect) { + const_offset = nir_src_as_const_value(intr->src[1]); + if (const_offset) { + off += const_offset->u32[0]; + } else { /* For load_ubo_indirect, second src is indirect offset: */ src1 = get_src(ctx, &intr->src[1])[0]; @@ -1215,19 +1054,19 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr, struct ir3_instruction *load = ir3_LDG(b, addr, 0, create_immed(b, 1), 0); 
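		/* A sketch of the effective address this per-component load
		 * computes (pseudocode, not actual compiler output): the UBO
		 * base pointer comes from the driver-param uniforms, any
		 * non-constant NIR offset has been folded into the address
		 * register above, and the remaining constant byte offset
		 * rides in the instruction:
		 *
		 *   dst[i] = mem[addr + off + i*4]
		 */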
load->cat6.type = TYPE_U32; - load->cat6.offset = off + i * 4; /* byte offset */ + load->cat6.src_offset = off + i * 4; /* byte offset */ dst[i] = load; } } /* handles array reads: */ static void -emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, +emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, struct ir3_instruction **dst) { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array_value *arr = get_var(ctx, dvar->var); + struct ir3_array *arr = get_var(ctx, dvar->var); compile_assert(ctx, dvar->deref.child && (dvar->deref.child->deref_type == nir_deref_type_array)); @@ -1238,19 +1077,17 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, for (int i = 0; i < intr->num_components; i++) { unsigned n = darr->base_offset * 4 + i; compile_assert(ctx, n < arr->length); - dst[i] = arr->arr[n]; + dst[i] = create_var_load(ctx, arr, n, NULL); } break; case nir_deref_array_type_indirect: { /* for indirect, we need to collect all the array elements: */ - struct ir3_instruction *collect = - create_collect(ctx->block, arr->arr, arr->length); struct ir3_instruction *addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]); for (int i = 0; i < intr->num_components; i++) { unsigned n = darr->base_offset * 4 + i; compile_assert(ctx, n < arr->length); - dst[i] = create_indirect_load(ctx, arr->length, n, addr, collect); + dst[i] = create_var_load(ctx, arr, n, addr); } break; } @@ -1263,12 +1100,13 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, /* handles array writes: */ static void -emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) +emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array_value *arr = get_var(ctx, dvar->var); - struct ir3_instruction **src; + struct ir3_array *arr = get_var(ctx, dvar->var); + struct ir3_instruction *addr, **src; + unsigned wrmask = nir_intrinsic_write_mask(intr); compile_assert(ctx, dvar->deref.child && (dvar->deref.child->deref_type == nir_deref_type_array)); @@ -1277,71 +1115,38 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) switch (darr->deref_array_type) { case nir_deref_array_type_direct: - /* direct access does not require anything special: */ - for (int i = 0; i < intr->num_components; i++) { - unsigned n = darr->base_offset * 4 + i; - compile_assert(ctx, n < arr->length); - arr->arr[n] = src[i]; - } + addr = NULL; break; - case nir_deref_array_type_indirect: { - /* for indirect, create indirect-store and fan that out: */ - struct ir3_instruction *collect = - create_collect(ctx->block, arr->arr, arr->length); - struct ir3_instruction *addr = - get_addr(ctx, get_src(ctx, &darr->indirect)[0]); - for (int i = 0; i < intr->num_components; i++) { - struct ir3_instruction *store; - unsigned n = darr->base_offset * 4 + i; - compile_assert(ctx, n < arr->length); - - store = create_indirect_store(ctx, arr->length, - n, src[i], addr, collect); - - store->fanin->fi.aid = arr->aid; - - /* TODO: probably split this out to be used for - * store_output_indirect? or move this into - * create_indirect_store()? 
- */ - for (int j = i; j < arr->length; j += intr->num_components) { - struct ir3_instruction *split; - - split = ir3_instr_create(ctx->block, -1, OPC_META_FO); - split->fo.off = j; - ir3_reg_create(split, 0, 0); - ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store; - - arr->arr[j] = split; - } - } - /* fixup fanout/split neighbors: */ - for (int i = 0; i < arr->length; i++) { - arr->arr[i]->cp.right = (i < (arr->length - 1)) ? - arr->arr[i+1] : NULL; - arr->arr[i]->cp.left = (i > 0) ? - arr->arr[i-1] : NULL; - } + case nir_deref_array_type_indirect: + addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]); break; - } default: compile_error(ctx, "Unhandled store deref type: %u\n", darr->deref_array_type); - break; + return; + } + + for (int i = 0; i < intr->num_components; i++) { + if (!(wrmask & (1 << i))) + continue; + unsigned n = darr->base_offset * 4 + i; + compile_assert(ctx, n < arr->length); + create_var_store(ctx, arr, n, src[i], addr); } } -static void add_sysval_input(struct ir3_compile *ctx, unsigned name, +static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot, struct ir3_instruction *instr) { struct ir3_shader_variant *so = ctx->so; unsigned r = regid(so->inputs_count, 0); unsigned n = so->inputs_count++; - so->inputs[n].semantic = ir3_semantic_name(name, 0); + so->inputs[n].sysval = true; + so->inputs[n].slot = slot; so->inputs[n].compmask = 1; so->inputs[n].regid = r; - so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT; + so->inputs[n].interpolate = INTERP_QUALIFIER_FLAT; so->total_in++; ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1); @@ -1349,12 +1154,13 @@ static void add_sysval_input(struct ir3_compile *ctx, unsigned name, } static void -emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) +emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) { const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; struct ir3_instruction **dst, **src; struct ir3_block *b = ctx->block; - unsigned idx = intr->const_index[0]; + nir_const_value *const_offset; + int idx; if (info->has_dest) { dst = get_dst(ctx, &intr->dest, intr->num_components); @@ -1364,52 +1170,65 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) switch (intr->intrinsic) { case nir_intrinsic_load_uniform: - for (int i = 0; i < intr->num_components; i++) { - unsigned n = idx * 4 + i; - dst[i] = create_uniform(ctx, n); - } - break; - case nir_intrinsic_load_uniform_indirect: - src = get_src(ctx, &intr->src[0]); - for (int i = 0; i < intr->num_components; i++) { - unsigned n = idx * 4 + i; - dst[i] = create_uniform_indirect(ctx, n, - get_addr(ctx, src[0])); + idx = nir_intrinsic_base(intr); + const_offset = nir_src_as_const_value(intr->src[0]); + if (const_offset) { + idx += const_offset->u32[0]; + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i; + dst[i] = create_uniform(ctx, n); + } + } else { + src = get_src(ctx, &intr->src[0]); + for (int i = 0; i < intr->num_components; i++) { + int n = idx * 4 + i; + dst[i] = create_uniform_indirect(ctx, n, + get_addr(ctx, src[0])); + } + /* NOTE: if relative addressing is used, we set + * constlen in the compiler (to worst-case value) + * since we don't know in the assembler what the max + * addr reg value can be: + */ + ctx->so->constlen = ctx->s->num_uniforms; } - /* NOTE: if relative addressing is used, we set constlen in - * the compiler (to worst-case value) since we don't know in - * the assembler what the max addr reg value can be: - */ - ctx->so->constlen = 
ctx->s->num_uniforms; break; case nir_intrinsic_load_ubo: - case nir_intrinsic_load_ubo_indirect: emit_intrinsic_load_ubo(ctx, intr, dst); break; case nir_intrinsic_load_input: - for (int i = 0; i < intr->num_components; i++) { - unsigned n = idx * 4 + i; - dst[i] = ctx->ir->inputs[n]; - } - break; - case nir_intrinsic_load_input_indirect: - src = get_src(ctx, &intr->src[0]); - struct ir3_instruction *collect = - create_collect(b, ctx->ir->inputs, ctx->ir->ninputs); - struct ir3_instruction *addr = get_addr(ctx, src[0]); - for (int i = 0; i < intr->num_components; i++) { - unsigned n = idx * 4 + i; - dst[i] = create_indirect_load(ctx, ctx->ir->ninputs, - n, addr, collect); + idx = nir_intrinsic_base(intr); + const_offset = nir_src_as_const_value(intr->src[0]); + if (const_offset) { + idx += const_offset->u32[0]; + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i; + dst[i] = ctx->ir->inputs[n]; + } + } else { + src = get_src(ctx, &intr->src[0]); + struct ir3_instruction *collect = + create_collect(b, ctx->ir->inputs, ctx->ir->ninputs); + struct ir3_instruction *addr = get_addr(ctx, src[0]); + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i; + dst[i] = create_indirect_load(ctx, ctx->ir->ninputs, + n, addr, collect); + } } break; case nir_intrinsic_load_var: - emit_intrinisic_load_var(ctx, intr, dst); + emit_intrinsic_load_var(ctx, intr, dst); break; case nir_intrinsic_store_var: - emit_intrinisic_store_var(ctx, intr); + emit_intrinsic_store_var(ctx, intr); break; case nir_intrinsic_store_output: + idx = nir_intrinsic_base(intr); + const_offset = nir_src_as_const_value(intr->src[1]); + compile_assert(ctx, const_offset != NULL); + idx += const_offset->u32[0]; + src = get_src(ctx, &intr->src[0]); for (int i = 0; i < intr->num_components; i++) { unsigned n = idx * 4 + i; @@ -1418,30 +1237,50 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) break; case nir_intrinsic_load_base_vertex: if (!ctx->basevertex) { - /* first four vec4 sysval's reserved for UBOs: */ - unsigned r = regid(ctx->so->first_driver_param + 4, 0); - ctx->basevertex = create_uniform(ctx, r); - add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX, + ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE); + add_sysval_input(ctx, SYSTEM_VALUE_BASE_VERTEX, ctx->basevertex); } dst[0] = ctx->basevertex; break; case nir_intrinsic_load_vertex_id_zero_base: if (!ctx->vertex_id) { - ctx->vertex_id = create_input(ctx->block, NULL, 0); - add_sysval_input(ctx, TGSI_SEMANTIC_VERTEXID_NOBASE, + ctx->vertex_id = create_input(b, 0); + add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, ctx->vertex_id); } dst[0] = ctx->vertex_id; break; case nir_intrinsic_load_instance_id: if (!ctx->instance_id) { - ctx->instance_id = create_input(ctx->block, NULL, 0); - add_sysval_input(ctx, TGSI_SEMANTIC_INSTANCEID, + ctx->instance_id = create_input(b, 0); + add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, ctx->instance_id); } dst[0] = ctx->instance_id; break; + case nir_intrinsic_load_user_clip_plane: + idx = nir_intrinsic_ucp_id(intr); + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i; + dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n); + } + break; + case nir_intrinsic_load_front_face: + if (!ctx->frag_face) { + ctx->so->frag_face = true; + ctx->frag_face = create_input(b, 0); + ctx->frag_face->regs[0]->flags |= IR3_REG_HALF; + } + /* for fragface, we always get -1 or 0, but that is inverse + * of what nir expects (where ~0 is true). 
Unfortunately + * trying to widen from half to full in add.s seems to do a + * non-sign-extending widen (resulting in something that + * gets interpreted as float Inf??) + */ + dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32); + dst[0] = ir3_ADD_S(b, dst[0], 0, create_immed(b, 1), 0); + break; case nir_intrinsic_discard_if: case nir_intrinsic_discard: { struct ir3_instruction *cond, *kill; @@ -1465,7 +1304,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) kill = ir3_KILL(b, cond, 0); array_insert(ctx->ir->predicates, kill); - ctx->kill[ctx->kill_count++] = kill; + array_insert(ctx->ir->keeps, kill); ctx->so->has_kill = true; break; @@ -1483,7 +1322,7 @@ emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr) struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def, instr->def.num_components); for (int i = 0; i < instr->def.num_components; i++) - dst[i] = create_immed(ctx->block, instr->value.u[i]); + dst[i] = create_immed(ctx->block, instr->value.u32[i]); } static void @@ -1531,10 +1370,10 @@ tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp) unreachable("bad sampler_dim"); } - if (tex->is_shadow) + if (tex->is_shadow && tex->op != nir_texop_lod) flags |= IR3_INSTR_S; - if (tex->is_array) + if (tex->is_array && tex->op != nir_texop_lod) flags |= IR3_INSTR_A; *flagsp = flags; @@ -1590,7 +1429,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex) ddy = get_src(ctx, &tex->src[i].src); break; default: - compile_error(ctx, "Unhandled NIR tex serc type: %d\n", + compile_error(ctx, "Unhandled NIR tex src type: %d\n", tex->src[i].src_type); return; } @@ -1602,11 +1441,13 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex) case nir_texop_txl: opc = OPC_SAML; break; case nir_texop_txd: opc = OPC_SAMGQ; break; case nir_texop_txf: opc = OPC_ISAML; break; + case nir_texop_lod: opc = OPC_GETLOD; break; case nir_texop_txf_ms: case nir_texop_txs: - case nir_texop_lod: case nir_texop_tg4: case nir_texop_query_levels: + case nir_texop_texture_samples: + case nir_texop_samples_identical: compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op); return; } @@ -1620,6 +1461,10 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex) coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0); } + /* the array coord for cube arrays needs 0.5 added to it */ + if (ctx->array_index_add_half && tex->is_array && (opc != OPC_ISAML)) + coord[coords] = ir3_ADD_F(b, coord[coords], 0, create_immed(b, fui(0.5)), 0); + /* * lay out the first argument in the proper order: * - actual coordinates first @@ -1643,10 +1488,10 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex) src0[nsrc0++] = create_immed(b, fui(0.5)); } - if (tex->is_shadow) + if (tex->is_shadow && tex->op != nir_texop_lod) src0[nsrc0++] = compare; - if (tex->is_array) + if (tex->is_array && tex->op != nir_texop_lod) src0[nsrc0++] = coord[coords]; if (has_proj) { @@ -1695,7 +1540,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex) case nir_type_int: type = TYPE_S32; break; - case nir_type_unsigned: + case nir_type_uint: case nir_type_bool: type = TYPE_U32; break; @@ -1703,12 +1548,49 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex) unreachable("bad dest_type"); } - sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW, - flags, tex->sampler_index, tex->sampler_index, - create_collect(b, src0, nsrc0), - create_collect(b, src1, nsrc1)); + if (opc == OPC_GETLOD) + type = TYPE_U32; + + unsigned tex_idx = tex->texture_index; + + ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx); + 
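	/* Summary of the a4xx ASTC+sRGB workaround wired up below: only
	 * .xyz of the result come from the normal sample; .w is fetched
	 * by a second sam using a duplicate texture-state slot, because
	 * the alpha channel has to be sampled with different (non-ASTC)
	 * texture state.  The duplicate slots are assigned later, in
	 * fixup_astc_srgb(), which is why max_texture_index is tracked
	 * here.
	 */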
+ struct ir3_instruction *col0 = create_collect(b, src0, nsrc0); + struct ir3_instruction *col1 = create_collect(b, src1, nsrc1); + + sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW, flags, + tex_idx, tex_idx, col0, col1); + + if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) { + /* only need first 3 components: */ + sam->regs[0]->wrmask = 0x7; + split_dest(b, dst, sam, 0, 3); + + /* we need to sample the alpha separately with a non-ASTC + * texture state: + */ + sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_W, flags, + tex_idx, tex_idx, col0, col1); + + array_insert(ctx->ir->astc_srgb, sam); + + /* fixup .w component: */ + split_dest(b, &dst[3], sam, 3, 1); + } else { + /* normal (non-workaround) case: */ + split_dest(b, dst, sam, 0, 4); + } + + /* GETLOD returns results in 4.8 fixed point */ + if (opc == OPC_GETLOD) { + struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256)); - split_dest(b, dst, sam, 4); + compile_assert(ctx, tex->dest_type == nir_type_float); + for (i = 0; i < 2; i++) { + dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0, + factor, 0); + } + } } static void @@ -1720,12 +1602,12 @@ emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex) dst = get_dst(ctx, &tex->dest, 1); sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, TGSI_WRITEMASK_Z, 0, - tex->sampler_index, tex->sampler_index, NULL, NULL); + tex->texture_index, tex->texture_index, NULL, NULL); /* even though there is only one component, since it ends * up in .z rather than .x, we need a split_dest() */ - split_dest(b, dst, sam, 3); + split_dest(b, dst, sam, 0, 3); /* The # of levels comes from getinfo.z. We need to add 1 to it, since * the value in TEX_CONST_0 is zero-based. @@ -1743,6 +1625,12 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex) tex_info(tex, &flags, &coords); + /* Actually we want the number of dimensions, not coordinates. This + * distinction only matters for cubes. + */ + if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) + coords = 2; + dst = get_dst(ctx, &tex->dest, 4); compile_assert(ctx, tex->num_srcs == 1); @@ -1751,9 +1639,9 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex) lod = get_src(ctx, &tex->src[0].src)[0]; sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags, - tex->sampler_index, tex->sampler_index, lod, NULL); + tex->texture_index, tex->texture_index, lod, NULL); - split_dest(b, dst, sam, 4); + split_dest(b, dst, sam, 0, 4); /* Array size actually ends up in .w rather than .z. This doesn't * matter for miplevel 0, but for higher mips the value in z is @@ -1779,7 +1667,7 @@ emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi) dst = get_dst(ctx, &nphi->dest, 1); - phi = ir3_instr_create2(ctx->block, -1, OPC_META_PHI, + phi = ir3_instr_create2(ctx->block, OPC_META_PHI, 1 + exec_list_length(&nphi->srcs)); ir3_reg_create(phi, 0, 0); /* dst */ phi->phi.nphi = nphi; @@ -1799,7 +1687,7 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block) nir_phi_instr *nphi; /* phi's only come at start of block: */ - if (!(is_meta(instr) && (instr->opc == OPC_META_PHI))) + if (instr->opc != OPC_META_PHI) break; if (!instr->phi.nphi) @@ -1810,11 +1698,19 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block) foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) { struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0]; + + /* NOTE: src might not be in the same block as it comes from + * according to the phi.. 
but in the end the backend assumes + * it will be able to assign the same register to each (which + * only works if it is assigned in the src block), so insert + * an extra mov to make sure the phi src is assigned in the + * block it comes from: + */ + src = ir3_MOV(get_block(ctx, nsrc->pred), src, TYPE_U32); + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; } } - - resolve_array_phis(ctx, block); } static void @@ -1842,7 +1738,7 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr) emit_alu(ctx, nir_instr_as_alu(instr)); break; case nir_instr_type_intrinsic: - emit_intrinisic(ctx, nir_instr_as_intrinsic(instr)); + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break; case nir_instr_type_load_const: emit_load_const(ctx, nir_instr_as_load_const(instr)); @@ -1911,7 +1807,11 @@ emit_block(struct ir3_compile *ctx, nir_block *nblock) ctx->block = block; list_addtail(&block->node, &ctx->ir->block_list); - nir_foreach_instr(nblock, instr) { + /* re-emit addr register in each block if needed: */ + _mesa_hash_table_destroy(ctx->addr_ht, NULL); + ctx->addr_ht = NULL; + + nir_foreach_instr(instr, nblock) { emit_instr(ctx, instr); if (ctx->error) return; @@ -1959,9 +1859,120 @@ emit_cf_list(struct ir3_compile *ctx, struct exec_list *list) } } +/* emit stream-out code. At this point, the current block is the original + * (nir) end block, and nir ensures that all flow control paths terminate + * into the end block. We re-purpose the original end block to generate + * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional + * block holding stream-out write instructions, followed by the new end + * block: + * + * blockOrigEnd { + * p0.x = (vtxcnt < maxvtxcnt) + * // succs: blockStreamOut, blockNewEnd + * } + * blockStreamOut { + * ... stream-out instructions ... 
+ * // succs: blockNewEnd + * } + * blockNewEnd { + * } + */ +static void +emit_stream_out(struct ir3_compile *ctx) +{ + struct ir3_shader_variant *v = ctx->so; + struct ir3 *ir = ctx->ir; + struct pipe_stream_output_info *strmout = + &ctx->so->shader->stream_output; + struct ir3_block *orig_end_block, *stream_out_block, *new_end_block; + struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond; + struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS]; + + /* create vtxcnt input in input block at top of shader, + * so that it is seen as live over the entire duration + * of the shader: + */ + vtxcnt = create_input(ctx->in_block, 0); + add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt); + + maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX); + + /* at this point, we are at the original 'end' block, + * re-purpose this block to stream-out condition, then + * append stream-out block and new-end block + */ + orig_end_block = ctx->block; + + stream_out_block = ir3_block_create(ir); + list_addtail(&stream_out_block->node, &ir->block_list); + + new_end_block = ir3_block_create(ir); + list_addtail(&new_end_block->node, &ir->block_list); + + orig_end_block->successors[0] = stream_out_block; + orig_end_block->successors[1] = new_end_block; + stream_out_block->successors[0] = new_end_block; + + /* setup 'if (vtxcnt < maxvtxcnt)' condition: */ + cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0); + cond->regs[0]->num = regid(REG_P0, 0); + cond->cat2.condition = IR3_COND_LT; + + /* condition goes on previous block to the conditional, + * since it is used to pick which of the two successor + * paths to take: + */ + orig_end_block->condition = cond; + + /* switch to stream_out_block to generate the stream-out + * instructions: + */ + ctx->block = stream_out_block; + + /* Calculate base addresses based on vtxcnt. Instructions + * generated for bases not used in following loop will be + * stripped out in the backend. + */ + for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + unsigned stride = strmout->stride[i]; + struct ir3_instruction *base, *off; + + base = create_uniform(ctx, regid(v->first_driver_param + IR3_TFBOS_OFF, i)); + + /* 24-bit should be enough: */ + off = ir3_MUL_U(ctx->block, vtxcnt, 0, + create_immed(ctx->block, stride * 4), 0); + + bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0); + } + + /* Generate the per-output store instructions: */ + for (unsigned i = 0; i < strmout->num_outputs; i++) { + for (unsigned j = 0; j < strmout->output[i].num_components; j++) { + unsigned c = j + strmout->output[i].start_component; + struct ir3_instruction *base, *out, *stg; + + base = bases[strmout->output[i].output_buffer]; + out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)]; + + stg = ir3_STG(ctx->block, base, 0, out, 0, + create_immed(ctx->block, 1), 0); + stg->cat6.type = TYPE_U32; + stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4; + + array_insert(ctx->ir->keeps, stg); + } + } + + /* and finally switch to the new_end_block: */ + ctx->block = new_end_block; +} + static void emit_function(struct ir3_compile *ctx, nir_function_impl *impl) { + nir_metadata_require(impl, nir_metadata_block_index); + emit_cf_list(ctx, &impl->body); emit_block(ctx, impl->end_block); @@ -1969,6 +1980,24 @@ emit_function(struct ir3_compile *ctx, nir_function_impl *impl) * into which we emit the 'end' instruction. 
*/ compile_assert(ctx, list_empty(&ctx->block->instr_list)); + + /* If stream-out (aka transform-feedback) enabled, emit the + * stream-out instructions, followed by a new empty block (into + * which the 'end' instruction lands). + * + * NOTE: it is done in this order, rather than inserting before + * we emit end_block, because NIR guarantees that all blocks + * flow into end_block, and that end_block has no successors. + * So by re-purposing end_block as the first block of stream- + * out, we guarantee that all exit paths flow into the stream- + * out instructions. + */ + if ((ctx->so->shader->stream_output.num_outputs > 0) && + !ctx->so->key.binning_pass) { + debug_assert(ctx->so->type == SHADER_VERTEX); + emit_stream_out(ctx); + } + ir3_END(ctx->block); } @@ -1978,90 +2007,76 @@ setup_input(struct ir3_compile *ctx, nir_variable *in) struct ir3_shader_variant *so = ctx->so; unsigned array_len = MAX2(glsl_get_length(in->type), 1); unsigned ncomp = glsl_get_components(in->type); - /* XXX: map loc slots to semantics */ - unsigned semantic_name = in->data.location; - unsigned semantic_index = in->data.index; unsigned n = in->data.driver_location; + unsigned slot = in->data.location; - DBG("; in: %u:%u, len=%ux%u, loc=%u", - semantic_name, semantic_index, array_len, - ncomp, n); + DBG("; in: slot=%u, len=%ux%u, drvloc=%u", + slot, array_len, ncomp, n); - so->inputs[n].semantic = - ir3_semantic_name(semantic_name, semantic_index); + so->inputs[n].slot = slot; so->inputs[n].compmask = (1 << ncomp) - 1; - so->inputs[n].inloc = ctx->next_inloc; - so->inputs[n].interpolate = 0; so->inputs_count = MAX2(so->inputs_count, n + 1); + so->inputs[n].interpolate = in->data.interpolation; - /* the fdN_program_emit() code expects tgsi consts here, so map - * things back to tgsi for now: - */ - switch (in->data.interpolation) { - case INTERP_QUALIFIER_FLAT: - so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT; - break; - case INTERP_QUALIFIER_NOPERSPECTIVE: - so->inputs[n].interpolate = TGSI_INTERPOLATE_LINEAR; - break; - case INTERP_QUALIFIER_SMOOTH: - so->inputs[n].interpolate = TGSI_INTERPOLATE_PERSPECTIVE; - break; - } - - for (int i = 0; i < ncomp; i++) { - struct ir3_instruction *instr = NULL; - unsigned idx = (n * 4) + i; + if (ctx->so->type == SHADER_FRAGMENT) { + for (int i = 0; i < ncomp; i++) { + struct ir3_instruction *instr = NULL; + unsigned idx = (n * 4) + i; - if (ctx->so->type == SHADER_FRAGMENT) { - if (semantic_name == TGSI_SEMANTIC_POSITION) { + if (slot == VARYING_SLOT_POS) { so->inputs[n].bary = false; so->frag_coord = true; instr = create_frag_coord(ctx, i); - } else if (semantic_name == TGSI_SEMANTIC_FACE) { + } else if (slot == VARYING_SLOT_FACE) { so->inputs[n].bary = false; so->frag_face = true; instr = create_frag_face(ctx, i); } else { bool use_ldlv = false; - /* with NIR, we need to infer TGSI_INTERPOLATE_COLOR - * from the semantic name: + /* detect the special case for front/back colors where + * we need to do flat vs smooth shading depending on + * rast state: */ - if ((in->data.interpolation == INTERP_QUALIFIER_NONE) && - ((semantic_name == TGSI_SEMANTIC_COLOR) || - (semantic_name == TGSI_SEMANTIC_BCOLOR))) - so->inputs[n].interpolate = TGSI_INTERPOLATE_COLOR; + if (in->data.interpolation == INTERP_QUALIFIER_NONE) { + switch (slot) { + case VARYING_SLOT_COL0: + case VARYING_SLOT_COL1: + case VARYING_SLOT_BFC0: + case VARYING_SLOT_BFC1: + so->inputs[n].rasterflat = true; + break; + default: + break; + } + } if (ctx->flat_bypass) { - /* with NIR, we need to infer 
TGSI_INTERPOLATE_COLOR - * from the semantic name: - */ - switch (so->inputs[n].interpolate) { - case TGSI_INTERPOLATE_COLOR: - if (!ctx->so->key.rasterflat) - break; - /* fallthrough */ - case TGSI_INTERPOLATE_CONSTANT: + if ((so->inputs[n].interpolate == INTERP_QUALIFIER_FLAT) || + (so->inputs[n].rasterflat && ctx->so->key.rasterflat)) use_ldlv = true; - break; - } } so->inputs[n].bary = true; - instr = create_frag_input(ctx, - so->inputs[n].inloc + i - 8, use_ldlv); + instr = create_frag_input(ctx, use_ldlv); } - } else { - instr = create_input(ctx->block, NULL, idx); - } - ctx->ir->inputs[idx] = instr; + compile_assert(ctx, idx < ctx->ir->ninputs); + + ctx->ir->inputs[idx] = instr; + } + } else if (ctx->so->type == SHADER_VERTEX) { + for (int i = 0; i < ncomp; i++) { + unsigned idx = (n * 4) + i; + compile_assert(ctx, idx < ctx->ir->ninputs); + ctx->ir->inputs[idx] = create_input(ctx->block, idx); + } + } else { + compile_error(ctx, "unknown shader type: %d\n", ctx->so->type); } if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) { - ctx->next_inloc += ncomp; so->total_in += ncomp; } } @@ -2072,56 +2087,62 @@ setup_output(struct ir3_compile *ctx, nir_variable *out) struct ir3_shader_variant *so = ctx->so; unsigned array_len = MAX2(glsl_get_length(out->type), 1); unsigned ncomp = glsl_get_components(out->type); - /* XXX: map loc slots to semantics */ - unsigned semantic_name = out->data.location; - unsigned semantic_index = out->data.index; unsigned n = out->data.driver_location; + unsigned slot = out->data.location; unsigned comp = 0; - DBG("; out: %u:%u, len=%ux%u, loc=%u", - semantic_name, semantic_index, array_len, - ncomp, n); + DBG("; out: slot=%u, len=%ux%u, drvloc=%u", + slot, array_len, ncomp, n); - if (ctx->so->type == SHADER_VERTEX) { - switch (semantic_name) { - case TGSI_SEMANTIC_POSITION: + if (ctx->so->type == SHADER_FRAGMENT) { + switch (slot) { + case FRAG_RESULT_DEPTH: + comp = 2; /* tgsi will write to .z component */ so->writes_pos = true; break; - case TGSI_SEMANTIC_PSIZE: - so->writes_psize = true; - break; - case TGSI_SEMANTIC_COLOR: - case TGSI_SEMANTIC_BCOLOR: - case TGSI_SEMANTIC_GENERIC: - case TGSI_SEMANTIC_FOG: - case TGSI_SEMANTIC_TEXCOORD: + case FRAG_RESULT_COLOR: + so->color0_mrt = 1; break; default: - compile_error(ctx, "unknown VS semantic name: %s\n", - tgsi_semantic_names[semantic_name]); + if (slot >= FRAG_RESULT_DATA0) + break; + compile_error(ctx, "unknown FS output name: %s\n", + gl_frag_result_name(slot)); } - } else { - switch (semantic_name) { - case TGSI_SEMANTIC_POSITION: - comp = 2; /* tgsi will write to .z component */ + } else if (ctx->so->type == SHADER_VERTEX) { + switch (slot) { + case VARYING_SLOT_POS: so->writes_pos = true; break; - case TGSI_SEMANTIC_COLOR: - if (semantic_index == -1) { - semantic_index = 0; - so->color0_mrt = 1; - } + case VARYING_SLOT_PSIZ: + so->writes_psize = true; break; + case VARYING_SLOT_COL0: + case VARYING_SLOT_COL1: + case VARYING_SLOT_BFC0: + case VARYING_SLOT_BFC1: + case VARYING_SLOT_FOGC: + case VARYING_SLOT_CLIP_DIST0: + case VARYING_SLOT_CLIP_DIST1: + break; + case VARYING_SLOT_CLIP_VERTEX: + /* handled entirely in nir_lower_clip: */ + return; default: - compile_error(ctx, "unknown FS semantic name: %s\n", - tgsi_semantic_names[semantic_name]); + if (slot >= VARYING_SLOT_VAR0) + break; + if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7)) + break; + compile_error(ctx, "unknown VS output name: %s\n", + gl_varying_slot_name(slot)); } + } else { + compile_error(ctx, "unknown shader 
type: %d\n", ctx->so->type); } compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); - so->outputs[n].semantic = - ir3_semantic_name(semantic_name, semantic_index); + so->outputs[n].slot = slot; so->outputs[n].regid = regid(n, comp); so->outputs_count = MAX2(so->outputs_count, n + 1); @@ -2139,37 +2160,31 @@ emit_instructions(struct ir3_compile *ctx) nir_function_impl *fxn = NULL; /* Find the main function: */ - nir_foreach_overload(ctx->s, overload) { - compile_assert(ctx, strcmp(overload->function->name, "main") == 0); - compile_assert(ctx, overload->impl); - fxn = overload->impl; + nir_foreach_function(function, ctx->s) { + compile_assert(ctx, strcmp(function->name, "main") == 0); + compile_assert(ctx, function->impl); + fxn = function->impl; break; } ninputs = exec_list_length(&ctx->s->inputs) * 4; noutputs = exec_list_length(&ctx->s->outputs) * 4; - /* we need to allocate big enough outputs array so that - * we can stuff the kill's at the end. Likewise for vtx - * shaders, we need to leave room for sysvals: + /* or vtx shaders, we need to leave room for sysvals: */ - if (ctx->so->type == SHADER_FRAGMENT) { - noutputs += ARRAY_SIZE(ctx->kill); - } else if (ctx->so->type == SHADER_VERTEX) { - ninputs += 8; + if (ctx->so->type == SHADER_VERTEX) { + ninputs += 16; } ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs); /* Create inputs in first block: */ - ctx->block = get_block(ctx, fxn->start_block); + ctx->block = get_block(ctx, nir_start_block(fxn)); ctx->in_block = ctx->block; list_addtail(&ctx->block->node, &ctx->ir->block_list); - if (ctx->so->type == SHADER_FRAGMENT) { - ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill); - } else if (ctx->so->type == SHADER_VERTEX) { - ctx->ir->ninputs -= 8; + if (ctx->so->type == SHADER_VERTEX) { + ctx->ir->ninputs -= 16; } /* for fragment shader, we have a single input register (usually @@ -2178,7 +2193,7 @@ emit_instructions(struct ir3_compile *ctx) if (ctx->so->type == SHADER_FRAGMENT) { // TODO maybe a helper for fi since we need it a few places.. 
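		// The OPC_META_FI "fan-in" created here becomes ctx->frag_pos,
		// the shared barycentric r0.xy pair; its two SSA sources are
		// left unset for now and get patched to real input
		// instructions later, in fixup_frag_inputs(), once the final
		// input count is known.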
struct ir3_instruction *instr; - instr = ir3_instr_create(ctx->block, -1, OPC_META_FI); + instr = ir3_instr_create(ctx->block, OPC_META_FI); ir3_reg_create(instr, 0, 0); ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */ ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */ @@ -2186,17 +2201,23 @@ emit_instructions(struct ir3_compile *ctx) } /* Setup inputs: */ - foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) { + nir_foreach_variable(var, &ctx->s->inputs) { setup_input(ctx, var); } /* Setup outputs: */ - foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) { + nir_foreach_variable(var, &ctx->s->outputs) { setup_output(ctx, var); } - /* Setup variables (which should only be arrays): */ - foreach_list_typed(nir_variable, var, node, &ctx->s->globals) { + /* Setup global variables (which should only be arrays): */ + nir_foreach_variable(var, &ctx->s->globals) { + declare_var(ctx, var); + } + + /* Setup local variables (which should only be arrays): */ + /* NOTE: need to do something more clever when we support >1 fxn */ + nir_foreach_variable(var, &fxn->locals) { declare_var(ctx, var); } @@ -2267,13 +2288,13 @@ fixup_frag_inputs(struct ir3_compile *ctx) so->pos_regid = regid; /* r0.x */ - instr = create_input(ctx->in_block, NULL, ir->ninputs); + instr = create_input(ctx->in_block, ir->ninputs); instr->regs[0]->num = regid++; inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[1]->instr = instr; /* r0.y */ - instr = create_input(ctx->in_block, NULL, ir->ninputs); + instr = create_input(ctx->in_block, ir->ninputs); instr->regs[0]->num = regid++; inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[2]->instr = instr; @@ -2281,21 +2302,53 @@ fixup_frag_inputs(struct ir3_compile *ctx) ir->inputs = inputs; } +/* Fixup tex sampler state for astc/srgb workaround instructions. We + * need to assign the tex state indexes for these after we know the + * max tex index. + */ +static void +fixup_astc_srgb(struct ir3_compile *ctx) +{ + struct ir3_shader_variant *so = ctx->so; + /* indexed by original tex idx, value is newly assigned alpha sampler + * state tex idx. Zero is invalid since there is at least one sampler + * if we get here. 
+ */ + unsigned alt_tex_state[16] = {0}; + unsigned tex_idx = ctx->max_texture_index + 1; + unsigned idx = 0; + + so->astc_srgb.base = tex_idx; + + for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) { + struct ir3_instruction *sam = ctx->ir->astc_srgb[i]; + + compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state)); + + if (alt_tex_state[sam->cat5.tex] == 0) { + /* assign new alternate/alpha tex state slot: */ + alt_tex_state[sam->cat5.tex] = tex_idx++; + so->astc_srgb.orig_idx[idx++] = sam->cat5.tex; + so->astc_srgb.count++; + } + + sam->cat5.tex = alt_tex_state[sam->cat5.tex]; + } +} + int ir3_compile_shader_nir(struct ir3_compiler *compiler, - struct ir3_shader_variant *so, - const struct tgsi_token *tokens, - struct ir3_shader_key key) + struct ir3_shader_variant *so) { struct ir3_compile *ctx; struct ir3 *ir; struct ir3_instruction **inputs; - unsigned i, j, actual_in; + unsigned i, j, actual_in, inloc; int ret = 0, max_bary; assert(!so->ir); - ctx = compile_init(compiler, so, tokens); + ctx = compile_init(compiler, so); if (!ctx) { DBG("INIT failed!"); ret = -1; @@ -2320,14 +2373,12 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, fixup_frag_inputs(ctx); /* at this point, for binning pass, throw away unneeded outputs: */ - if (key.binning_pass) { + if (so->key.binning_pass) { for (i = 0, j = 0; i < so->outputs_count; i++) { - unsigned name = sem2name(so->outputs[i].semantic); - unsigned idx = sem2idx(so->outputs[i].semantic); + unsigned slot = so->outputs[i].slot; /* throw away everything but first position/psize */ - if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) || - (name == TGSI_SEMANTIC_PSIZE))) { + if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) { if (i != j) { so->outputs[j] = so->outputs[i]; ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0]; @@ -2345,7 +2396,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, /* if we want half-precision outputs, mark the output registers * as half: */ - if (key.half_precision) { + if (so->key.half_precision) { for (i = 0; i < ir->noutputs; i++) { struct ir3_instruction *out = ir->outputs[i]; if (!out) @@ -2355,32 +2406,23 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, * in which case we need to propagate the half-reg flag * up to the definer so that RA sees it: */ - if (is_meta(out) && (out->opc == OPC_META_FO)) { + if (out->opc == OPC_META_FO) { out = out->regs[1]->instr; out->regs[0]->flags |= IR3_REG_HALF; } - if (out->category == 1) { + if (out->opc == OPC_MOV) { out->cat1.dst_type = half_type(out->cat1.dst_type); } } } - /* at this point, we want the kill's in the outputs array too, - * so that they get scheduled (since they have no dst).. 
we've - * already ensured that the array is big enough in push_block(): - */ - if (so->type == SHADER_FRAGMENT) { - for (i = 0; i < ctx->kill_count; i++) - ir->outputs[ir->noutputs++] = ctx->kill[i]; - } - if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("BEFORE CP:\n"); ir3_print(ir); } - ir3_cp(ir); + ir3_cp(ir, so); if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("BEFORE GROUPING:\n"); @@ -2421,13 +2463,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_print(ir); } - ir3_legalize(ir, &so->has_samp, &max_bary); - - if (fd_mesa_debug & FD_DBG_OPTMSGS) { - printf("AFTER LEGALIZE:\n"); - ir3_print(ir); - } - /* fixup input/outputs: */ for (i = 0; i < so->outputs_count; i++) { so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num; @@ -2435,38 +2470,55 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, * but what we give the hw is the scalar register: */ if ((so->type == SHADER_FRAGMENT) && - (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION)) + (so->outputs[i].slot == FRAG_RESULT_DEPTH)) so->outputs[i].regid += 2; } /* Note that some or all channels of an input may be unused: */ actual_in = 0; + inloc = 0; for (i = 0; i < so->inputs_count; i++) { unsigned j, regid = ~0, compmask = 0; so->inputs[i].ncomp = 0; + so->inputs[i].inloc = inloc + 8; for (j = 0; j < 4; j++) { struct ir3_instruction *in = inputs[(i*4) + j]; - if (in) { + if (in && !(in->flags & IR3_INSTR_UNUSED)) { compmask |= (1 << j); regid = in->regs[0]->num - j; actual_in++; so->inputs[i].ncomp++; + if ((so->type == SHADER_FRAGMENT) && so->inputs[i].bary) { + /* assign inloc: */ + assert(in->regs[1]->flags & IR3_REG_IMMED); + in->regs[1]->iim_val = inloc++; + } } } + if ((so->type == SHADER_FRAGMENT) && compmask && so->inputs[i].bary) + so->varying_in++; so->inputs[i].regid = regid; so->inputs[i].compmask = compmask; } - /* fragment shader always gets full vec4's even if it doesn't - * fetch all components, but vertex shader we need to update - * with the actual number of components fetch, otherwise thing - * will hang due to mismaptch between VFD_DECODE's and - * TOTALATTRTOVS + if (ctx->astc_srgb) + fixup_astc_srgb(ctx); + + /* We need to do legalize after (for frag shader's) the "bary.f" + * offsets (inloc) have been assigned. */ + ir3_legalize(ir, &so->has_samp, &max_bary); + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER LEGALIZE:\n"); + ir3_print(ir); + } + + /* Note that actual_in counts inputs that are not bary.f'd for FS: */ if (so->type == SHADER_VERTEX) so->total_in = actual_in; else - so->total_in = align(max_bary + 1, 4); + so->total_in = max_bary + 1; out: if (ret) {