X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Ffreedreno%2Fir3%2Fir3_compiler_nir.c;h=fd4a1d6eccebf4010c1ddfd8e8a8579c5007f4c8;hb=762a6333f21fd8606f69db6060027c4522d46678;hp=1794059342b3fdb728c0b6ee2735202eb889d6d2;hpb=7d539080c1a491aff9fb3e90c25df89884477aa8;p=mesa.git

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 1794059342b..fd4a1d6ecce 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -32,10 +32,6 @@
 #include "util/u_string.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
-#include "tgsi/tgsi_lowering.h"
-#include "tgsi/tgsi_strings.h"
-
-#include "nir/tgsi_to_nir.h"
 
 #include "freedreno_util.h"
 
@@ -50,7 +46,6 @@
 struct ir3_compile {
 	struct ir3_compiler *compiler;
 
-	const struct tgsi_token *tokens;
 	struct nir_shader *s;
 
 	struct ir3 *ir;
@@ -79,8 +74,6 @@ struct ir3_compile {
 	/* mapping from nir_register to defining instruction: */
 	struct hash_table *def_ht;
 
-	/* mapping from nir_variable to ir3_array: */
-	struct hash_table *var_ht;
 	unsigned num_arrays;
 
 	/* a common pattern for indirect addressing is to request the
@@ -110,8 +103,15 @@ struct ir3_compile {
 	 */
 	bool unminify_coords;
 
-	/* for looking up which system value is which */
-	unsigned sysval_semantics[8];
+	/* on a4xx, for array textures we need to add 0.5 to the array
+	 * index coordinate:
+	 */
+	bool array_index_add_half;
+
+	/* on a4xx, bitmask of samplers which need astc+srgb workaround: */
+	unsigned astc_srgb;
+
+	unsigned max_texture_index;
 
 	/* set if we encounter something we can't handle yet, so we
 	 * can bail cleanly and fallback to TGSI compiler f/e
 	 */
@@ -119,101 +119,19 @@ struct ir3_compile {
 	bool error;
 };
 
+/* gpu pointer size in units of 32bit registers/slots */
+static unsigned pointer_size(struct ir3_compile *ctx)
+{
+	return (ctx->compiler->gpu_id >= 500) ? 2 : 1;
+}
 
 static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
 static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
 
-static struct nir_shader *to_nir(struct ir3_compile *ctx,
-		const struct tgsi_token *tokens, struct ir3_shader_variant *so)
-{
-	static const nir_shader_compiler_options options = {
-			.lower_fpow = true,
-			.lower_fsat = true,
-			.lower_scmp = true,
-			.lower_flrp = true,
-			.lower_ffract = true,
-			.native_integers = true,
-	};
-	struct nir_lower_tex_options tex_options = {
-			.lower_rect = 0,
-	};
-	bool progress;
-
-	switch (so->type) {
-	case SHADER_FRAGMENT:
-	case SHADER_COMPUTE:
-		tex_options.saturate_s = so->key.fsaturate_s;
-		tex_options.saturate_t = so->key.fsaturate_t;
-		tex_options.saturate_r = so->key.fsaturate_r;
-		break;
-	case SHADER_VERTEX:
-		tex_options.saturate_s = so->key.vsaturate_s;
-		tex_options.saturate_t = so->key.vsaturate_t;
-		tex_options.saturate_r = so->key.vsaturate_r;
-		break;
-	}
-
-	if (ctx->compiler->gpu_id >= 400) {
-		/* a4xx seems to have *no* sam.p */
-		tex_options.lower_txp = ~0;  /* lower all txp */
-	} else {
-		/* a3xx just needs to avoid sam.p for 3d tex */
-		tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
-	}
-
-	struct nir_shader *s = tgsi_to_nir(tokens, &options);
-
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	nir_opt_global_to_local(s);
-	nir_convert_to_ssa(s);
-	if (s->stage == MESA_SHADER_VERTEX) {
-		nir_lower_clip_vs(s, so->key.ucp_enables);
-	} else if (s->stage == MESA_SHADER_FRAGMENT) {
-		nir_lower_clip_fs(s, so->key.ucp_enables);
-	}
-	nir_lower_tex(s, &tex_options);
-	if (so->key.color_two_side)
-		nir_lower_two_sided_color(s);
-	nir_lower_idiv(s);
-	nir_lower_load_const_to_scalar(s);
-
-	do {
-		progress = false;
-
-		nir_lower_vars_to_ssa(s);
-		nir_lower_alu_to_scalar(s);
-		nir_lower_phis_to_scalar(s);
-
-		progress |= nir_copy_prop(s);
-		progress |= nir_opt_dce(s);
-		progress |= nir_opt_cse(s);
-		progress |= ir3_nir_lower_if_else(s);
-		progress |= nir_opt_algebraic(s);
-		progress |= nir_opt_constant_folding(s);
-
-	} while (progress);
-
-	nir_remove_dead_variables(s);
-	nir_validate_shader(s);
-
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	return s;
-}
 
 static struct ir3_compile *
 compile_init(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens)
+		struct ir3_shader_variant *so)
 {
 	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
 
@@ -222,11 +140,19 @@ compile_init(struct ir3_compiler *compiler,
 		ctx->flat_bypass = true;
 		ctx->levels_add_one = false;
 		ctx->unminify_coords = false;
+		ctx->array_index_add_half = true;
+
+		if (so->type == SHADER_VERTEX)
+			ctx->astc_srgb = so->key.vastc_srgb;
+		else if (so->type == SHADER_FRAGMENT)
+			ctx->astc_srgb = so->key.fastc_srgb;
+
 	} else {
 		/* no special handling for "flat" */
 		ctx->flat_bypass = false;
 		ctx->levels_add_one = true;
 		ctx->unminify_coords = true;
+		ctx->array_index_add_half = false;
 	}
 
 	ctx->compiler = compiler;
@@ -234,38 +160,73 @@ compile_init(struct ir3_compiler *compiler,
 	ctx->so = so;
 	ctx->def_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
-	ctx->var_ht = _mesa_hash_table_create(ctx,
-			_mesa_hash_pointer, _mesa_key_pointer_equal);
 	ctx->block_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
 
-	ctx->s = to_nir(ctx, tokens, so);
+	/* TODO: maybe generate some sort of bitmask of what key
+	 * lowers vs what shader has (ie. no need to lower
+	 * texture clamp lowering if no texture sample instrs)..
+	 * although should be done further up the stack to avoid
+	 * creating duplicate variants..
+	 */
 
-	so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
+	if (ir3_key_lowers_nir(&so->key)) {
+		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
+		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
+	} else {
+		/* fast-path for shader key that lowers nothing in NIR: */
+		ctx->s = so->shader->nir;
+	}
 
-	/* Layout of constant registers:
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}",
+			so->shader->id, so->id, so->type,
+			so->key.binning_pass, so->key.color_two_side,
+			so->key.half_precision);
+		nir_print_shader(ctx->s, stdout);
+	}
+
+	so->num_uniforms = ctx->s->num_uniforms;
+	so->num_ubos = ctx->s->info->num_ubos;
+
+	/* Layout of constant registers, each section aligned to vec4.  Note
+	 * that pointer size (ubo, etc) changes depending on generation.
 	 *
-	 *    num_uniform * vec4  -  user consts
-	 *    4 * vec4            -  UBO addresses
+	 *    user consts
+	 *    UBO addresses
 	 *    if (vertex shader) {
-	 *        N * vec4        -  driver params (IR3_DP_*)
-	 *        1 * vec4        -  stream-out addresses
+	 *        driver params (IR3_DP_*)
+	 *        if (stream_output.num_outputs > 0)
+	 *           stream-out addresses
 	 *    }
+	 *    immediates
 	 *
-	 * TODO this could be made more dynamic, to at least skip sections
-	 * that we don't need..
+	 * Immediates go last mostly because they are inserted in the CP pass
+	 * after the nir -> ir3 frontend.
 	 */
+	unsigned constoff = align(ctx->s->num_uniforms, 4);
+	unsigned ptrsz = pointer_size(ctx);
+
+	memset(&so->constbase, ~0, sizeof(so->constbase));
 
-	/* reserve 4 (vec4) slots for ubo base addresses: */
-	so->first_immediate += 4;
+	if (so->num_ubos > 0) {
+		so->constbase.ubo = constoff;
+		constoff += align(ctx->s->info->num_ubos * ptrsz, 4) / 4;
+	}
 
 	if (so->type == SHADER_VERTEX) {
-		/* driver params (see ir3_driver_param): */
-		so->first_immediate += IR3_DP_COUNT/4;  /* convert to vec4 */
-		/* one (vec4) slot for stream-output base addresses: */
-		so->first_immediate++;
+		so->constbase.driver_param = constoff;
+		constoff += align(IR3_DP_COUNT, 4) / 4;
+
+		if ((compiler->gpu_id < 500) &&
+				so->shader->stream_output.num_outputs > 0) {
+			so->constbase.tfbo = constoff;
+			constoff += align(PIPE_MAX_SO_BUFFERS * ptrsz, 4) / 4;
+		}
 	}
 
+	so->constbase.immediate = constoff;
+
 	return ctx;
 }
 
@@ -291,206 +252,26 @@ compile_free(struct ir3_compile *ctx)
 	ralloc_free(ctx);
 }
 
-/* global per-array information: */
-struct ir3_array {
-	unsigned length, aid;
-};
-
-/* per-block array state: */
-struct ir3_array_value {
-	/* TODO drop length/aid, and just have ptr back to ir3_array */
-	unsigned length, aid;
-	/* initial array element values are phi's, other than for the
-	 * entry block.  The phi src's get added later in a resolve step
-	 * after we have visited all the blocks, to account for back
-	 * edges in the cfg.
-	 */
-	struct ir3_instruction **phis;
-	/* current array element values (as block is processed).  When
-	 * the array phi's are resolved, it will contain the array state
-	 * at exit of block, so successor blocks can use it to add their
-	 * phi srcs.
-	 */
-	struct ir3_instruction *arr[];
-};
-
-/* track array assignments per basic block.  When an array is read
- * outside of the same basic block, we can use NIR's dominance-frontier
- * information to figure out where phi nodes are needed.
- */
-struct ir3_nir_block_data {
-	unsigned foo;
-	/* indexed by array-id (aid): */
-	struct ir3_array_value *arrs[];
-};
-
-static struct ir3_nir_block_data *
-get_block_data(struct ir3_compile *ctx, struct ir3_block *block)
-{
-	if (!block->data) {
-		struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) +
-				((ctx->num_arrays + 1) * sizeof(bd->arrs[0])));
-		block->data = bd;
-	}
-	return block->data;
-}
-
 static void
 declare_var(struct ir3_compile *ctx, nir_variable *var)
 {
 	unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
-	struct ir3_array *arr = ralloc(ctx, struct ir3_array);
+	struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
+	arr->id = ++ctx->num_arrays;
 	arr->length = length;
-	arr->aid = ++ctx->num_arrays;
-	_mesa_hash_table_insert(ctx->var_ht, var, arr);
-}
-
-static nir_block *
-nir_block_pred(nir_block *block)
-{
-	assert(block->predecessors->entries < 2);
-	if (block->predecessors->entries == 0)
-		return NULL;
-	return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key;
+	arr->var = var;
+	list_addtail(&arr->node, &ctx->ir->array_list);
 }
 
-static struct ir3_array_value *
+static struct ir3_array *
 get_var(struct ir3_compile *ctx, nir_variable *var)
 {
-	struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
-	struct ir3_block *block = ctx->block;
-	struct ir3_nir_block_data *bd = get_block_data(ctx, block);
-	struct ir3_array *arr = entry->data;
-
-	if (!bd->arrs[arr->aid]) {
-		struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) +
-				(arr->length * sizeof(av->arr[0])));
-		struct ir3_array_value *defn = NULL;
-		nir_block *pred_block;
-
-		av->length = arr->length;
-		av->aid = arr->aid;
-
-		/* For loops, we have to consider that we have not visited some
-		 * of the blocks who should feed into the phi (ie. back-edges in
-		 * the cfg).. for example:
-		 *
-		 *   loop {
-		 *      block { load_var; ... }
-		 *      if then block {} else block {}
-		 *      block { store_var; ... }
-		 *      if then block {} else block {}
-		 *      block {...}
-		 *   }
-		 *
-		 * We can skip the phi if we can chase the block predecessors
-		 * until finding the block previously defining the array without
-		 * crossing a block that has more than one predecessor.
-		 *
-		 * Otherwise create phi's and resolve them as a post-pass after
-		 * all the blocks have been visited (to handle back-edges).
-		 */
-
-		for (pred_block = block->nblock;
-				pred_block && (pred_block->predecessors->entries < 2) && !defn;
-				pred_block = nir_block_pred(pred_block)) {
-			struct ir3_block *pblock = get_block(ctx, pred_block);
-			struct ir3_nir_block_data *pbd = pblock->data;
-			if (!pbd)
-				continue;
-			defn = pbd->arrs[arr->aid];
-		}
-
-		if (defn) {
-			/* only one possible definer: */
-			for (unsigned i = 0; i < arr->length; i++)
-				av->arr[i] = defn->arr[i];
-		} else if (pred_block) {
-			/* not the first block, and multiple potential definers: */
-			av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0]));
-
-			for (unsigned i = 0; i < arr->length; i++) {
-				struct ir3_instruction *phi;
-
-				phi = ir3_instr_create2(block, -1, OPC_META_PHI,
-						1 + ctx->impl->num_blocks);
-				ir3_reg_create(phi, 0, 0);  /* dst */
-
-				/* phi's should go at head of block: */
-				list_delinit(&phi->node);
-				list_add(&phi->node, &block->instr_list);
-
-				av->phis[i] = av->arr[i] = phi;
-			}
-		} else {
-			/* Some shaders end up reading array elements without
-			 * first writing.. so initialize things to prevent null
-			 * instr ptrs later:
-			 */
-			for (unsigned i = 0; i < arr->length; i++)
-				av->arr[i] = create_immed(block, 0);
-		}
-
-		bd->arrs[arr->aid] = av;
-	}
-
-	return bd->arrs[arr->aid];
-}
-
-static void
-add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock,
-		struct ir3_array_value *av, BITSET_WORD *visited)
-{
-	struct ir3_block *block;
-	struct ir3_nir_block_data *bd;
-
-	if (BITSET_TEST(visited, nblock->index))
-		return;
-
-	BITSET_SET(visited, nblock->index);
-
-	block = get_block(ctx, nblock);
-	bd = block->data;
-
-	if (bd && bd->arrs[av->aid]) {
-		struct ir3_array_value *dav = bd->arrs[av->aid];
-		for (unsigned i = 0; i < av->length; i++) {
-			ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr =
-					dav->arr[i];
-		}
-	} else {
-		/* didn't find defn, recurse predecessors: */
-		struct set_entry *entry;
-		set_foreach(nblock->predecessors, entry) {
-			add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
-		}
-	}
-}
-
-static void
-resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block)
-{
-	struct ir3_nir_block_data *bd = block->data;
-	unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks);
-
-	if (!bd)
-		return;
-
-	/* TODO use nir dom_frontier to help us with this? */
-
-	for (unsigned i = 1; i <= ctx->num_arrays; i++) {
-		struct ir3_array_value *av = bd->arrs[i];
-		BITSET_WORD visited[bitset_words];
-		struct set_entry *entry;
-
-		if (!(av && av->phis))
-			continue;
-
-		memset(visited, 0, sizeof(visited));
-		set_foreach(block->nblock->predecessors, entry) {
-			add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
-		}
+	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+		if (arr->var == var)
+			return arr;
 	}
+	compile_error(ctx, "bogus var: %s\n", var->name);
+	return NULL;
 }
 
 /* allocate a n element value array (to be populated by caller) and
@@ -508,6 +289,7 @@ __get_dst(struct ir3_compile *ctx, void *key, unsigned n)
 static struct ir3_instruction **
 get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n)
 {
+	compile_assert(ctx, dst->is_ssa);
 	if (dst->is_ssa) {
 		return __get_dst(ctx, &dst->ssa, n);
 	} else {
@@ -521,10 +303,11 @@ get_dst_ssa(struct ir3_compile *ctx, nir_ssa_def *dst, unsigned n)
 	return __get_dst(ctx, dst, n);
 }
 
-static struct ir3_instruction **
+static struct ir3_instruction * const *
 get_src(struct ir3_compile *ctx, nir_src *src)
 {
 	struct hash_entry *entry;
+	compile_assert(ctx, src->is_ssa);
 	if (src->is_ssa) {
 		entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
 	} else {
@@ -539,7 +322,7 @@ create_immed(struct ir3_block *block, uint32_t val)
 {
 	struct ir3_instruction *mov;
 
-	mov = ir3_instr_create(block, 1, 0);
+	mov = ir3_instr_create(block, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
@@ -619,7 +402,7 @@ create_uniform(struct ir3_compile *ctx, unsigned n)
 {
 	struct ir3_instruction *mov;
 
-	mov = ir3_instr_create(ctx->block, 1, 0);
+	mov = ir3_instr_create(ctx->block, OPC_MOV);
 	/* TODO get types right? */
 	mov->cat1.src_type = TYPE_F32;
 	mov->cat1.dst_type = TYPE_F32;
@@ -630,16 +413,16 @@ create_uniform(struct ir3_compile *ctx, unsigned n)
 }
 
 static struct ir3_instruction *
-create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
+create_uniform_indirect(struct ir3_compile *ctx, int n,
 		struct ir3_instruction *address)
 {
 	struct ir3_instruction *mov;
 
-	mov = ir3_instr_create(ctx->block, 1, 0);
+	mov = ir3_instr_create(ctx->block, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
-	ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV);
+	ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
 
 	ir3_instr_set_address(mov, address);
 
@@ -655,7 +438,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr,
 	if (arrsz == 0)
 		return NULL;
 
-	collect = ir3_instr_create2(block, -1, OPC_META_FI, 1 + arrsz);
+	collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
 	ir3_reg_create(collect, 0, 0);     /* dst */
 	for (unsigned i = 0; i < arrsz; i++)
 		ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i];
@@ -664,47 +447,79 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr,
 }
 
 static struct ir3_instruction *
-create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
+create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n,
 		struct ir3_instruction *address, struct ir3_instruction *collect)
 {
 	struct ir3_block *block = ctx->block;
 	struct ir3_instruction *mov;
 	struct ir3_register *src;
 
-	mov = ir3_instr_create(block, 1, 0);
+	mov = ir3_instr_create(block, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
 	src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
 	src->instr = collect;
 	src->size  = arrsz;
-	src->offset = n;
+	src->array.offset = n;
 
 	ir3_instr_set_address(mov, address);
 
 	return mov;
 }
 
+/* relative (indirect) if address!=NULL */
+static struct ir3_instruction *
+create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n,
+		struct ir3_instruction *address)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *mov;
+	struct ir3_register *src;
+
+	mov = ir3_instr_create(block, OPC_MOV);
+	mov->cat1.src_type = TYPE_U32;
+	mov->cat1.dst_type = TYPE_U32;
+	ir3_reg_create(mov, 0, 0);
+	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			COND(address, IR3_REG_RELATIV));
+	src->instr = arr->last_write;
+	src->size  = arr->length;
+	src->array.id = arr->id;
+	src->array.offset = n;
+
+	if (address)
+		ir3_instr_set_address(mov, address);
+
+	arr->last_access = mov;
+
+	return mov;
+}
+
+/* relative (indirect) if address!=NULL */
 static struct ir3_instruction *
-create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
-		struct ir3_instruction *src, struct ir3_instruction *address,
-		struct ir3_instruction *collect)
+create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
+		struct ir3_instruction *src, struct ir3_instruction *address)
 {
 	struct ir3_block *block = ctx->block;
 	struct ir3_instruction *mov;
 	struct ir3_register *dst;
 
-	mov = ir3_instr_create(block, 1, 0);
+	mov = ir3_instr_create(block, OPC_MOV);
 	mov->cat1.src_type = TYPE_U32;
 	mov->cat1.dst_type = TYPE_U32;
-	dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV);
-	dst->size  = arrsz;
-	dst->offset = n;
+	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			COND(address, IR3_REG_RELATIV));
+	dst->instr = arr->last_access;
+	dst->size  = arr->length;
+	dst->array.id = arr->id;
+	dst->array.offset = n;
 	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
-	mov->fanin = collect;
 
 	ir3_instr_set_address(mov, address);
 
+	arr->last_write = arr->last_access = mov;
+
 	return mov;
 }
 
@@ -713,7 +528,7 @@ create_input(struct ir3_block *block, unsigned n)
 {
 	struct ir3_instruction *in;
 
-	in = ir3_instr_create(block, -1, OPC_META_INPUT);
+	in = ir3_instr_create(block, OPC_META_INPUT);
 	in->inout.block = block;
 	ir3_reg_create(in, n, 0);
 
@@ -777,50 +592,12 @@ create_frag_coord(struct ir3_compile *ctx, unsigned comp)
 	}
 }
 
-static struct ir3_instruction *
-create_frag_face(struct ir3_compile *ctx, unsigned comp)
-{
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction *instr;
-
-	switch (comp) {
-	case 0: /* .x */
-		compile_assert(ctx, !ctx->frag_face);
-
-		ctx->frag_face = create_input(block, 0);
-		ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
-
-		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
-		 * positive vs negative float.. and piglit further seems to
-		 * expect -1.0 or 1.0:
-		 *
-		 *    mul.s tmp, hr0.x, 2
-		 *    add.s tmp, tmp, 1
-		 *    mov.s32f32, dst, tmp
-		 *
-		 */
-		instr = ir3_MUL_S(block, ctx->frag_face, 0,
-				create_immed(block, 2), 0);
-		instr = ir3_ADD_S(block, instr, 0,
-				create_immed(block, 1), 0);
-		instr = ir3_COV(block, instr, TYPE_S32, TYPE_F32);
-
-		return instr;
-	case 1: /* .y */
-	case 2: /* .z */
-		return create_immed(block, fui(0.0));
-	default:
-	case 3: /* .w */
-		return create_immed(block, fui(1.0));
-	}
-}
-
 static struct ir3_instruction *
 create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp)
 {
 	/* first four vec4 sysval's reserved for UBOs: */
 	/* NOTE: dp is in scalar, but there can be >4 dp components: */
-	unsigned n = ctx->so->first_driver_param + IR3_DRIVER_PARAM_OFF;
+	unsigned n = ctx->so->constbase.driver_param;
 	unsigned r = regid(n + dp / 4, dp % 4);
 	return create_uniform(ctx, r);
 }
@@ -830,15 +607,14 @@ create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp)
  */
 static void
 split_dest(struct ir3_block *block, struct ir3_instruction **dst,
-		struct ir3_instruction *src, unsigned n)
+		struct ir3_instruction *src, unsigned base, unsigned n)
 {
 	struct ir3_instruction *prev = NULL;
 	for (int i = 0, j = 0; i < n; i++) {
-		struct ir3_instruction *split =
-				ir3_instr_create(block, -1, OPC_META_FO);
+		struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
 		ir3_reg_create(split, 0, IR3_REG_SSA);
 		ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
-		split->fo.off = i;
+		split->fo.off = i + base;
 
 		if (prev) {
 			split->cp.left = prev;
@@ -848,7 +624,7 @@ split_dest(struct ir3_block *block, struct ir3_instruction **dst,
 		}
 		prev = split;
 
-		if (src->regs[0]->wrmask & (1 << i))
+		if (src->regs[0]->wrmask & (1 << (i + base)))
 			dst[j++] = split;
 	}
 }
@@ -946,16 +722,16 @@ emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
 	}
 
 	switch (alu->op) {
-	case nir_op_f2i:
+	case nir_op_f2i32:
 		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_S32);
 		break;
-	case nir_op_f2u:
+	case nir_op_f2u32:
 		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_U32);
 		break;
-	case nir_op_i2f:
+	case nir_op_i2f32:
 		dst[0] = ir3_COV(b, src[0], TYPE_S32, TYPE_F32);
 		break;
-	case nir_op_u2f:
+	case nir_op_u2f32:
 		dst[0] = ir3_COV(b, src[0], TYPE_U32, TYPE_F32);
 		break;
 	case nir_op_imov:
@@ -1217,24 +993,31 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		struct ir3_instruction **dst)
 {
 	struct ir3_block *b = ctx->block;
-	struct ir3_instruction *addr, *src0, *src1;
+	struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
 	nir_const_value *const_offset;
 	/* UBO addresses are the first driver params: */
-	unsigned ubo = regid(ctx->so->first_driver_param + IR3_UBOS_OFF, 0);
-	unsigned off = intr->const_index[0];
+	unsigned ubo = regid(ctx->so->constbase.ubo, 0);
+	const unsigned ptrsz = pointer_size(ctx);
+
+	int off = 0;
 
 	/* First src is ubo index, which could either be an immed or not: */
 	src0 = get_src(ctx, &intr->src[0])[0];
 	if (is_same_type_mov(src0) &&
 			(src0->regs[1]->flags & IR3_REG_IMMED)) {
-		addr = create_uniform(ctx, ubo + src0->regs[1]->iim_val);
+		base_lo = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz));
+		base_hi = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
 	} else {
-		addr = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0));
+		base_lo = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0));
+		base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0));
 	}
 
+	/* note: on 32bit gpu's base_hi is ignored and DCE'd */
+	addr = base_lo;
+
 	const_offset = nir_src_as_const_value(intr->src[1]);
 	if (const_offset) {
-		off += const_offset->u[0];
+		off += const_offset->u32[0];
 	} else {
 		/* For load_ubo_indirect, second src is indirect offset: */
 		src1 = get_src(ctx, &intr->src[1])[0];
@@ -1253,6 +1036,20 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		off -= off2;
 	}
 
+	if (ptrsz == 2) {
+		struct ir3_instruction *carry;
+
+		/* handle 32b rollover, ie:
+		 *   if (addr < base_lo)
+		 *      base_hi++
+		 */
+		carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
+		carry->cat2.condition = IR3_COND_LT;
+		base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
+
+		addr = create_collect(b, (struct ir3_instruction*[]){ addr, base_hi }, 2);
+	}
+
 	for (int i = 0; i < intr->num_components; i++) {
 		struct ir3_instruction *load =
 				ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
@@ -1264,12 +1061,12 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 
 /* handles array reads: */
 static void
-emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
+emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		struct ir3_instruction **dst)
 {
 	nir_deref_var *dvar = intr->variables[0];
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array_value *arr = get_var(ctx, dvar->var);
+	struct ir3_array *arr = get_var(ctx, dvar->var);
 
 	compile_assert(ctx, dvar->deref.child &&
 		(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1280,19 +1077,17 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = darr->base_offset * 4 + i;
 			compile_assert(ctx, n < arr->length);
-			dst[i] = arr->arr[n];
+			dst[i] = create_var_load(ctx, arr, n, NULL);
 		}
 		break;
 	case nir_deref_array_type_indirect: {
 		/* for indirect, we need to collect all the array elements: */
-		struct ir3_instruction *collect =
-				create_collect(ctx->block, arr->arr, arr->length);
 		struct ir3_instruction *addr =
 				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = darr->base_offset * 4 + i;
 			compile_assert(ctx, n < arr->length);
-			dst[i] = create_indirect_load(ctx, arr->length, n, addr, collect);
+			dst[i] = create_var_load(ctx, arr, n, addr);
 		}
 		break;
 	}
@@ -1305,12 +1100,14 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 
 /* handles array writes: */
 static void
-emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
 	nir_deref_var *dvar = intr->variables[0];
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array_value *arr = get_var(ctx, dvar->var);
-	struct ir3_instruction **src;
+	struct ir3_array *arr = get_var(ctx, dvar->var);
+	struct ir3_instruction *addr;
+	struct ir3_instruction * const *src;
+	unsigned wrmask = nir_intrinsic_write_mask(intr);
 
 	compile_assert(ctx, dvar->deref.child &&
 		(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1319,65 +1116,23 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 
 	switch (darr->deref_array_type) {
 	case nir_deref_array_type_direct:
-		/* direct access does not require anything special: */
-		for (int i = 0; i < intr->num_components; i++) {
-			/* ttn doesn't generate partial writemasks */
-			assert(intr->const_index[0] ==
-				(1 << intr->num_components) - 1);
-
-			unsigned n = darr->base_offset * 4 + i;
-			compile_assert(ctx, n < arr->length);
-			arr->arr[n] = src[i];
-		}
+		addr = NULL;
 		break;
-	case nir_deref_array_type_indirect: {
-		/* for indirect, create indirect-store and fan that out: */
-		struct ir3_instruction *collect =
-				create_collect(ctx->block, arr->arr, arr->length);
-		struct ir3_instruction *addr =
-				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
-		for (int i = 0; i < intr->num_components; i++) {
-			/* ttn doesn't generate partial writemasks */
-			assert(intr->const_index[0] ==
-				(1 << intr->num_components) - 1);
-
-			struct ir3_instruction *store;
-			unsigned n = darr->base_offset * 4 + i;
-			compile_assert(ctx, n < arr->length);
-
-			store = create_indirect_store(ctx, arr->length,
-					n, src[i], addr, collect);
-
-			store->fanin->fi.aid = arr->aid;
-
-			/* TODO: probably split this out to be used for
-			 * store_output_indirect? or move this into
-			 * create_indirect_store()?
-			 */
-			for (int j = i; j < arr->length; j += intr->num_components) {
-				struct ir3_instruction *split;
-
-				split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
-				split->fo.off = j;
-				ir3_reg_create(split, 0, 0);
-				ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store;
-
-				arr->arr[j] = split;
-			}
-		}
-		/* fixup fanout/split neighbors: */
-		for (int i = 0; i < arr->length; i++) {
-			arr->arr[i]->cp.right = (i < (arr->length - 1)) ?
-					arr->arr[i+1] : NULL;
-			arr->arr[i]->cp.left = (i > 0) ?
-					arr->arr[i-1] : NULL;
-		}
+	case nir_deref_array_type_indirect:
+		addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
 		break;
-	}
 	default:
 		compile_error(ctx, "Unhandled store deref type: %u\n",
 				darr->deref_array_type);
-		break;
+		return;
+	}
+
+	for (int i = 0; i < intr->num_components; i++) {
+		if (!(wrmask & (1 << i)))
+			continue;
+		unsigned n = darr->base_offset * 4 + i;
+		compile_assert(ctx, n < arr->length);
+		create_var_store(ctx, arr, n, src[i], addr);
 	}
 }
 
@@ -1392,7 +1147,7 @@ static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
 	so->inputs[n].slot = slot;
 	so->inputs[n].compmask = 1;
 	so->inputs[n].regid = r;
-	so->inputs[n].interpolate = INTERP_QUALIFIER_FLAT;
+	so->inputs[n].interpolate = INTERP_MODE_FLAT;
 	so->total_in++;
 
 	ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
@@ -1400,13 +1155,14 @@ static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
 }
 
 static void
-emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
 	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
-	struct ir3_instruction **dst, **src;
+	struct ir3_instruction **dst;
+	struct ir3_instruction * const *src;
 	struct ir3_block *b = ctx->block;
-	unsigned idx = intr->const_index[0];
 	nir_const_value *const_offset;
+	int idx;
 
 	if (info->has_dest) {
 		dst = get_dst(ctx, &intr->dest, intr->num_components);
@@ -1416,9 +1172,10 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 
 	switch (intr->intrinsic) {
 	case nir_intrinsic_load_uniform:
+		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[0]);
 		if (const_offset) {
-			idx += const_offset->u[0];
+			idx += const_offset->u32[0];
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned n = idx * 4 + i;
 				dst[i] = create_uniform(ctx, n);
@@ -1426,7 +1183,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		} else {
 			src = get_src(ctx, &intr->src[0]);
 			for (int i = 0; i < intr->num_components; i++) {
-				unsigned n = idx * 4 + i;
+				int n = idx * 4 + i;
 				dst[i] = create_uniform_indirect(ctx, n,
 						get_addr(ctx, src[0]));
 			}
@@ -1442,9 +1199,10 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		emit_intrinsic_load_ubo(ctx, intr, dst);
 		break;
 	case nir_intrinsic_load_input:
+		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[0]);
 		if (const_offset) {
-			idx += const_offset->u[0];
+			idx += const_offset->u32[0];
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned n = idx * 4 + i;
 				dst[i] = ctx->ir->inputs[n];
@@ -1462,15 +1220,16 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		}
 		break;
 	case nir_intrinsic_load_var:
-		emit_intrinisic_load_var(ctx, intr, dst);
+		emit_intrinsic_load_var(ctx, intr, dst);
 		break;
 	case nir_intrinsic_store_var:
-		emit_intrinisic_store_var(ctx, intr);
+		emit_intrinsic_store_var(ctx, intr);
 		break;
 	case nir_intrinsic_store_output:
+		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[1]);
 		compile_assert(ctx, const_offset != NULL);
-		idx += const_offset->u[0];
+		idx += const_offset->u32[0];
 
 		src = get_src(ctx, &intr->src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
@@ -1487,27 +1246,45 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 			dst[0] = ctx->basevertex;
 		break;
 	case nir_intrinsic_load_vertex_id_zero_base:
+	case nir_intrinsic_load_vertex_id:
 		if (!ctx->vertex_id) {
-			ctx->vertex_id = create_input(ctx->block, 0);
-			add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
-					ctx->vertex_id);
+			gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ?
+				SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
+			ctx->vertex_id = create_input(b, 0);
+			add_sysval_input(ctx, sv, ctx->vertex_id);
 		}
 		dst[0] = ctx->vertex_id;
 		break;
 	case nir_intrinsic_load_instance_id:
 		if (!ctx->instance_id) {
-			ctx->instance_id = create_input(ctx->block, 0);
+			ctx->instance_id = create_input(b, 0);
 			add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
 					ctx->instance_id);
 		}
 		dst[0] = ctx->instance_id;
 		break;
 	case nir_intrinsic_load_user_clip_plane:
+		idx = nir_intrinsic_ucp_id(intr);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
 			dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
 		}
 		break;
+	case nir_intrinsic_load_front_face:
+		if (!ctx->frag_face) {
+			ctx->so->frag_face = true;
+			ctx->frag_face = create_input(b, 0);
+			ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
+		}
+		/* for fragface, we always get -1 or 0, but that is inverse
+		 * of what nir expects (where ~0 is true).  Unfortunately
+		 * trying to widen from half to full in add.s seems to do a
+		 * non-sign-extending widen (resulting in something that
+		 * gets interpreted as float Inf??)
+		 */
+		dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
+		dst[0] = ir3_ADD_S(b, dst[0], 0, create_immed(b, 1), 0);
+		break;
 	case nir_intrinsic_discard_if:
 	case nir_intrinsic_discard: {
 		struct ir3_instruction *cond, *kill;
@@ -1549,7 +1326,7 @@ emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr)
 	struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
 			instr->def.num_components);
 	for (int i = 0; i < instr->def.num_components; i++)
-		dst[i] = create_immed(ctx->block, instr->value.u[i]);
+		dst[i] = create_immed(ctx->block, instr->value.u32[i]);
 }
 
 static void
@@ -1612,7 +1389,8 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 {
 	struct ir3_block *b = ctx->block;
 	struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
-	struct ir3_instruction **coord, *lod, *compare, *proj, **off, **ddx, **ddy;
+	struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy;
+	struct ir3_instruction *lod, *compare, *proj;
 	bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
 	unsigned i, coords, flags;
 	unsigned nsrc0 = 0, nsrc1 = 0;
@@ -1638,7 +1416,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 			lod = get_src(ctx, &tex->src[i].src)[0];
 			has_lod = true;
 			break;
-		case nir_tex_src_comparitor: /* shadow comparator */
+		case nir_tex_src_comparator: /* shadow comparator */
 			compare = get_src(ctx, &tex->src[i].src)[0];
 			break;
 		case nir_tex_src_projector:
@@ -1656,7 +1434,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 			ddy = get_src(ctx, &tex->src[i].src);
 			break;
 		default:
-			compile_error(ctx, "Unhandled NIR tex serc type: %d\n",
+			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
 					tex->src[i].src_type);
 			return;
 		}
@@ -1675,24 +1453,13 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	case nir_texop_query_levels:
 	case nir_texop_texture_samples:
 	case nir_texop_samples_identical:
+	case nir_texop_txf_ms_mcs:
 		compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
 		return;
 	}
 
 	tex_info(tex, &flags, &coords);
 
-	/* scale up integer coords for TXF based on the LOD */
-	if (ctx->unminify_coords && (opc == OPC_ISAML)) {
-		assert(has_lod);
-		for (i = 0; i < coords; i++)
-			coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0);
-	}
-
-	/* the array coord for cube arrays needs 0.5 added to it */
-	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE && tex->is_array &&
-			opc != OPC_ISAML)
-		coord[3] = ir3_ADD_F(b, coord[3], 0, create_immed(b, fui(0.5)), 0);
-
 	/*
 	 * lay out the first argument in the proper order:
 	 *  - actual coordinates first
@@ -1706,7 +1473,16 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 
 	/* insert tex coords: */
 	for (i = 0; i < coords; i++)
-		src0[nsrc0++] = coord[i];
+		src0[i] = coord[i];
+
+	nsrc0 = i;
+
+	/* scale up integer coords for TXF based on the LOD */
+	if (ctx->unminify_coords && (opc == OPC_ISAML)) {
+		assert(has_lod);
+		for (i = 0; i < coords; i++)
+			src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
+	}
 
 	if (coords == 1) {
 		/* hw doesn't do 1d, so we treat it as 2d with
@@ -1719,8 +1495,15 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	if (tex->is_shadow && tex->op != nir_texop_lod)
 		src0[nsrc0++] = compare;
 
-	if (tex->is_array && tex->op != nir_texop_lod)
-		src0[nsrc0++] = coord[coords];
+	if (tex->is_array && tex->op != nir_texop_lod) {
+		struct ir3_instruction *idx = coord[coords];
+
+		/* the array coord for cube arrays needs 0.5 added to it */
+		if (ctx->array_index_add_half && (opc != OPC_ISAML))
+			idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
+
+		src0[nsrc0++] = idx;
+	}
 
 	if (has_proj) {
 		src0[nsrc0++] = proj;
@@ -1779,12 +1562,35 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	if (opc == OPC_GETLOD)
 		type = TYPE_U32;
 
-	sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW,
-			flags, tex->sampler_index, tex->sampler_index,
-			create_collect(b, src0, nsrc0),
-			create_collect(b, src1, nsrc1));
+	unsigned tex_idx = tex->texture_index;
+
+	ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx);
+
+	struct ir3_instruction *col0 = create_collect(b, src0, nsrc0);
+	struct ir3_instruction *col1 = create_collect(b, src1, nsrc1);
+
+	sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW, flags,
+			tex_idx, tex_idx, col0, col1);
+
+	if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) {
+		/* only need first 3 components: */
+		sam->regs[0]->wrmask = 0x7;
+		split_dest(b, dst, sam, 0, 3);
+
+		/* we need to sample the alpha separately with a non-ASTC
+		 * texture state:
+		 */
+		sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_W, flags,
+				tex_idx, tex_idx, col0, col1);
+
+		array_insert(ctx->ir->astc_srgb, sam);
 
-	split_dest(b, dst, sam, 4);
+		/* fixup .w component: */
+		split_dest(b, &dst[3], sam, 3, 1);
+	} else {
+		/* normal (non-workaround) case: */
+		split_dest(b, dst, sam, 0, 4);
+	}
 
 	/* GETLOD returns results in 4.8 fixed point */
 	if (opc == OPC_GETLOD) {
@@ -1807,12 +1613,12 @@ emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex)
 	dst = get_dst(ctx, &tex->dest, 1);
 
 	sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, TGSI_WRITEMASK_Z, 0,
-			tex->sampler_index, tex->sampler_index, NULL, NULL);
+			tex->texture_index, tex->texture_index, NULL, NULL);
 
 	/* even though there is only one component, since it ends
 	 * up in .z rather than .x, we need a split_dest()
 	 */
-	split_dest(b, dst, sam, 3);
+	split_dest(b, dst, sam, 0, 3);
 
 	/* The # of levels comes from getinfo.z. We need to add 1 to it, since
 	 * the value in TEX_CONST_0 is zero-based.
@@ -1825,7 +1631,8 @@ static void
 emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
 {
 	struct ir3_block *b = ctx->block;
-	struct ir3_instruction **dst, *sam, *lod;
+	struct ir3_instruction **dst, *sam;
+	struct ir3_instruction *lod;
 	unsigned flags, coords;
 
 	tex_info(tex, &flags, &coords);
@@ -1844,9 +1651,9 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
 	lod = get_src(ctx, &tex->src[0].src)[0];
 
 	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
-			tex->sampler_index, tex->sampler_index, lod, NULL);
+			tex->texture_index, tex->texture_index, lod, NULL);
 
-	split_dest(b, dst, sam, 4);
+	split_dest(b, dst, sam, 0, 4);
 
 	/* Array size actually ends up in .w rather than .z. This doesn't
 	 * matter for miplevel 0, but for higher mips the value in z is
@@ -1872,7 +1679,7 @@ emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi)
 
 	dst = get_dst(ctx, &nphi->dest, 1);
 
-	phi = ir3_instr_create2(ctx->block, -1, OPC_META_PHI,
+	phi = ir3_instr_create2(ctx->block, OPC_META_PHI,
 			1 + exec_list_length(&nphi->srcs));
 	ir3_reg_create(phi, 0, 0);         /* dst */
 	phi->phi.nphi = nphi;
@@ -1892,7 +1699,7 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
 		nir_phi_instr *nphi;
 
 		/* phi's only come at start of block: */
-		if (!(is_meta(instr) && (instr->opc == OPC_META_PHI)))
+		if (instr->opc != OPC_META_PHI)
 			break;
 
 		if (!instr->phi.nphi)
@@ -1903,11 +1710,19 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
 
 		foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) {
 			struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0];
+
+			/* NOTE: src might not be in the same block as it comes from
+			 * according to the phi.. but in the end the backend assumes
+			 * it will be able to assign the same register to each (which
+			 * only works if it is assigned in the src block), so insert
+			 * an extra mov to make sure the phi src is assigned in the
+			 * block it comes from:
+			 */
+			src = ir3_MOV(get_block(ctx, nsrc->pred), src, TYPE_U32);
+
 			ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
 		}
 	}
-
-	resolve_array_phis(ctx, block);
 }
 
 static void
@@ -1935,7 +1750,7 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
 		emit_alu(ctx, nir_instr_as_alu(instr));
 		break;
 	case nir_instr_type_intrinsic:
-		emit_intrinisic(ctx, nir_instr_as_intrinsic(instr));
+		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
 		break;
 	case nir_instr_type_load_const:
 		emit_load_const(ctx, nir_instr_as_load_const(instr));
@@ -1954,8 +1769,6 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
 		case nir_texop_query_levels:
 			emit_tex_query_levels(ctx, tex);
 			break;
-		case nir_texop_samples_identical:
-			unreachable("nir_texop_samples_identical");
 		default:
 			emit_tex(ctx, tex);
 			break;
@@ -2010,7 +1823,7 @@ emit_block(struct ir3_compile *ctx, nir_block *nblock)
 	_mesa_hash_table_destroy(ctx->addr_ht, NULL);
 	ctx->addr_ht = NULL;
 
-	nir_foreach_instr(nblock, instr) {
+	nir_foreach_instr(instr, nblock) {
 		emit_instr(ctx, instr);
 		if (ctx->error)
 			return;
@@ -2136,7 +1949,7 @@ emit_stream_out(struct ir3_compile *ctx)
 		unsigned stride = strmout->stride[i];
 		struct ir3_instruction *base, *off;
 
-		base = create_uniform(ctx, regid(v->first_driver_param + IR3_TFBOS_OFF, i));
+		base = create_uniform(ctx, regid(v->constbase.tfbo, i));
 
 		/* 24-bit should be enough: */
 		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
				create_immed(ctx->block, stride * 4), 0);
@@ -2170,6 +1983,8 @@ static void
 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 {
+	nir_metadata_require(impl, nir_metadata_block_index);
+
 	emit_cf_list(ctx, &impl->body);
 	emit_block(ctx, impl->end_block);
 
@@ -2189,7 +2004,8 @@ emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 	 * out, we guarantee that all exit paths flow into the stream-
 	 * out instructions.
 	 */
-	if ((ctx->so->shader->stream_output.num_outputs > 0) &&
+	if ((ctx->compiler->gpu_id < 500) &&
+			(ctx->so->shader->stream_output.num_outputs > 0) &&
 			!ctx->so->key.binning_pass) {
 		debug_assert(ctx->so->type == SHADER_VERTEX);
 		emit_stream_out(ctx);
@@ -2210,6 +2026,10 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 	DBG("; in: slot=%u, len=%ux%u, drvloc=%u",
 			slot, array_len, ncomp, n);
 
+	/* let's pretend things other than vec4 don't exist: */
+	ncomp = MAX2(ncomp, 4);
+	compile_assert(ctx, ncomp == 4);
+
 	so->inputs[n].slot = slot;
 	so->inputs[n].compmask = (1 << ncomp) - 1;
 	so->inputs_count = MAX2(so->inputs_count, n + 1);
@@ -2224,10 +2044,18 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 				so->inputs[n].bary = false;
 				so->frag_coord = true;
 				instr = create_frag_coord(ctx, i);
-			} else if (slot == VARYING_SLOT_FACE) {
-				so->inputs[n].bary = false;
-				so->frag_face = true;
-				instr = create_frag_face(ctx, i);
+			} else if (slot == VARYING_SLOT_PNTC) {
+				/* see for example st_get_generic_varying_index().. this is
+				 * maybe a bit mesa/st specific.  But we need things to line
+				 * up for this in fdN_program:
+				 *    unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
+				 *    if (emit->sprite_coord_enable & texmask) {
+				 *       ...
+				 *    }
+				 */
+				so->inputs[n].slot = VARYING_SLOT_VAR8;
+				so->inputs[n].bary = true;
+				instr = create_frag_input(ctx, false);
 			} else {
 				bool use_ldlv = false;
 
@@ -2235,7 +2063,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 				 * we need to do flat vs smooth shading depending on
 				 * rast state:
 				 */
-				if (in->data.interpolation == INTERP_QUALIFIER_NONE) {
+				if (in->data.interpolation == INTERP_MODE_NONE) {
 					switch (slot) {
 					case VARYING_SLOT_COL0:
 					case VARYING_SLOT_COL1:
@@ -2249,7 +2077,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 				}
 
 				if (ctx->flat_bypass) {
-					if ((so->inputs[n].interpolate == INTERP_QUALIFIER_FLAT) ||
+					if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
 							(so->inputs[n].rasterflat && ctx->so->key.rasterflat))
 						use_ldlv = true;
 				}
@@ -2259,11 +2087,14 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 				instr = create_frag_input(ctx, use_ldlv);
 			}
 
+			compile_assert(ctx, idx < ctx->ir->ninputs);
+
 			ctx->ir->inputs[idx] = instr;
 		}
 	} else if (ctx->so->type == SHADER_VERTEX) {
 		for (int i = 0; i < ncomp; i++) {
 			unsigned idx = (n * 4) + i;
+			compile_assert(ctx, idx < ctx->ir->ninputs);
 			ctx->ir->inputs[idx] = create_input(ctx->block, idx);
 		}
 	} else {
@@ -2288,6 +2119,10 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
 	DBG("; out: slot=%u, len=%ux%u, drvloc=%u",
 			slot, array_len, ncomp, n);
 
+	/* let's pretend things other than vec4 don't exist: */
+	ncomp = MAX2(ncomp, 4);
+	compile_assert(ctx, ncomp == 4);
+
 	if (ctx->so->type == SHADER_FRAGMENT) {
 		switch (slot) {
 		case FRAG_RESULT_DEPTH:
@@ -2318,6 +2153,7 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
 		case VARYING_SLOT_FOGC:
 		case VARYING_SLOT_CLIP_DIST0:
 		case VARYING_SLOT_CLIP_DIST1:
+		case VARYING_SLOT_CLIP_VERTEX:
 			break;
 		default:
 			if (slot >= VARYING_SLOT_VAR0)
@@ -2339,32 +2175,34 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
 
 	for (int i = 0; i < ncomp; i++) {
 		unsigned idx = (n * 4) + i;
-
+		compile_assert(ctx, idx < ctx->ir->noutputs);
 		ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
 	}
 }
 
+static int
+max_drvloc(struct exec_list *vars)
+{
+	int drvloc = -1;
+	nir_foreach_variable(var, vars) {
+		drvloc = MAX2(drvloc, (int)var->data.driver_location);
+	}
+	return drvloc;
+}
+
 static void
 emit_instructions(struct ir3_compile *ctx)
 {
 	unsigned ninputs, noutputs;
-	nir_function_impl *fxn = NULL;
+	nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
 
-	/* Find the main function: */
-	nir_foreach_overload(ctx->s, overload) {
-		compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
-		compile_assert(ctx, overload->impl);
-		fxn = overload->impl;
-		break;
-	}
-
-	ninputs  = exec_list_length(&ctx->s->inputs) * 4;
-	noutputs = exec_list_length(&ctx->s->outputs) * 4;
+	ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
+	noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
 
 	/* or vtx shaders, we need to leave room for sysvals: */
 	if (ctx->so->type == SHADER_VERTEX) {
-		ninputs += 8;
+		ninputs += 16;
 	}
 
 	ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
@@ -2375,7 +2213,7 @@ emit_instructions(struct ir3_compile *ctx)
 	list_addtail(&ctx->block->node, &ctx->ir->block_list);
 
 	if (ctx->so->type == SHADER_VERTEX) {
-		ctx->ir->ninputs -= 8;
+		ctx->ir->ninputs -= 16;
 	}
 
 	/* for fragment shader, we have a single input register (usually
@@ -2384,7 +2222,7 @@ emit_instructions(struct ir3_compile *ctx)
 	if (ctx->so->type == SHADER_FRAGMENT) {
 		// TODO maybe a helper for fi since we need it a few places..
 		struct ir3_instruction *instr;
-		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
+		instr = ir3_instr_create(ctx->block, OPC_META_FI);
 		ir3_reg_create(instr, 0, 0);
 		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
 		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
@@ -2401,11 +2239,17 @@ emit_instructions(struct ir3_compile *ctx)
 		setup_output(ctx, var);
 	}
 
-	/* Setup variables (which should only be arrays): */
+	/* Setup global variables (which should only be arrays): */
 	nir_foreach_variable(var, &ctx->s->globals) {
 		declare_var(ctx, var);
 	}
 
+	/* Setup local variables (which should only be arrays): */
+	/* NOTE: need to do something more clever when we support >1 fxn */
+	nir_foreach_variable(var, &fxn->locals) {
+		declare_var(ctx, var);
+	}
+
 	/* And emit the body: */
 	ctx->impl = fxn;
 	emit_function(ctx, fxn);
@@ -2487,6 +2331,40 @@ fixup_frag_inputs(struct ir3_compile *ctx)
 	ir->inputs = inputs;
 }
 
+/* Fixup tex sampler state for astc/srgb workaround instructions.  We
+ * need to assign the tex state indexes for these after we know the
+ * max tex index.
+ */
+static void
+fixup_astc_srgb(struct ir3_compile *ctx)
+{
+	struct ir3_shader_variant *so = ctx->so;
+	/* indexed by original tex idx, value is newly assigned alpha sampler
+	 * state tex idx.  Zero is invalid since there is at least one sampler
+	 * if we get here.
+	 */
+	unsigned alt_tex_state[16] = {0};
+	unsigned tex_idx = ctx->max_texture_index + 1;
+	unsigned idx = 0;
+
+	so->astc_srgb.base = tex_idx;
+
+	for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
+		struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
+
+		compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
+
+		if (alt_tex_state[sam->cat5.tex] == 0) {
+			/* assign new alternate/alpha tex state slot: */
+			alt_tex_state[sam->cat5.tex] = tex_idx++;
+			so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
+			so->astc_srgb.count++;
+		}
+
+		sam->cat5.tex = alt_tex_state[sam->cat5.tex];
+	}
+}
+
 int
 ir3_compile_shader_nir(struct ir3_compiler *compiler,
 		struct ir3_shader_variant *so)
@@ -2499,7 +2377,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
 	assert(!so->ir);
 
-	ctx = compile_init(compiler, so, so->shader->tokens);
+	ctx = compile_init(compiler, so);
 	if (!ctx) {
 		DBG("INIT failed!");
 		ret = -1;
@@ -2550,19 +2428,25 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	if (so->key.half_precision) {
 		for (i = 0; i < ir->noutputs; i++) {
 			struct ir3_instruction *out = ir->outputs[i];
+
 			if (!out)
 				continue;
+
+			/* if frag shader writes z, that needs to be full precision: */
+			if (so->outputs[i/4].slot == FRAG_RESULT_DEPTH)
+				continue;
+
 			out->regs[0]->flags |= IR3_REG_HALF;
 			/* output could be a fanout (ie. texture fetch output)
 			 * in which case we need to propagate the half-reg flag
 			 * up to the definer so that RA sees it:
 			 */
-			if (is_meta(out) && (out->opc == OPC_META_FO)) {
+			if (out->opc == OPC_META_FO) {
 				out = out->regs[1]->instr;
 				out->regs[0]->flags |= IR3_REG_HALF;
 			}
 
-			if (out->category == 1) {
+			if (out->opc == OPC_MOV) {
 				out->cat1.dst_type = half_type(out->cat1.dst_type);
 			}
 		}
@@ -2573,7 +2457,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 		ir3_print(ir);
 	}
 
-	ir3_cp(ir);
+	ir3_cp(ir, so);
 
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("BEFORE GROUPING:\n");
@@ -2617,21 +2501,15 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	/* fixup input/outputs: */
 	for (i = 0; i < so->outputs_count; i++) {
 		so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
-		/* preserve hack for depth output.. tgsi writes depth to .z,
-		 * but what we give the hw is the scalar register:
-		 */
-		if ((so->type == SHADER_FRAGMENT) &&
-			(so->outputs[i].slot == FRAG_RESULT_DEPTH))
-			so->outputs[i].regid += 2;
 	}
 
 	/* Note that some or all channels of an input may be unused: */
 	actual_in = 0;
 	inloc = 0;
 	for (i = 0; i < so->inputs_count; i++) {
-		unsigned j, regid = ~0, compmask = 0;
+		unsigned j, regid = ~0, compmask = 0, maxcomp = 0;
 		so->inputs[i].ncomp = 0;
-		so->inputs[i].inloc = inloc + 8;
+		so->inputs[i].inloc = inloc;
 		for (j = 0; j < 4; j++) {
 			struct ir3_instruction *in = inputs[(i*4) + j];
 			if (in && !(in->flags & IR3_INSTR_UNUSED)) {
@@ -2642,16 +2520,24 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 				if ((so->type == SHADER_FRAGMENT) && so->inputs[i].bary) {
 					/* assign inloc: */
 					assert(in->regs[1]->flags & IR3_REG_IMMED);
-					in->regs[1]->iim_val = inloc++;
+					in->regs[1]->iim_val = inloc + j;
+					maxcomp = j + 1;
 				}
 			}
 		}
-		if ((so->type == SHADER_FRAGMENT) && compmask && so->inputs[i].bary)
+		if ((so->type == SHADER_FRAGMENT) && compmask && so->inputs[i].bary) {
 			so->varying_in++;
+			so->inputs[i].compmask = (1 << maxcomp) - 1;
+			inloc += maxcomp;
+		} else {
+			so->inputs[i].compmask = compmask;
+		}
 		so->inputs[i].regid = regid;
-		so->inputs[i].compmask = compmask;
 	}
 
+	if (ctx->astc_srgb)
+		fixup_astc_srgb(ctx);
+
 	/* We need to do legalize after (for frag shader's) the "bary.f"
 	 * offsets (inloc) have been assigned.
 	 */
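
A note on the constant layout that compile_init() builds in this patch: each section is aligned to a vec4 (4 scalar slots), and UBO/stream-out pointers each occupy ptrsz slots (2 on a5xx+, 1 earlier). The following is a minimal standalone sketch of the same constoff arithmetic, not code from the patch; num_uniforms, num_ubos, dp_count and so_buffers are made-up stand-ins for ctx->s->num_uniforms, info->num_ubos, IR3_DP_COUNT and PIPE_MAX_SO_BUFFERS:

#include <stdio.h>

/* round scalar-slot counts up to a vec4 boundary, like align(x, 4)
 * in compile_init(): */
static unsigned align4(unsigned x) { return (x + 3) & ~3u; }

int main(void)
{
	/* assumed example values, not from the patch: */
	unsigned gpu_id = 530, num_uniforms = 10, num_ubos = 2;
	unsigned dp_count = 8, so_buffers = 4;
	unsigned ptrsz = (gpu_id >= 500) ? 2 : 1;   /* pointer_size() */

	unsigned constoff = align4(num_uniforms);   /* user consts first */
	unsigned ubo = ~0u, driver_param = ~0u, tfbo = ~0u;

	if (num_ubos > 0) {                         /* UBO base addresses */
		ubo = constoff;
		constoff += align4(num_ubos * ptrsz) / 4;
	}
	driver_param = constoff;                    /* vertex shader only */
	constoff += align4(dp_count) / 4;
	if (gpu_id < 500 && so_buffers > 0) {       /* stream-out bases */
		tfbo = constoff;
		constoff += align4(so_buffers * ptrsz) / 4;
	}
	printf("ubo=%u dp=%u tfbo=%u immediate=%u\n",
			ubo, driver_param, tfbo, constoff);
	return 0;
}

With gpu_id >= 500 two UBO pointers fit in one vec4 and the tfbo section is skipped entirely, matching the (compiler->gpu_id < 500) guard the patch adds around the stream-out constants.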
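
The ptrsz == 2 path in emit_intrinsic_load_ubo() adds the 32-bit offset only into the low word of the 64-bit UBO base, so it has to propagate a carry into the high word when the low word wraps. A standalone C sketch of that lo/hi arithmetic (illustrative only; on the GPU this is the CMPS.U/ADD.S pair shown in the diff):

#include <assert.h>
#include <stdint.h>

/* add a 32-bit offset to a 64-bit base held as two 32-bit words,
 * bumping the high word when the low word rolls over: */
static uint64_t add_offset_64(uint32_t base_lo, uint32_t base_hi, uint32_t off)
{
	uint32_t addr = base_lo + off;              /* may wrap */
	uint32_t carry = (addr < base_lo) ? 1 : 0;  /* "if (addr < base_lo)" */
	base_hi += carry;                           /* "base_hi++" */
	return ((uint64_t)base_hi << 32) | addr;
}

int main(void)
{
	/* 0xfffffff0 + 0x20 wraps the low word, so the high word bumps: */
	assert(add_offset_64(0xfffffff0u, 1, 0x20u) == 0x200000010ull);
	/* no wrap, no carry: */
	assert(add_offset_64(0x1000u, 1, 0x20u) == 0x100001020ull);
	return 0;
}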
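
Finally, fixup_astc_srgb() in this patch assigns each distinct texture needing the astc+srgb workaround one extra "alpha" tex state slot after max_texture_index, reusing the slot when the same texture appears again. A hypothetical run of that assignment logic, where sam_tex[] stands in for the cat5.tex fields collected in ir->astc_srgb (example data, not from the patch):

#include <stdio.h>

int main(void)
{
	unsigned sam_tex[] = { 3, 1, 3, 1, 1 };  /* assumed workaround samples */
	unsigned n = sizeof(sam_tex) / sizeof(sam_tex[0]);
	unsigned max_texture_index = 3;
	unsigned alt_tex_state[16] = {0};        /* 0 == not yet assigned */
	unsigned tex_idx = max_texture_index + 1;

	for (unsigned i = 0; i < n; i++) {
		if (alt_tex_state[sam_tex[i]] == 0)
			alt_tex_state[sam_tex[i]] = tex_idx++;  /* new alpha slot */
		sam_tex[i] = alt_tex_state[sam_tex[i]];     /* retarget sample */
	}

	for (unsigned i = 0; i < n; i++)
		printf("%u ", sam_tex[i]);           /* prints: 4 5 4 5 5 */
	printf("\n");
	return 0;
}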