diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c
index 1aab7396c3e..2d1ed21d9e3 100644
--- a/src/freedreno/ir3/ir3_context.c
+++ b/src/freedreno/ir3/ir3_context.c
@@ -24,8 +24,6 @@
  * Rob Clark
  */
 
-#include "util/u_math.h"
-
 #include "ir3_compiler.h"
 #include "ir3_context.h"
 #include "ir3_image.h"
@@ -65,6 +63,8 @@ ir3_context_init(struct ir3_compiler *compiler,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
 	ctx->block_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
+	ctx->sel_cond_conversions = _mesa_hash_table_create(ctx,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
 
 	/* TODO: maybe generate some sort of bitmask of what key
 	 * lowers vs what shader has (ie. no need to lower
@@ -73,97 +73,90 @@ ir3_context_init(struct ir3_compiler *compiler,
 	 * creating duplicate variants..
 	 */
 
-	if (ir3_key_lowers_nir(&so->key)) {
-		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
-		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
-	} else {
-		/* fast-path for shader key that lowers nothing in NIR: */
-		ctx->s = nir_shader_clone(ctx, so->shader->nir);
-	}
+	ctx->s = nir_shader_clone(ctx, so->shader->nir);
+	ir3_nir_lower_variant(so, ctx->s);
 
 	/* this needs to be the last pass run, so do this here instead of
 	 * in ir3_optimize_nir():
 	 */
-	NIR_PASS_V(ctx->s, nir_lower_bool_to_int32);
-	NIR_PASS_V(ctx->s, nir_lower_locals_to_regs);
-	NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
-
-	if (ir3_shader_debug & IR3_DBG_DISASM) {
-		DBG("dump nir%dv%d: type=%d, k={cts=%u,hp=%u}",
-			so->shader->id, so->id, so->type,
-			so->key.color_two_side, so->key.half_precision);
-		nir_print_shader(ctx->s, stdout);
+	bool progress = false;
+	NIR_PASS(progress, ctx->s, nir_lower_locals_to_regs);
+
+	/* we may need cleanup after nir_lower_locals_to_regs: */
+	while (progress) {
+		progress = false;
+		NIR_PASS(progress, ctx->s, nir_opt_algebraic);
+		NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
 	}
 
-	if (shader_debug_enabled(so->type)) {
-		fprintf(stderr, "NIR (final form) for %s shader:\n",
-			_mesa_shader_stage_to_string(so->type));
-		nir_print_shader(ctx->s, stderr);
+	/* We want to lower nir_op_imul as late as possible, to also catch
+	 * those generated by earlier passes (e.g., nir_lower_locals_to_regs).
+	 * However, we want a final swing of a few passes afterwards to have
+	 * a chance at optimizing the result.
+	 */
+	progress = false;
+	NIR_PASS(progress, ctx->s, ir3_nir_lower_imul);
+	while (progress) {
+		progress = false;
+		NIR_PASS(progress, ctx->s, nir_opt_algebraic);
+		NIR_PASS(progress, ctx->s, nir_opt_copy_prop_vars);
+		NIR_PASS(progress, ctx->s, nir_opt_dead_write_vars);
+		NIR_PASS(progress, ctx->s, nir_opt_dce);
+		NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
 	}
 
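Both cleanup loops above follow the same fixed-point idiom: keep re-running the passes until none of them reports progress. A minimal sketch of that idiom in isolation, using the same NIR passes as the hunk (the helper name itself is hypothetical):

	static void
	run_cleanup_until_stable(nir_shader *s)
	{
		bool progress;
		do {
			/* each NIR_PASS() ORs into 'progress' if the pass
			 * changed anything:
			 */
			progress = false;
			NIR_PASS(progress, s, nir_opt_algebraic);
			NIR_PASS(progress, s, nir_opt_constant_folding);
		} while (progress);
	}
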
-	ir3_nir_scan_driver_consts(ctx->s, &so->const_layout);
-
-	so->num_uniforms = ctx->s->num_uniforms;
-	so->num_ubos = ctx->s->info.num_ubos;
+	/* Enable the texture pre-fetch feature only on a4xx and later.  But
+	 * only enable it on generations that have been tested:
+	 */
+	if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gpu_id >= 600))
+		NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);
 
-	ir3_ibo_mapping_init(&so->image_mapping, ctx->s->info.num_textures);
+	NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
 
-	/* Layout of constant registers, each section aligned to vec4.  Note
-	 * that pointer size (ubo, etc) changes depending on generation.
+	/* Super crude heuristic to limit # of tex prefetch in small
+	 * shaders.  This completely ignores loops.. but that's really
+	 * not the worst of its problems.  (A frag shader that has
+	 * loops is probably going to be big enough to not trigger a
+	 * lower threshold.)
 	 *
-	 *    user consts
-	 *    UBO addresses
-	 *    SSBO sizes
-	 *    if (vertex shader) {
-	 *        driver params (IR3_DP_*)
-	 *        if (stream_output.num_outputs > 0)
-	 *           stream-out addresses
-	 *    }
-	 *    immediates
+	 * 1) probably want to do this in terms of ir3 instructions
+	 * 2) probably really want to decide this after scheduling
+	 *    (or at least pre-RA sched) so we have a rough idea about
+	 *    nops, and don't count things that get cp'd away
+	 * 3) blob seems to use higher thresholds with a mix of more
+	 *    SFU instructions.  Which partly makes sense: more SFU
+	 *    instructions probably means you want to get the real
+	 *    shader started sooner, but that would depend on where in
+	 *    the shader the SFU instructions are, which the blob
+	 *    doesn't seem to consider.
 	 *
-	 * Immediates go last mostly because they are inserted in the CP pass
-	 * after the nir -> ir3 frontend.
+	 * This uses more conservative thresholds, assuming a more
+	 * ALU- than SFU-heavy instruction mix.
 	 */
-	unsigned constoff = align(ctx->s->num_uniforms, 4);
-	unsigned ptrsz = ir3_pointer_size(ctx);
-
-	memset(&so->constbase, ~0, sizeof(so->constbase));
-
-	if (so->num_ubos > 0) {
-		so->constbase.ubo = constoff;
-		constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4;
-	}
-
-	if (so->const_layout.ssbo_size.count > 0) {
-		unsigned cnt = so->const_layout.ssbo_size.count;
-		so->constbase.ssbo_sizes = constoff;
-		constoff += align(cnt, 4) / 4;
-	}
+	if (so->type == MESA_SHADER_FRAGMENT) {
+		nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
 
-	if (so->const_layout.image_dims.count > 0) {
-		unsigned cnt = so->const_layout.image_dims.count;
-		so->constbase.image_dims = constoff;
-		constoff += align(cnt, 4) / 4;
-	}
+		unsigned instruction_count = 0;
+		nir_foreach_block (block, fxn) {
+			instruction_count += exec_list_length(&block->instr_list);
+		}
 
-	unsigned num_driver_params = 0;
-	if (so->type == MESA_SHADER_VERTEX) {
-		num_driver_params = IR3_DP_VS_COUNT;
-	} else if (so->type == MESA_SHADER_COMPUTE) {
-		num_driver_params = IR3_DP_CS_COUNT;
+		if (instruction_count < 50) {
+			ctx->prefetch_limit = 2;
+		} else if (instruction_count < 70) {
+			ctx->prefetch_limit = 3;
+		} else {
+			ctx->prefetch_limit = IR3_MAX_SAMPLER_PREFETCH;
+		}
 	}
 
-	so->constbase.driver_param = constoff;
-	constoff += align(num_driver_params, 4) / 4;
-
-	if ((so->type == MESA_SHADER_VERTEX) &&
-			(compiler->gpu_id < 500) &&
-			so->shader->stream_output.num_outputs > 0) {
-		so->constbase.tfbo = constoff;
-		constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
+	if (shader_debug_enabled(so->type)) {
+		fprintf(stdout, "NIR (final form) for %s shader %s:\n",
+			ir3_shader_stage(so), so->shader->nir->info.name);
+		nir_print_shader(ctx->s, stdout);
 	}
 
-	so->constbase.immediate = constoff;
+	ir3_ibo_mapping_init(&so->image_mapping, ctx->s->info.num_textures);
 
 	return ctx;
 }
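The prefetch-limit heuristic above reduces to a straight instruction count over the entrypoint. The same counting as a standalone helper (a sketch; the helper name is hypothetical, while nir_shader_get_entrypoint(), nir_foreach_block() and exec_list_length() are the NIR/util APIs the hunk uses):

	static unsigned
	count_nir_instructions(nir_shader *s)
	{
		nir_function_impl *impl = nir_shader_get_entrypoint(s);
		unsigned count = 0;

		/* walk every block of the entrypoint and sum its
		 * instruction list lengths:
		 */
		nir_foreach_block (block, impl) {
			count += exec_list_length(&block->instr_list);
		}
		return count;
	}

So a 60-instruction fragment shader would get prefetch_limit = 3, and anything from 70 instructions up gets IR3_MAX_SAMPLER_PREFETCH.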
@@ -228,7 +221,7 @@ ir3_get_src(struct ir3_context *ctx, nir_src *src)
 			ralloc_array(ctx, struct ir3_instruction *, num_components);
 
 		if (src->reg.indirect)
-			addr = ir3_get_addr(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
+			addr = ir3_get_addr0(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
 					reg->num_components);
 
 		for (unsigned i = 0; i < num_components; i++) {
@@ -242,7 +235,7 @@ ir3_get_src(struct ir3_context *ctx, nir_src *src)
 }
 
 void
-put_dst(struct ir3_context *ctx, nir_dest *dst)
+ir3_put_dst(struct ir3_context *ctx, nir_dest *dst)
 {
 	unsigned bit_size = nir_dest_bit_size(*dst);
 
@@ -251,17 +244,24 @@ put_dst(struct ir3_context *ctx, nir_dest *dst)
 	 * ir3_cp will clean up the extra mov:
 	 */
 	for (unsigned i = 0; i < ctx->last_dst_n; i++) {
+		if (!ctx->last_dst[i])
+			continue;
 		if (ctx->last_dst[i]->regs[0]->flags & IR3_REG_HIGH) {
 			ctx->last_dst[i] =
 				ir3_MOV(ctx->block, ctx->last_dst[i], TYPE_U32);
 		}
 	}
 
-	if (bit_size < 32) {
+	/* Note: 1-bit bools are stored in 32-bit regs */
+	if (bit_size == 16) {
 		for (unsigned i = 0; i < ctx->last_dst_n; i++) {
 			struct ir3_instruction *dst = ctx->last_dst[i];
-			dst->regs[0]->flags |= IR3_REG_HALF;
-			if (ctx->last_dst[i]->opc == OPC_META_FO)
-				dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF;
+			ir3_set_dst_type(dst, true);
+			ir3_fixup_src_type(dst);
+			if (dst->opc == OPC_META_SPLIT) {
+				ir3_set_dst_type(ssa(dst->regs[1]), true);
+				ir3_fixup_src_type(ssa(dst->regs[1]));
+				dst->regs[1]->flags |= IR3_REG_HALF;
+			}
 		}
 	}
 
@@ -272,7 +272,7 @@ put_dst(struct ir3_context *ctx, nir_dest *dst)
 		struct ir3_instruction *addr = NULL;
 
 		if (dst->reg.indirect)
-			addr = ir3_get_addr(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
+			addr = ir3_get_addr0(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
 					reg->num_components);
 
 		for (unsigned i = 0; i < num_components; i++) {
@@ -290,6 +290,12 @@ put_dst(struct ir3_context *ctx, nir_dest *dst)
 	ctx->last_dst_n = 0;
 }
 
+static unsigned
+dest_flags(struct ir3_instruction *instr)
+{
+	return instr->regs[0]->flags & (IR3_REG_HALF | IR3_REG_HIGH);
+}
+
 struct ir3_instruction *
 ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
 		unsigned arrsz)
@@ -300,10 +306,10 @@ ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
 	if (arrsz == 0)
 		return NULL;
 
-	unsigned flags = arr[0]->regs[0]->flags & IR3_REG_HALF;
+	unsigned flags = dest_flags(arr[0]);
 
-	collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
-	ir3_reg_create(collect, 0, flags); /* dst */
+	collect = ir3_instr_create2(block, OPC_META_COLLECT, 1 + arrsz);
+	__ssa_dst(collect)->flags |= flags;
 
 	for (unsigned i = 0; i < arrsz; i++) {
 		struct ir3_instruction *elem = arr[i];
@@ -336,15 +342,17 @@ ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
 			elem = ir3_MOV(block, elem, type);
 		}
 
-		compile_assert(ctx, (elem->regs[0]->flags & IR3_REG_HALF) == flags);
-		ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem;
+		compile_assert(ctx, dest_flags(elem) == flags);
+		__ssa_src(collect, elem, flags);
 	}
 
+	collect->regs[0]->wrmask = MASK(arrsz);
+
 	return collect;
 }
 
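A usage sketch for the collect helper, assuming `ctx' and four already-compiled scalar values in `comps':

	/* gather four scalars into one vec4 value (OPC_META_COLLECT): */
	struct ir3_instruction *vec4 = ir3_create_collect(ctx, comps, 4);

	/* break it back into per-component values: */
	struct ir3_instruction *comp[4];
	ir3_split_dest(ctx->block, comp, vec4, 0, 4);

With the OPC_META_COLLECT fast path added to ir3_split_dest() below, this round trip simply hands back the original component instructions instead of emitting OPC_META_SPLIT.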
 /* helper for instructions that produce multiple consecutive scalar
- * outputs which need to have a split/fanout meta instruction inserted
+ * outputs which need to have a split meta instruction inserted
  */
 void
 ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
@@ -357,13 +365,24 @@ ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
 		return;
 	}
 
-	unsigned flags = src->regs[0]->flags & (IR3_REG_HALF | IR3_REG_HIGH);
+	if (src->opc == OPC_META_COLLECT) {
+		debug_assert((base + n) < src->regs_count);
+
+		for (int i = 0; i < n; i++) {
+			dst[i] = ssa(src->regs[i + base + 1]);
+		}
+
+		return;
+	}
+
+	unsigned flags = dest_flags(src);
 
 	for (int i = 0, j = 0; i < n; i++) {
-		struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
-		ir3_reg_create(split, 0, IR3_REG_SSA | flags);
-		ir3_reg_create(split, 0, IR3_REG_SSA | flags)->instr = src;
-		split->fo.off = i + base;
+		struct ir3_instruction *split =
+				ir3_instr_create(block, OPC_META_SPLIT);
+		__ssa_dst(split)->flags |= flags;
+		__ssa_src(split, src, flags);
+		split->split.off = i + base;
 
 		if (prev) {
 			split->cp.left = prev;
@@ -378,7 +397,7 @@ ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
 	}
 }
 
-void
+NORETURN void
 ir3_context_error(struct ir3_context *ctx, const char *format, ...)
 {
 	struct hash_table *errors = NULL;
@@ -397,19 +416,15 @@ ir3_context_error(struct ir3_context *ctx, const char *format, ...)
 	nir_print_shader_annotated(ctx->s, stdout, errors);
 	ralloc_free(errors);
 	ctx->error = true;
-	debug_assert(0);
+	unreachable("");
 }
 
 static struct ir3_instruction *
-create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
+create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align)
 {
 	struct ir3_instruction *instr, *immed;
 
-	/* TODO in at least some cases, the backend could probably be
-	 * made clever enough to propagate IR3_REG_HALF..
-	 */
 	instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
-	instr->regs[0]->flags |= IR3_REG_HALF;
 
 	switch(align){
 	case 1:
@@ -417,41 +432,41 @@ create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
 		break;
 	case 2:
 		/* src *= 2 => src <<= 1: */
-		immed = create_immed(block, 1);
-		immed->regs[0]->flags |= IR3_REG_HALF;
-
+		immed = create_immed_typed(block, 1, TYPE_S16);
 		instr = ir3_SHL_B(block, instr, 0, immed, 0);
-		instr->regs[0]->flags |= IR3_REG_HALF;
-		instr->regs[1]->flags |= IR3_REG_HALF;
 		break;
 	case 3:
 		/* src *= 3: */
-		immed = create_immed(block, 3);
-		immed->regs[0]->flags |= IR3_REG_HALF;
-
+		immed = create_immed_typed(block, 3, TYPE_S16);
 		instr = ir3_MULL_U(block, instr, 0, immed, 0);
-		instr->regs[0]->flags |= IR3_REG_HALF;
-		instr->regs[1]->flags |= IR3_REG_HALF;
 		break;
 	case 4:
 		/* src *= 4 => src <<= 2: */
-		immed = create_immed(block, 2);
-		immed->regs[0]->flags |= IR3_REG_HALF;
-
+		immed = create_immed_typed(block, 2, TYPE_S16);
 		instr = ir3_SHL_B(block, instr, 0, immed, 0);
-		instr->regs[0]->flags |= IR3_REG_HALF;
-		instr->regs[1]->flags |= IR3_REG_HALF;
 		break;
 	default:
 		unreachable("bad align");
 		return NULL;
 	}
 
+	instr->regs[0]->flags |= IR3_REG_HALF;
+
 	instr = ir3_MOV(block, instr, TYPE_S16);
 	instr->regs[0]->num = regid(REG_A0, 0);
-	instr->regs[0]->flags |= IR3_REG_HALF;
-	instr->regs[1]->flags |= IR3_REG_HALF;
+	instr->regs[0]->flags &= ~IR3_REG_SSA;
+
+	return instr;
+}
+
+static struct ir3_instruction *
+create_addr1(struct ir3_block *block, unsigned const_val)
+{
+	struct ir3_instruction *immed =
+			create_immed_typed(block, const_val, TYPE_S16);
+	struct ir3_instruction *instr = ir3_MOV(block, immed, TYPE_S16);
+	instr->regs[0]->num = regid(REG_A0, 1);
+	instr->regs[0]->flags &= ~IR3_REG_SSA;
 
 	return instr;
 }
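Worked example of the align handling above: a relative access into a vec4 array needs a0.x = idx * 4, which create_addr0() emits as idx << 2; a vec3 array has no power-of-two stride, so the *3 case uses mull.u instead. A hypothetical caller, with `idx' an already-compiled ir3 value:

	/* a0.x = idx << 2; cached per-src by ir3_get_addr0() below: */
	struct ir3_instruction *addr = ir3_get_addr0(ctx, idx, 4);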
@@ -459,25 +474,45 @@
  * sequences for each use of a given NIR level src as address
  */
 struct ir3_instruction *
-ir3_get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align)
+ir3_get_addr0(struct ir3_context *ctx, struct ir3_instruction *src, int align)
 {
 	struct ir3_instruction *addr;
 	unsigned idx = align - 1;
 
-	compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht));
+	compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr0_ht));
 
-	if (!ctx->addr_ht[idx]) {
-		ctx->addr_ht[idx] = _mesa_hash_table_create(ctx,
+	if (!ctx->addr0_ht[idx]) {
+		ctx->addr0_ht[idx] = _mesa_hash_table_create(ctx,
 				_mesa_hash_pointer, _mesa_key_pointer_equal);
 	} else {
 		struct hash_entry *entry;
-		entry = _mesa_hash_table_search(ctx->addr_ht[idx], src);
+		entry = _mesa_hash_table_search(ctx->addr0_ht[idx], src);
 		if (entry)
 			return entry->data;
 	}
 
-	addr = create_addr(ctx->block, src, align);
-	_mesa_hash_table_insert(ctx->addr_ht[idx], src, addr);
+	addr = create_addr0(ctx->block, src, align);
+	_mesa_hash_table_insert(ctx->addr0_ht[idx], src, addr);
+
+	return addr;
+}
+
+/* Similar to ir3_get_addr0, but for a1.x. */
+struct ir3_instruction *
+ir3_get_addr1(struct ir3_context *ctx, unsigned const_val)
+{
+	struct ir3_instruction *addr;
+
+	if (!ctx->addr1_ht) {
+		ctx->addr1_ht = _mesa_hash_table_u64_create(ctx);
+	} else {
+		addr = _mesa_hash_table_u64_search(ctx->addr1_ht, const_val);
+		if (addr)
+			return addr;
+	}
+
+	addr = create_addr1(ctx->block, const_val);
+	_mesa_hash_table_u64_insert(ctx->addr1_ht, const_val, addr);
 
 	return addr;
 }
@@ -494,6 +529,7 @@ ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
 
 	/* condition always goes in predicate register: */
 	cond->regs[0]->num = regid(REG_P0, 0);
+	cond->regs[0]->flags &= ~IR3_REG_SSA;
 
 	return cond;
 }
@@ -517,13 +553,17 @@ ir3_declare_array(struct ir3_context *ctx, nir_register *reg)
 	arr->length = reg->num_components * MAX2(1, reg->num_array_elems);
 	compile_assert(ctx, arr->length > 0);
 	arr->r = reg;
+	arr->half = reg->bit_size <= 16;
+	// HACK one-bit bools still end up as 32b:
+	if (reg->bit_size == 1)
+		arr->half = false;
 	list_addtail(&arr->node, &ctx->ir->array_list);
 }
 
 struct ir3_array *
 ir3_get_array(struct ir3_context *ctx, nir_register *reg)
 {
-	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+	foreach_array (arr, &ctx->ir->array_list) {
 		if (arr->r == reg)
 			return arr;
 	}
@@ -539,15 +579,23 @@ ir3_create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
 	struct ir3_block *block = ctx->block;
 	struct ir3_instruction *mov;
 	struct ir3_register *src;
+	unsigned flags = 0;
 
 	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = TYPE_U32;
-	mov->cat1.dst_type = TYPE_U32;
+	if (arr->half) {
+		mov->cat1.src_type = TYPE_U16;
+		mov->cat1.dst_type = TYPE_U16;
+		flags |= IR3_REG_HALF;
+	} else {
+		mov->cat1.src_type = TYPE_U32;
+		mov->cat1.dst_type = TYPE_U32;
+	}
+
 	mov->barrier_class = IR3_BARRIER_ARRAY_R;
 	mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
-	ir3_reg_create(mov, 0, 0);
+	__ssa_dst(mov)->flags |= flags;
 	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
-			COND(address, IR3_REG_RELATIV));
+			COND(address, IR3_REG_RELATIV) | flags);
 	src->instr = arr->last_write;
 	src->size = arr->length;
 	src->array.id = arr->id;
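The bit_size handling above, restated as a predicate (a sketch; the helper name is hypothetical): 16-bit NIR registers get half regs and TYPE_U16 moves, while 1-bit booleans stay in full 32-bit regs on ir3.

	static bool
	array_is_half(const nir_register *reg)
	{
		/* one-bit bools still end up as 32b, per the HACK above: */
		return reg->bit_size <= 16 && reg->bit_size != 1;
	}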
@@ -567,11 +615,15 @@ ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
 	struct ir3_block *block = ctx->block;
 	struct ir3_instruction *mov;
 	struct ir3_register *dst;
+	unsigned flags = 0;
 
 	/* if not relative store, don't create an extra mov, since that
 	 * ends up being difficult for cp to remove.
+	 *
+	 * Also, don't skip the mov if the src is meta (like fanout/split),
+	 * since that creates a situation that RA can't really handle properly.
 	 */
-	if (!address) {
+	if (!address && !is_meta(src)) {
 		dst = src->regs[0];
 
 		src->barrier_class |= IR3_BARRIER_ARRAY_W;
@@ -591,17 +643,24 @@ ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
 	}
 
 	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = TYPE_U32;
-	mov->cat1.dst_type = TYPE_U32;
+	if (arr->half) {
+		mov->cat1.src_type = TYPE_U16;
+		mov->cat1.dst_type = TYPE_U16;
+		flags |= IR3_REG_HALF;
+	} else {
+		mov->cat1.src_type = TYPE_U32;
+		mov->cat1.dst_type = TYPE_U32;
+	}
 	mov->barrier_class = IR3_BARRIER_ARRAY_W;
 	mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
 	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			flags |
 			COND(address, IR3_REG_RELATIV));
 	dst->instr = arr->last_write;
 	dst->size = arr->length;
 	dst->array.id = arr->id;
 	dst->array.offset = n;
-	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
+	ir3_reg_create(mov, 0, IR3_REG_SSA | flags)->instr = src;
 
 	if (address)
 		ir3_instr_set_address(mov, address);
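Putting the array helpers together, a hypothetical dynamically indexed store, where `idx' and `val' are already-compiled ir3 values and `arr' comes from ir3_get_array():

	/* a0.x = idx (align 1: the index is in scalar components): */
	struct ir3_instruction *addr = ir3_get_addr0(ctx, idx, 1);

	/* arr[a0.x + 0] = val, emitted as a relative array store: */
	ir3_create_array_store(ctx, arr, 0, val, addr);

Because `address' is non-NULL, the store keeps the extra mov, marks the dst register IR3_REG_RELATIV, and records the a0.x dependency via ir3_instr_set_address().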