* Rob Clark <robclark@freedesktop.org>
*/
-#include "util/u_math.h"
-
#include "ir3_compiler.h"
#include "ir3_context.h"
+#include "ir3_image.h"
#include "ir3_shader.h"
#include "ir3_nir.h"
}
}
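+	/* generation-specific emit helpers, mainly for SSBO/image
+	 * intrinsics (a6xx grew new instruction encodings for these,
+	 * so it gets its own set of funcs):
+	 */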
- if (compiler->gpu_id >= 400) {
+ if (compiler->gpu_id >= 600) {
+ ctx->funcs = &ir3_a6xx_funcs;
+ } else if (compiler->gpu_id >= 400) {
ctx->funcs = &ir3_a4xx_funcs;
}
_mesa_hash_pointer, _mesa_key_pointer_equal);
ctx->block_ht = _mesa_hash_table_create(ctx,
_mesa_hash_pointer, _mesa_key_pointer_equal);
+ ctx->sel_cond_conversions = _mesa_hash_table_create(ctx,
+ _mesa_hash_pointer, _mesa_key_pointer_equal);
	/* TODO: maybe generate some sort of bitmask of what key
	 * lowers vs what the shader uses, so that we can skip the
	 * lowering (and avoid creating duplicate variants) when the
	 * key doesn't actually change anything:
	 */
- if (ir3_key_lowers_nir(&so->key)) {
- nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
- ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
- } else {
- /* fast-path for shader key that lowers nothing in NIR: */
- ctx->s = nir_shader_clone(ctx, so->shader->nir);
- }
+ ctx->s = nir_shader_clone(ctx, so->shader->nir);
+ ir3_nir_lower_variant(so, ctx->s);
/* this needs to be the last pass run, so do this here instead of
* in ir3_optimize_nir():
*/
- NIR_PASS_V(ctx->s, nir_lower_bool_to_int32);
- NIR_PASS_V(ctx->s, nir_lower_locals_to_regs);
- NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
-
- if (ir3_shader_debug & IR3_DBG_DISASM) {
- DBG("dump nir%dv%d: type=%d, k={cts=%u,hp=%u}",
- so->shader->id, so->id, so->type,
- so->key.color_two_side, so->key.half_precision);
- nir_print_shader(ctx->s, stdout);
+ bool progress = false;
+ NIR_PASS(progress, ctx->s, nir_lower_locals_to_regs);
+
+	/* we may need further cleanup after nir_lower_locals_to_regs: */
+ while (progress) {
+ progress = false;
+ NIR_PASS(progress, ctx->s, nir_opt_algebraic);
+ NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
}
- if (shader_debug_enabled(so->type)) {
- fprintf(stderr, "NIR (final form) for %s shader:\n",
- _mesa_shader_stage_to_string(so->type));
- nir_print_shader(ctx->s, stderr);
+	/* We want to lower nir_op_imul as late as possible, so that we
+	 * also catch those generated by earlier passes (e.g.,
+	 * nir_lower_locals_to_regs).  However, we still want a final
+	 * swing of a few optimization passes afterwards, to have a
+	 * chance at cleaning up the result.
+	 */
+ progress = false;
+ NIR_PASS(progress, ctx->s, ir3_nir_lower_imul);
+ while (progress) {
+ progress = false;
+ NIR_PASS(progress, ctx->s, nir_opt_algebraic);
+ NIR_PASS(progress, ctx->s, nir_opt_copy_prop_vars);
+ NIR_PASS(progress, ctx->s, nir_opt_dead_write_vars);
+ NIR_PASS(progress, ctx->s, nir_opt_dce);
+ NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
}
- ir3_nir_scan_driver_consts(ctx->s, &so->const_layout);
+	/* The texture pre-fetch feature exists from a4xx onwards, but
+	 * only enable it on the generations where it has actually been
+	 * tested (currently a6xx):
+	 */
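+	/* (Roughly, the pass lets the hw start fetching textures whose
+	 * coordinates come directly from varyings before the shader
+	 * itself starts executing; see ir3_nir_lower_tex_prefetch.c for
+	 * the exact eligibility criteria.)
+	 */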
+ if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gpu_id >= 600))
+ NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);
- so->num_uniforms = ctx->s->num_uniforms;
- so->num_ubos = ctx->s->info.num_ubos;
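+	/* convert remaining phi webs to nir registers; everything else
+	 * stays in SSA form for the frontend (the 'true' arg is
+	 * phi_webs_only):
+	 */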
+ NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
- /* Layout of constant registers, each section aligned to vec4. Note
- * that pointer size (ubo, etc) changes depending on generation.
+	/* Super crude heuristic to limit the number of tex prefetches
+	 * in small shaders.  This completely ignores loops.. but that's
+	 * really not the worst of its problems.  (A frag shader that
+	 * has loops is probably going to be big enough to not trigger
+	 * the lower thresholds anyway.)
*
- * user consts
- * UBO addresses
- * SSBO sizes
- * if (vertex shader) {
- * driver params (IR3_DP_*)
- * if (stream_output.num_outputs > 0)
- * stream-out addresses
- * }
- * immediates
+ * 1) probably want to do this in terms of ir3 instructions
+ * 2) probably really want to decide this after scheduling
+ * (or at least pre-RA sched) so we have a rough idea about
+ * nops, and don't count things that get cp'd away
+	 * 3) the blob seems to use higher thresholds when the mix has
+	 *    more SFU instructions.  That partly makes sense: more SFU
+	 *    instructions probably means you want to get the real
+	 *    shader started sooner.  But that reasoning depends on
+	 *    where in the shader the SFU instructions are, which the
+	 *    blob doesn't seem to take into account.
*
- * Immediates go last mostly because they are inserted in the CP pass
- * after the nir -> ir3 frontend.
+	 * This uses the more conservative thresholds, assuming an
+	 * instruction mix that is more ALU-heavy than SFU-heavy.
*/
- unsigned constoff = align(ctx->s->num_uniforms, 4);
- unsigned ptrsz = ir3_pointer_size(ctx);
+ if (so->type == MESA_SHADER_FRAGMENT) {
+ nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
- memset(&so->constbase, ~0, sizeof(so->constbase));
-
- if (so->num_ubos > 0) {
- so->constbase.ubo = constoff;
- constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4;
- }
-
- if (so->const_layout.ssbo_size.count > 0) {
- unsigned cnt = so->const_layout.ssbo_size.count;
- so->constbase.ssbo_sizes = constoff;
- constoff += align(cnt, 4) / 4;
- }
-
- if (so->const_layout.image_dims.count > 0) {
- unsigned cnt = so->const_layout.image_dims.count;
- so->constbase.image_dims = constoff;
- constoff += align(cnt, 4) / 4;
- }
+ unsigned instruction_count = 0;
+ nir_foreach_block (block, fxn) {
+ instruction_count += exec_list_length(&block->instr_list);
+ }
- unsigned num_driver_params = 0;
- if (so->type == MESA_SHADER_VERTEX) {
- num_driver_params = IR3_DP_VS_COUNT;
- } else if (so->type == MESA_SHADER_COMPUTE) {
- num_driver_params = IR3_DP_CS_COUNT;
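+		/* e.g. a ~40 instruction shader gets at most two
+		 * prefetches, a ~60 instruction one gets three, and
+		 * anything larger gets the hw maximum:
+		 */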
+ if (instruction_count < 50) {
+ ctx->prefetch_limit = 2;
+ } else if (instruction_count < 70) {
+ ctx->prefetch_limit = 3;
+ } else {
+ ctx->prefetch_limit = IR3_MAX_SAMPLER_PREFETCH;
+ }
}
- so->constbase.driver_param = constoff;
- constoff += align(num_driver_params, 4) / 4;
-
- if ((so->type == MESA_SHADER_VERTEX) &&
- (compiler->gpu_id < 500) &&
- so->shader->stream_output.num_outputs > 0) {
- so->constbase.tfbo = constoff;
- constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
+ if (shader_debug_enabled(so->type)) {
+ fprintf(stdout, "NIR (final form) for %s shader %s:\n",
+ ir3_shader_stage(so), so->shader->nir->info.name);
+ nir_print_shader(ctx->s, stdout);
}
- so->constbase.immediate = constoff;
+ ir3_ibo_mapping_init(&so->image_mapping, ctx->s->info.num_textures);
return ctx;
}
ralloc_array(ctx, struct ir3_instruction *, num_components);
if (src->reg.indirect)
- addr = ir3_get_addr(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
+ addr = ir3_get_addr0(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
reg->num_components);
for (unsigned i = 0; i < num_components; i++) {
unsigned n = src->reg.base_offset * reg->num_components + i;
compile_assert(ctx, n < arr->length);
- value[i] = ir3_create_array_load(ctx, arr, n, addr);
+ value[i] = ir3_create_array_load(ctx, arr, n, addr, reg->bit_size);
}
return value;
}
void
-put_dst(struct ir3_context *ctx, nir_dest *dst)
+ir3_put_dst(struct ir3_context *ctx, nir_dest *dst)
{
unsigned bit_size = nir_dest_bit_size(*dst);
- if (bit_size < 32) {
+	/* Add an extra mov if the dst value is a HIGH reg: not all
+	 * instructions can read from HIGH regs, and in the cases where
+	 * they can, ir3_cp will clean up the extra mov:
+	 */
+ for (unsigned i = 0; i < ctx->last_dst_n; i++) {
+ if (!ctx->last_dst[i])
+ continue;
+ if (ctx->last_dst[i]->regs[0]->flags & IR3_REG_HIGH) {
+ ctx->last_dst[i] = ir3_MOV(ctx->block, ctx->last_dst[i], TYPE_U32);
+ }
+ }
+
+ /* Note: 1-bit bools are stored in 32-bit regs */
+ if (bit_size == 16) {
for (unsigned i = 0; i < ctx->last_dst_n; i++) {
struct ir3_instruction *dst = ctx->last_dst[i];
- dst->regs[0]->flags |= IR3_REG_HALF;
- if (ctx->last_dst[i]->opc == OPC_META_FO)
- dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF;
+ ir3_set_dst_type(dst, true);
+ ir3_fixup_src_type(dst);
+ if (dst->opc == OPC_META_SPLIT) {
+ ir3_set_dst_type(ssa(dst->regs[1]), true);
+ ir3_fixup_src_type(ssa(dst->regs[1]));
+ dst->regs[1]->flags |= IR3_REG_HALF;
+ }
}
}
struct ir3_instruction *addr = NULL;
if (dst->reg.indirect)
- addr = ir3_get_addr(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
+ addr = ir3_get_addr0(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
reg->num_components);
for (unsigned i = 0; i < num_components; i++) {
ralloc_free(ctx->last_dst);
}
+
ctx->last_dst = NULL;
ctx->last_dst_n = 0;
}
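+/* dst register flags (half-precision / HIGH reg) that must be kept
+ * in agreement between a collect/split and the values it gathers or
+ * splits:
+ */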
+static unsigned
+dest_flags(struct ir3_instruction *instr)
+{
+ return instr->regs[0]->flags & (IR3_REG_HALF | IR3_REG_HIGH);
+}
+
struct ir3_instruction *
ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
unsigned arrsz)
if (arrsz == 0)
return NULL;
- unsigned flags = arr[0]->regs[0]->flags & IR3_REG_HALF;
+ unsigned flags = dest_flags(arr[0]);
- collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
- ir3_reg_create(collect, 0, flags); /* dst */
+ collect = ir3_instr_create2(block, OPC_META_COLLECT, 1 + arrsz);
+ __ssa_dst(collect)->flags |= flags;
for (unsigned i = 0; i < arrsz; i++) {
struct ir3_instruction *elem = arr[i];
elem = ir3_MOV(block, elem, type);
}
- compile_assert(ctx, (elem->regs[0]->flags & IR3_REG_HALF) == flags);
- ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem;
+ compile_assert(ctx, dest_flags(elem) == flags);
+ __ssa_src(collect, elem, flags);
}
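+	/* the collect "writes" all of the consecutive components, so
+	 * give the dst the full writemask:
+	 */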
+ collect->regs[0]->wrmask = MASK(arrsz);
+
return collect;
}
/* helper for instructions that produce multiple consecutive scalar
- * outputs which need to have a split/fanout meta instruction inserted
+ * outputs which need to have a split meta instruction inserted
*/
void
ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
return;
}
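+	/* If we are just splitting up the result of a collect, we can
+	 * skip the meta instructions and use the originally collected
+	 * values directly:
+	 */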
+ if (src->opc == OPC_META_COLLECT) {
+ debug_assert((base + n) < src->regs_count);
+
+ for (int i = 0; i < n; i++) {
+ dst[i] = ssa(src->regs[i + base + 1]);
+ }
+
+ return;
+ }
+
+ unsigned flags = dest_flags(src);
+
for (int i = 0, j = 0; i < n; i++) {
- struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
- ir3_reg_create(split, 0, IR3_REG_SSA);
- ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
- split->fo.off = i + base;
+ struct ir3_instruction *split =
+ ir3_instr_create(block, OPC_META_SPLIT);
+ __ssa_dst(split)->flags |= flags;
+ __ssa_src(split, src, flags);
+ split->split.off = i + base;
if (prev) {
split->cp.left = prev;
}
}
-void
+NORETURN void
ir3_context_error(struct ir3_context *ctx, const char *format, ...)
{
struct hash_table *errors = NULL;
nir_print_shader_annotated(ctx->s, stdout, errors);
ralloc_free(errors);
ctx->error = true;
- debug_assert(0);
+ unreachable("");
}
static struct ir3_instruction *
-create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
+create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align)
{
struct ir3_instruction *instr, *immed;
- /* TODO in at least some cases, the backend could probably be
- * made clever enough to propagate IR3_REG_HALF..
- */
instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
- instr->regs[0]->flags |= IR3_REG_HALF;
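+	/* scale the index by the array element size, e.g. indexing into
+	 * an array of vec3 needs a0.x = idx * 3:
+	 */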
switch(align){
case 1:
break;
case 2:
/* src *= 2 => src <<= 1: */
- immed = create_immed(block, 1);
- immed->regs[0]->flags |= IR3_REG_HALF;
-
+ immed = create_immed_typed(block, 1, TYPE_S16);
instr = ir3_SHL_B(block, instr, 0, immed, 0);
- instr->regs[0]->flags |= IR3_REG_HALF;
- instr->regs[1]->flags |= IR3_REG_HALF;
break;
case 3:
/* src *= 3: */
- immed = create_immed(block, 3);
- immed->regs[0]->flags |= IR3_REG_HALF;
-
+ immed = create_immed_typed(block, 3, TYPE_S16);
instr = ir3_MULL_U(block, instr, 0, immed, 0);
- instr->regs[0]->flags |= IR3_REG_HALF;
- instr->regs[1]->flags |= IR3_REG_HALF;
break;
case 4:
/* src *= 4 => src <<= 2: */
- immed = create_immed(block, 2);
- immed->regs[0]->flags |= IR3_REG_HALF;
-
+ immed = create_immed_typed(block, 2, TYPE_S16);
instr = ir3_SHL_B(block, instr, 0, immed, 0);
- instr->regs[0]->flags |= IR3_REG_HALF;
- instr->regs[1]->flags |= IR3_REG_HALF;
break;
default:
unreachable("bad align");
return NULL;
}
+ instr->regs[0]->flags |= IR3_REG_HALF;
+
instr = ir3_MOV(block, instr, TYPE_S16);
instr->regs[0]->num = regid(REG_A0, 0);
- instr->regs[0]->flags |= IR3_REG_HALF;
- instr->regs[1]->flags |= IR3_REG_HALF;
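+	/* the mov is pinned to the a0.x hw register, so its dst is no
+	 * longer an SSA value:
+	 */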
+ instr->regs[0]->flags &= ~IR3_REG_SSA;
+
+ return instr;
+}
+
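+/* a1.x is (so far) only ever loaded with an immediate offset, so
+ * unlike a0.x it is created from (and cached by) a constant value
+ * rather than an SSA source:
+ */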
+static struct ir3_instruction *
+create_addr1(struct ir3_block *block, unsigned const_val)
+{
+ struct ir3_instruction *immed = create_immed_typed(block, const_val, TYPE_S16);
+ struct ir3_instruction *instr = ir3_MOV(block, immed, TYPE_S16);
+ instr->regs[0]->num = regid(REG_A0, 1);
+ instr->regs[0]->flags &= ~IR3_REG_SSA;
return instr;
}
* sequences for each use of a given NIR level src as address
*/
struct ir3_instruction *
-ir3_get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align)
+ir3_get_addr0(struct ir3_context *ctx, struct ir3_instruction *src, int align)
{
struct ir3_instruction *addr;
unsigned idx = align - 1;
- compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht));
+ compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr0_ht));
- if (!ctx->addr_ht[idx]) {
- ctx->addr_ht[idx] = _mesa_hash_table_create(ctx,
+ if (!ctx->addr0_ht[idx]) {
+ ctx->addr0_ht[idx] = _mesa_hash_table_create(ctx,
_mesa_hash_pointer, _mesa_key_pointer_equal);
} else {
struct hash_entry *entry;
- entry = _mesa_hash_table_search(ctx->addr_ht[idx], src);
+ entry = _mesa_hash_table_search(ctx->addr0_ht[idx], src);
if (entry)
return entry->data;
}
- addr = create_addr(ctx->block, src, align);
- _mesa_hash_table_insert(ctx->addr_ht[idx], src, addr);
+ addr = create_addr0(ctx->block, src, align);
+ _mesa_hash_table_insert(ctx->addr0_ht[idx], src, addr);
+
+ return addr;
+}
+
+/* Similar to ir3_get_addr0, but for a1.x. */
+struct ir3_instruction *
+ir3_get_addr1(struct ir3_context *ctx, unsigned const_val)
+{
+ struct ir3_instruction *addr;
+
+ if (!ctx->addr1_ht) {
+ ctx->addr1_ht = _mesa_hash_table_u64_create(ctx);
+ } else {
+ addr = _mesa_hash_table_u64_search(ctx->addr1_ht, const_val);
+ if (addr)
+ return addr;
+ }
+
+ addr = create_addr1(ctx->block, const_val);
+ _mesa_hash_table_u64_insert(ctx->addr1_ht, const_val, addr);
return addr;
}
/* condition always goes in predicate register: */
cond->regs[0]->num = regid(REG_P0, 0);
+ cond->regs[0]->flags &= ~IR3_REG_SSA;
return cond;
}
struct ir3_array *
ir3_get_array(struct ir3_context *ctx, nir_register *reg)
{
- list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+ foreach_array (arr, &ctx->ir->array_list) {
if (arr->r == reg)
return arr;
}
/* relative (indirect) if address!=NULL */
struct ir3_instruction *
ir3_create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
- struct ir3_instruction *address)
+ struct ir3_instruction *address, unsigned bitsize)
{
struct ir3_block *block = ctx->block;
struct ir3_instruction *mov;
struct ir3_register *src;
+ unsigned flags = 0;
mov = ir3_instr_create(block, OPC_MOV);
- mov->cat1.src_type = TYPE_U32;
- mov->cat1.dst_type = TYPE_U32;
+ if (bitsize == 16) {
+ mov->cat1.src_type = TYPE_U16;
+ mov->cat1.dst_type = TYPE_U16;
+ flags |= IR3_REG_HALF;
+ arr->half = true;
+ } else {
+ mov->cat1.src_type = TYPE_U32;
+ mov->cat1.dst_type = TYPE_U32;
+ }
+
mov->barrier_class = IR3_BARRIER_ARRAY_R;
mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
- ir3_reg_create(mov, 0, 0);
+ __ssa_dst(mov)->flags |= flags;
src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
- COND(address, IR3_REG_RELATIV));
+ COND(address, IR3_REG_RELATIV) | flags);
src->instr = arr->last_write;
src->size = arr->length;
src->array.id = arr->id;
/* if not relative store, don't create an extra mov, since that
* ends up being difficult for cp to remove.
+ *
+ * Also, don't skip the mov if the src is meta (like fanout/split),
+ * since that creates a situation that RA can't really handle properly.
*/
- if (!address) {
+ if (!address && !is_meta(src)) {
dst = src->regs[0];
src->barrier_class |= IR3_BARRIER_ARRAY_W;