*/
ctx->s = nir_shader_clone(ctx, so->shader->nir);
- if (ir3_key_lowers_nir(&so->key))
- ir3_optimize_nir(so->shader, ctx->s, &so->key);
+ ir3_nir_lower_variant(so, ctx->s);
/* this needs to be the last pass run, so do this here instead of
* in ir3_optimize_nir():
*/
- NIR_PASS_V(ctx->s, nir_lower_bool_to_bitsize);
bool progress = false;
NIR_PASS(progress, ctx->s, nir_lower_locals_to_regs);
NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
+ /* Super crude heuristic to limit # of tex prefetch in small
+ * shaders. This completely ignores loops.. but that's really
+ * not the worst of it's problems. (A frag shader that has
+ * loops is probably going to be big enough to not trigger a
+ * lower threshold.)
+ *
+ * 1) probably want to do this in terms of ir3 instructions
+ * 2) probably really want to decide this after scheduling
+ * (or at least pre-RA sched) so we have a rough idea about
+ * nops, and don't count things that get cp'd away
+ * 3) blob seems to use higher thresholds with a mix of more
+ * SFU instructions. Which partly makes sense, more SFU
+ * instructions probably means you want to get the real
+ * shader started sooner, but that considers where in the
+ * shader the SFU instructions are, which blob doesn't seem
+ * to do.
+ *
+ * This uses more conservative thresholds assuming a more alu
+ * than sfu heavy instruction mix.
+ */
+ if (so->type == MESA_SHADER_FRAGMENT) {
+ nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
+
+ unsigned instruction_count = 0;
+ nir_foreach_block (block, fxn) {
+ instruction_count += exec_list_length(&block->instr_list);
+ }
+
+ if (instruction_count < 50) {
+ ctx->prefetch_limit = 2;
+ } else if (instruction_count < 70) {
+ ctx->prefetch_limit = 3;
+ } else {
+ ctx->prefetch_limit = IR3_MAX_SAMPLER_PREFETCH;
+ }
+ }
+
if (shader_debug_enabled(so->type)) {
fprintf(stdout, "NIR (final form) for %s shader %s:\n",
ir3_shader_stage(so), so->shader->nir->info.name);
for (unsigned i = 0; i < num_components; i++) {
unsigned n = src->reg.base_offset * reg->num_components + i;
compile_assert(ctx, n < arr->length);
- value[i] = ir3_create_array_load(ctx, arr, n, addr, reg->bit_size);
+ value[i] = ir3_create_array_load(ctx, arr, n, addr);
}
return value;
}
}
- if (bit_size < 32) {
+ /* Note: 1-bit bools are stored in 32-bit regs */
+ if (bit_size == 16) {
for (unsigned i = 0; i < ctx->last_dst_n; i++) {
struct ir3_instruction *dst = ctx->last_dst[i];
- dst->regs[0]->flags |= IR3_REG_HALF;
- if (ctx->last_dst[i]->opc == OPC_META_SPLIT)
- dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF;
+ ir3_set_dst_type(dst, true);
+ ir3_fixup_src_type(dst);
+ if (dst->opc == OPC_META_SPLIT) {
+ ir3_set_dst_type(ssa(dst->regs[1]), true);
+ ir3_fixup_src_type(ssa(dst->regs[1]));
+ dst->regs[1]->flags |= IR3_REG_HALF;
+ }
}
}
{
struct ir3_instruction *instr, *immed;
- /* TODO in at least some cases, the backend could probably be
- * made clever enough to propagate IR3_REG_HALF..
- */
instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
- instr->regs[0]->flags |= IR3_REG_HALF;
switch(align){
case 1:
break;
case 2:
/* src *= 2 => src <<= 1: */
- immed = create_immed(block, 1);
- immed->regs[0]->flags |= IR3_REG_HALF;
-
+ immed = create_immed_typed(block, 1, TYPE_S16);
instr = ir3_SHL_B(block, instr, 0, immed, 0);
- instr->regs[0]->flags |= IR3_REG_HALF;
- instr->regs[1]->flags |= IR3_REG_HALF;
break;
case 3:
/* src *= 3: */
- immed = create_immed(block, 3);
- immed->regs[0]->flags |= IR3_REG_HALF;
-
+ immed = create_immed_typed(block, 3, TYPE_S16);
instr = ir3_MULL_U(block, instr, 0, immed, 0);
- instr->regs[0]->flags |= IR3_REG_HALF;
- instr->regs[1]->flags |= IR3_REG_HALF;
break;
case 4:
/* src *= 4 => src <<= 2: */
- immed = create_immed(block, 2);
- immed->regs[0]->flags |= IR3_REG_HALF;
-
+ immed = create_immed_typed(block, 2, TYPE_S16);
instr = ir3_SHL_B(block, instr, 0, immed, 0);
- instr->regs[0]->flags |= IR3_REG_HALF;
- instr->regs[1]->flags |= IR3_REG_HALF;
break;
default:
unreachable("bad align");
return NULL;
}
+ instr->regs[0]->flags |= IR3_REG_HALF;
+
instr = ir3_MOV(block, instr, TYPE_S16);
instr->regs[0]->num = regid(REG_A0, 0);
instr->regs[0]->flags &= ~IR3_REG_SSA;
- instr->regs[0]->flags |= IR3_REG_HALF;
- instr->regs[1]->flags |= IR3_REG_HALF;
return instr;
}
create_addr1(struct ir3_block *block, unsigned const_val)
{
- struct ir3_instruction *immed = create_immed(block, const_val);
+ struct ir3_instruction *immed = create_immed_typed(block, const_val, TYPE_S16);
struct ir3_instruction *instr = ir3_MOV(block, immed, TYPE_S16);
instr->regs[0]->num = regid(REG_A0, 1);
instr->regs[0]->flags &= ~IR3_REG_SSA;
- instr->regs[0]->flags |= IR3_REG_HALF;
- instr->regs[1]->flags |= IR3_REG_HALF;
return instr;
}
arr->length = reg->num_components * MAX2(1, reg->num_array_elems);
compile_assert(ctx, arr->length > 0);
arr->r = reg;
+ arr->half = reg->bit_size <= 16;
+ // HACK one-bit bools still end up as 32b:
+ if (reg->bit_size == 1)
+ arr->half = false;
list_addtail(&arr->node, &ctx->ir->array_list);
}
/* relative (indirect) if address!=NULL */
struct ir3_instruction *
ir3_create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
- struct ir3_instruction *address, unsigned bitsize)
+ struct ir3_instruction *address)
{
struct ir3_block *block = ctx->block;
struct ir3_instruction *mov;
unsigned flags = 0;
mov = ir3_instr_create(block, OPC_MOV);
- if (bitsize < 32) {
+ if (arr->half) {
mov->cat1.src_type = TYPE_U16;
mov->cat1.dst_type = TYPE_U16;
flags |= IR3_REG_HALF;
- arr->half = true;
} else {
mov->cat1.src_type = TYPE_U32;
mov->cat1.dst_type = TYPE_U32;
struct ir3_block *block = ctx->block;
struct ir3_instruction *mov;
struct ir3_register *dst;
+ unsigned flags = 0;
/* if not relative store, don't create an extra mov, since that
* ends up being difficult for cp to remove.
}
mov = ir3_instr_create(block, OPC_MOV);
- mov->cat1.src_type = TYPE_U32;
- mov->cat1.dst_type = TYPE_U32;
+ if (arr->half) {
+ mov->cat1.src_type = TYPE_U16;
+ mov->cat1.dst_type = TYPE_U16;
+ flags |= IR3_REG_HALF;
+ } else {
+ mov->cat1.src_type = TYPE_U32;
+ mov->cat1.dst_type = TYPE_U32;
+ }
mov->barrier_class = IR3_BARRIER_ARRAY_W;
mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+ flags |
COND(address, IR3_REG_RELATIV));
dst->instr = arr->last_write;
dst->size = arr->length;
dst->array.id = arr->id;
dst->array.offset = n;
- ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
+ ir3_reg_create(mov, 0, IR3_REG_SSA | flags)->instr = src;
if (address)
ir3_instr_set_address(mov, address);