freedreno/ir3: fix mismatched flags on split

[mesa.git] / src / freedreno / ir3 / ir3_context.c
diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c

index bdcf816bd120662119b94bf6d2878114a96de1b4..527cd73a66449089f7ca1adefa585ce257a52468 100644 (file)
--- a/src/freedreno/ir3/ir3_context.c
+++ b/src/freedreno/ir3/ir3_context.c
@@ -63,6 +63,8 @@ ir3_context_init(struct ir3_compiler *compiler,
                         _mesa_hash_pointer, _mesa_key_pointer_equal);
         ctx->block_ht = _mesa_hash_table_create(ctx,
                         _mesa_hash_pointer, _mesa_key_pointer_equal);
+       ctx->sel_cond_conversions = _mesa_hash_table_create(ctx,
+                       _mesa_hash_pointer, _mesa_key_pointer_equal);
  
         /* TODO: maybe generate some sort of bitmask of what key
          * lowers vs what shader has (ie. no need to lower
@@ -78,22 +80,30 @@ ir3_context_init(struct ir3_compiler *compiler,
         /* this needs to be the last pass run, so do this here instead of
          * in ir3_optimize_nir():
          */
-       NIR_PASS_V(ctx->s, nir_lower_bool_to_int32);
-       NIR_PASS_V(ctx->s, nir_lower_locals_to_regs);
+       bool progress = false;
+       NIR_PASS(progress, ctx->s, nir_lower_locals_to_regs);
+
+       /* we could need cleanup after lower_locals_to_regs */
+       while (progress) {
+               progress = false;
+               NIR_PASS(progress, ctx->s, nir_opt_algebraic);
+               NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
+       }
  
         /* We want to lower nir_op_imul as late as possible, to catch also
          * those generated by earlier passes (e.g, nir_lower_locals_to_regs).
          * However, we want a final swing of a few passes to have a chance
          * at optimizing the result.
          */
-       bool progress = false;
+       progress = false;
         NIR_PASS(progress, ctx->s, ir3_nir_lower_imul);
-       if (progress) {
-               NIR_PASS_V(ctx->s, nir_opt_algebraic);
-               NIR_PASS_V(ctx->s, nir_opt_copy_prop_vars);
-               NIR_PASS_V(ctx->s, nir_opt_dead_write_vars);
-               NIR_PASS_V(ctx->s, nir_opt_dce);
-               NIR_PASS_V(ctx->s, nir_opt_constant_folding);
+       while (progress) {
+               progress = false;
+               NIR_PASS(progress, ctx->s, nir_opt_algebraic);
+               NIR_PASS(progress, ctx->s, nir_opt_copy_prop_vars);
+               NIR_PASS(progress, ctx->s, nir_opt_dead_write_vars);
+               NIR_PASS(progress, ctx->s, nir_opt_dce);
+               NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
         }
  
         /* Enable the texture pre-fetch feature only a4xx onwards.  But
@@ -104,6 +114,43 @@ ir3_context_init(struct ir3_compiler *compiler,
  
         NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
  
+       /* Super crude heuristic to limit # of tex prefetch in small
+        * shaders.  This completely ignores loops.. but that's really
+        * not the worst of its problems.  (A frag shader that has
+        * loops is probably going to be big enough to not trigger a
+        * lower threshold.)
+        *
+        *   1) probably want to do this in terms of ir3 instructions
+        *   2) probably really want to decide this after scheduling
+        *      (or at least pre-RA sched) so we have a rough idea about
+        *      nops, and don't count things that get cp'd away
+        *   3) blob seems to use higher thresholds with a mix of more
+        *      SFU instructions.  Which partly makes sense, more SFU
+        *      instructions probably means you want to get the real
+        *      shader started sooner, but that considers where in the
+        *      shader the SFU instructions are, which blob doesn't seem
+        *      to do.
+        *
+        * This uses more conservative thresholds assuming a more alu
+        * than sfu heavy instruction mix.
+        */
+       if (so->type == MESA_SHADER_FRAGMENT) {
+               nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
+
+               unsigned instruction_count = 0;
+               nir_foreach_block (block, fxn) {
+                       instruction_count += exec_list_length(&block->instr_list);
+               }
+
+               if (instruction_count < 50) {
+                       ctx->prefetch_limit = 2;
+               } else if (instruction_count < 70) {
+                       ctx->prefetch_limit = 3;
+               } else {
+                       ctx->prefetch_limit = IR3_MAX_SAMPLER_PREFETCH;
+               }
+       }
+
         if (shader_debug_enabled(so->type)) {
                 fprintf(stdout, "NIR (final form) for %s shader %s:\n",
                         ir3_shader_stage(so), so->shader->nir->info.name);
@@ -175,7 +222,7 @@ ir3_get_src(struct ir3_context *ctx, nir_src *src)
                         ralloc_array(ctx, struct ir3_instruction *, num_components);
  
                 if (src->reg.indirect)
-                       addr = ir3_get_addr(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
+                       addr = ir3_get_addr0(ctx, ir3_get_src(ctx, src->reg.indirect)[0],
                                         reg->num_components);
  
                 for (unsigned i = 0; i < num_components; i++) {
@@ -205,12 +252,15 @@ ir3_put_dst(struct ir3_context *ctx, nir_dest *dst)
                 }
         }
  
-       if (bit_size < 32) {
+       /* Note: 1-bit bools are stored in 32-bit regs */
+       if (bit_size == 16) {
                 for (unsigned i = 0; i < ctx->last_dst_n; i++) {
                         struct ir3_instruction *dst = ctx->last_dst[i];
                         dst->regs[0]->flags |= IR3_REG_HALF;
-                       if (ctx->last_dst[i]->opc == OPC_META_FO)
+                       if (dst->opc == OPC_META_SPLIT) {
                                 dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF;
+                               dst->regs[1]->flags |= IR3_REG_HALF;
+                       }
                 }
         }
  
@@ -221,7 +271,7 @@ ir3_put_dst(struct ir3_context *ctx, nir_dest *dst)
                 struct ir3_instruction *addr = NULL;
  
                 if (dst->reg.indirect)
-                       addr = ir3_get_addr(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
+                       addr = ir3_get_addr0(ctx, ir3_get_src(ctx, dst->reg.indirect)[0],
                                         reg->num_components);
  
                 for (unsigned i = 0; i < num_components; i++) {
@@ -257,8 +307,8 @@ ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
  
         unsigned flags = dest_flags(arr[0]);
  
-       collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
-       ir3_reg_create(collect, 0, flags);     /* dst */
+       collect = ir3_instr_create2(block, OPC_META_COLLECT, 1 + arrsz);
+       __ssa_dst(collect)->flags |= flags;
         for (unsigned i = 0; i < arrsz; i++) {
                 struct ir3_instruction *elem = arr[i];
  
@@ -292,7 +342,7 @@ ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
                 }
  
                 compile_assert(ctx, dest_flags(elem) == flags);
-               ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem;
+               __ssa_src(collect, elem, flags);
         }
  
         collect->regs[0]->wrmask = MASK(arrsz);
@@ -301,7 +351,7 @@ ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
  }
  
  /* helper for instructions that produce multiple consecutive scalar
- * outputs which need to have a split/fanout meta instruction inserted
+ * outputs which need to have a split meta instruction inserted
   */
  void
  ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
@@ -314,13 +364,24 @@ ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
                 return;
         }
  
+       if (src->opc == OPC_META_COLLECT) {
+               debug_assert((base + n) < src->regs_count);
+
+               for (int i = 0; i < n; i++) {
+                       dst[i] = ssa(src->regs[i + base + 1]);
+               }
+
+               return;
+       }
+
         unsigned flags = dest_flags(src);
  
         for (int i = 0, j = 0; i < n; i++) {
-               struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
-               ir3_reg_create(split, 0, IR3_REG_SSA | flags);
-               ir3_reg_create(split, 0, IR3_REG_SSA | flags)->instr = src;
-               split->fo.off = i + base;
+               struct ir3_instruction *split =
+                               ir3_instr_create(block, OPC_META_SPLIT);
+               __ssa_dst(split)->flags |= flags;
+               __ssa_src(split, src, flags);
+               split->split.off = i + base;
  
                 if (prev) {
                         split->cp.left = prev;
@@ -358,7 +419,7 @@ ir3_context_error(struct ir3_context *ctx, const char *format, ...)
  }
  
  static struct ir3_instruction *
-create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
+create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align)
  {
         struct ir3_instruction *instr, *immed;
  
@@ -406,35 +467,69 @@ create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
  
         instr = ir3_MOV(block, instr, TYPE_S16);
         instr->regs[0]->num = regid(REG_A0, 0);
+       instr->regs[0]->flags &= ~IR3_REG_SSA;
         instr->regs[0]->flags |= IR3_REG_HALF;
         instr->regs[1]->flags |= IR3_REG_HALF;
  
         return instr;
  }
  
+static struct ir3_instruction *
+create_addr1(struct ir3_block *block, unsigned const_val)
+{
+
+       struct ir3_instruction *immed = create_immed(block, const_val);
+       struct ir3_instruction *instr = ir3_MOV(block, immed, TYPE_S16);
+       instr->regs[0]->num = regid(REG_A0, 1);
+       instr->regs[0]->flags &= ~IR3_REG_SSA;
+       instr->regs[0]->flags |= IR3_REG_HALF;
+       instr->regs[1]->flags |= IR3_REG_HALF;
+       return instr;
+}
+
  /* caches addr values to avoid generating multiple cov/shl/mova
   * sequences for each use of a given NIR level src as address
   */
  struct ir3_instruction *
-ir3_get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align)
+ir3_get_addr0(struct ir3_context *ctx, struct ir3_instruction *src, int align)
  {
         struct ir3_instruction *addr;
         unsigned idx = align - 1;
  
-       compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht));
+       compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr0_ht));
  
-       if (!ctx->addr_ht[idx]) {
-               ctx->addr_ht[idx] = _mesa_hash_table_create(ctx,
+       if (!ctx->addr0_ht[idx]) {
+               ctx->addr0_ht[idx] = _mesa_hash_table_create(ctx,
                                 _mesa_hash_pointer, _mesa_key_pointer_equal);
         } else {
                 struct hash_entry *entry;
-               entry = _mesa_hash_table_search(ctx->addr_ht[idx], src);
+               entry = _mesa_hash_table_search(ctx->addr0_ht[idx], src);
                 if (entry)
                         return entry->data;
         }
  
-       addr = create_addr(ctx->block, src, align);
-       _mesa_hash_table_insert(ctx->addr_ht[idx], src, addr);
+       addr = create_addr0(ctx->block, src, align);
+       _mesa_hash_table_insert(ctx->addr0_ht[idx], src, addr);
+
+       return addr;
+}
+
+/* Similar to ir3_get_addr0, but for a1.x. */
+struct ir3_instruction *
+ir3_get_addr1(struct ir3_context *ctx, unsigned const_val)
+{
+       struct ir3_instruction *addr;
+
+       if (!ctx->addr1_ht) {
+               ctx->addr1_ht = _mesa_hash_table_u64_create(ctx);
+       } else {
+               addr = _mesa_hash_table_u64_search(ctx->addr1_ht, const_val);
+               if (addr)
+                       return addr;
+       }
+
+       addr = create_addr1(ctx->block, const_val);
+       _mesa_hash_table_u64_insert(ctx->addr1_ht, const_val, addr);
  
         return addr;
  }
@@ -451,6 +546,7 @@ ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
  
         /* condition always goes in predicate register: */
         cond->regs[0]->num = regid(REG_P0, 0);
+       cond->regs[0]->flags &= ~IR3_REG_SSA;
  
         return cond;
  }
@@ -480,7 +576,7 @@ ir3_declare_array(struct ir3_context *ctx, nir_register *reg)
  struct ir3_array *
  ir3_get_array(struct ir3_context *ctx, nir_register *reg)
  {
-       list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+       foreach_array (arr, &ctx->ir->array_list) {
                 if (arr->r == reg)
                         return arr;
         }
@@ -499,10 +595,11 @@ ir3_create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
         unsigned flags = 0;
  
         mov = ir3_instr_create(block, OPC_MOV);
-       if (bitsize < 32) {
+       if (bitsize == 16) {
                 mov->cat1.src_type = TYPE_U16;
                 mov->cat1.dst_type = TYPE_U16;
                 flags |= IR3_REG_HALF;
+               arr->half = true;
         } else {
                 mov->cat1.src_type = TYPE_U32;
                 mov->cat1.dst_type = TYPE_U32;
@@ -510,7 +607,7 @@ ir3_create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
  
         mov->barrier_class = IR3_BARRIER_ARRAY_R;
         mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
-       ir3_reg_create(mov, 0, flags);
+       __ssa_dst(mov)->flags |= flags;
         src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
                         COND(address, IR3_REG_RELATIV) | flags);
         src->instr = arr->last_write;