*/
struct ir3_postsched_ctx {
- struct ir3_context *ctx;
+ struct ir3 *ir;
+
+ struct ir3_shader_variant *v;
void *mem_ctx;
struct ir3_block *block; /* the current block */
struct dag *dag;
struct list_head unscheduled_list; /* unscheduled instructions */
- struct ir3_instruction *scheduled; /* last scheduled instr */
- struct ir3_instruction *pred; /* current p0.x user, if any */
- bool error;
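+
+ /* Rough countdown of instructions since the last SFU / tex was
+ * scheduled, used to estimate how costly an (ss) or (sy) sync
+ * would be right now (see would_sync()):
+ */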
+ int sfu_delay;
+ int tex_delay;
};
struct ir3_postsched_node {
*/
list_delinit(&instr->node);
- if (writes_pred(instr)) {
- ctx->pred = instr;
- }
-
di(instr, "schedule");
list_addtail(&instr->node, &instr->block->instr_list);
- ctx->scheduled = instr;
struct ir3_postsched_node *n = instr->data;
dag_prune_head(ctx->dag, &n->dag);
+
+ if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
+ return;
+
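+ /* Update the sync-cost countdowns: scheduling an SFU (or tex)
+ * producer restarts its countdown, scheduling a consumer that
+ * forces the sync anyway clears it, and anything else brings us
+ * one instruction closer to a cheap sync. The 8/10 starting
+ * values are heuristic, not exact hw latencies.
+ */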
+ if (is_sfu(instr)) {
+ ctx->sfu_delay = 8;
+ } else if (check_src_cond(instr, is_sfu)) {
+ ctx->sfu_delay = 0;
+ } else if (ctx->sfu_delay > 0) {
+ ctx->sfu_delay--;
+ }
+
+ if (is_tex_or_prefetch(instr)) {
+ ctx->tex_delay = 10;
+ } else if (check_src_cond(instr, is_tex_or_prefetch)) {
+ ctx->tex_delay = 0;
+ } else if (ctx->tex_delay > 0) {
+ ctx->tex_delay--;
+ }
}
static void
}
}
+/* Determine if this is an instruction that we'd prefer not to schedule
+ * yet, in order to avoid an (ss) or (sy) sync. This is limited by the
+ * sfu_delay/tex_delay counters, ie. the more instructions it has been
+ * since the last SFU/tex, the less costly a sync would be.
+ */
+static bool
+would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
+{
+ if (ctx->sfu_delay) {
+ if (check_src_cond(instr, is_sfu))
+ return true;
+ }
+
+ if (ctx->tex_delay) {
+ if (check_src_cond(instr, is_tex_or_prefetch))
+ return true;
+ }
+
+ return false;
+}
+
/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
return chosen->instr;
}
- /* First try to find a ready leader w/ soft delay (ie. including extra
+ /*
+ * Sometimes it is better to take a nop rather than scheduling an
+ * instruction that would require an (ss) shortly after another
+ * SFU: ie. if the last SFU was just one or two instructions ago,
+ * and we could choose between taking a nop and then scheduling
+ * something else, vs scheduling the immediately available
+ * instruction that would require an (ss), we are better off with
+ * the nop.
+ */
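+ /* Scan with an increasing nop budget: first something ready now
+ * that doesn't sync, then things that would be ready after 1..3
+ * nops, before falling back to the normal heuristics below:
+ */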
+ for (unsigned delay = 0; delay < 4; delay++) {
+ foreach_sched_node (n, &ctx->dag->heads) {
+ if (would_sync(ctx, n->instr))
+ continue;
+
+ unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
+
+ if (d > delay)
+ continue;
+
+ if (!chosen || (chosen->max_delay < n->max_delay))
+ chosen = n;
+ }
+
+ if (chosen) {
+ di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
+ return chosen->instr;
+ }
+ }
+
+ /* Next try to find a ready leader w/ soft delay (ie. including extra
* delay for things like tex fetch which can be synchronized w/ sync
* bit (but we probably do want to schedule some other instructions
* while we wait)
}
struct ir3_postsched_deps_state {
- struct ir3_context *ctx;
+ struct ir3_postsched_ctx *ctx;
enum { F, R } direction;
calculate_deps(struct ir3_postsched_deps_state *state,
struct ir3_postsched_node *node)
{
- static const struct ir3_register half_reg = { .flags = IR3_REG_HALF };
- struct ir3_register *reg;
int b;
/* Add dependencies on instructions that previously (or next,
* in the reverse direction) wrote any of our src registers:
*/
foreach_src_n (reg, i, node->instr) {
- /* NOTE: relative access for a src can be either const or gpr: */
- if (reg->flags & IR3_REG_RELATIV) {
- /* also reads a0.x: */
- add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
- }
-
if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
continue;
}
}
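+ /* Relative (address register indexed) srcs are handled via an
+ * explicit dependency on the address writer instruction, replacing
+ * the implicit a0.x reg dep removed above:
+ */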
+ if (node->instr->address) {
+ add_reg_dep(state, node, node->instr->address->regs[0],
+ node->instr->address->regs[0]->num,
+ false);
+ }
+
if (dest_regs(node->instr) == 0)
return;
/* And then after we update the state for what this instruction
* wrote:
*/
- reg = node->instr->regs[0];
+ struct ir3_register *reg = node->instr->regs[0];
if (reg->flags & IR3_REG_RELATIV) {
/* mark the entire array as written: */
struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
for (unsigned i = 0; i < arr->length; i++) {
add_reg_dep(state, node, reg, arr->reg + i, true);
}
-
- /* also reads a0.x: */
- add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
} else {
foreach_bit (b, reg->wrmask) {
add_reg_dep(state, node, reg, reg->num + b, true);
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
struct ir3_postsched_deps_state state = {
- .ctx = ctx->ctx,
+ .ctx = ctx,
.direction = F,
- .merged = ctx->ctx->compiler->gpu_id >= 600,
+ .merged = ctx->v->mergedregs,
};
foreach_instr (instr, &ctx->unscheduled_list) {
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
struct ir3_postsched_deps_state state = {
- .ctx = ctx->ctx,
+ .ctx = ctx,
.direction = R,
- .merged = ctx->ctx->compiler->gpu_id >= 600,
+ .merged = ctx->v->mergedregs,
};
foreach_instr_rev (instr, &ctx->unscheduled_list) {
calculate_forward_deps(ctx);
calculate_reverse_deps(ctx);
+ /*
+ * To prevent expensive texture fetches, etc, from being moved
+ * ahead of kills, track the kills we've seen so far, so we can
+ * add an extra dependency on them for tex/mem instructions.
+ */
+ struct util_dynarray kills;
+ util_dynarray_init(&kills, ctx->mem_ctx);
+
/*
* Normal srcs won't be in SSA at this point, those are dealt with in
* calculate_forward_deps() and calculate_reverse_deps(). But we still
*/
foreach_instr (instr, &ctx->unscheduled_list) {
struct ir3_postsched_node *n = instr->data;
- struct ir3_instruction *src;
foreach_ssa_src_n (src, i, instr) {
if (src->block != instr->block)
dag_add_edge(&sn->dag, &n->dag, NULL);
}
+
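+ /* Record kills, and serialize later tex/mem instructions behind
+ * all kills seen so far:
+ */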
+ if (is_kill(instr)) {
+ util_dynarray_append(&kills, struct ir3_instruction *, instr);
+ } else if (is_tex(instr) || is_mem(instr)) {
+ util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
+ struct ir3_instruction *kill = *instrp;
+ struct ir3_postsched_node *kn = kill->data;
+ dag_add_edge(&kn->dag, &n->dag, NULL);
+ }
+ }
}
// TODO do we want to do this after reverse-dependencies?
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
ctx->block = block;
- ctx->scheduled = NULL;
- ctx->pred = NULL;
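+ /* sync-cost counters don't carry across blocks: */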
+ ctx->tex_delay = 0;
+ ctx->sfu_delay = 0;
/* move all instructions to the unscheduled list, and
* empty the block's instruction list (to which we will
foreach_instr_safe (instr, &ctx->unscheduled_list) {
switch (instr->opc) {
case OPC_NOP:
- case OPC_BR:
+ case OPC_B:
case OPC_JUMP:
list_delinit(&instr->node);
break;
schedule(ctx, instr);
while (!list_is_empty(&ctx->unscheduled_list)) {
- struct ir3_instruction *instr;
-
- instr = choose_instr(ctx);
-
- /* this shouldn't happen: */
- if (!instr) {
- ctx->error = true;
- break;
- }
+ struct ir3_instruction *instr = choose_instr(ctx);
unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
d("delay=%u", delay);
{
foreach_block (block, &ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
- struct ir3_register *reg;
foreach_src (reg, instr) {
if (!reg->instr)
}
for (unsigned i = 0; i < instr->deps_count; i++) {
- if (is_self_mov(instr->deps[i])) {
+ if (instr->deps[i] && is_self_mov(instr->deps[i])) {
list_delinit(&instr->deps[i]->node);
instr->deps[i] = instr->deps[i]->regs[1]->instr;
}
}
}
-int
-ir3_postsched(struct ir3_context *cctx)
+bool
+ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
struct ir3_postsched_ctx ctx = {
- .ctx = cctx,
+ .ir = ir,
+ .v = v,
};
- ir3_remove_nops(cctx->ir);
- cleanup_self_movs(cctx->ir);
+ ir3_remove_nops(ir);
+ cleanup_self_movs(ir);
- foreach_block (block, &cctx->ir->block_list) {
+ foreach_block (block, &ir->block_list) {
sched_block(&ctx, block);
}
- if (ctx.error)
- return -1;
-
- return 0;
+ return true;
}