freedreno/afuc: Fix printing preemptleave on a5xx

[mesa.git] / src / freedreno / ir3 / ir3_delay.c
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c

index 207c8cb91ccf0ddf257d3466af47378e829d5bbb..247ff6ee25064a639cfb5847235686b5fda8e3b7 100644 (file)
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -49,7 +49,6 @@ ignore_dep(struct ir3_instruction *assigner,
  
         if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
                 struct ir3_register *dst = assigner->regs[0];
-               struct ir3_register *src;
  
                 debug_assert(dst->flags & IR3_REG_ARRAY);
  
@@ -69,7 +68,7 @@ ignore_dep(struct ir3_instruction *assigner,
   */
  int
  ir3_delayslots(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n)
+               struct ir3_instruction *consumer, unsigned n, bool soft)
  {
         if (ignore_dep(assigner, consumer, n))
                 return 0;
@@ -82,9 +81,23 @@ ir3_delayslots(struct ir3_instruction *assigner,
         if (is_meta(assigner) || is_meta(consumer))
                 return 0;
  
-       if (writes_addr(assigner))
+       if (writes_addr0(assigner) || writes_addr1(assigner))
                 return 6;
  
+       /* On a6xx, the number of delay slots it takes to get an SFU result
+        * back (ie. using nop's instead of (ss)) is:
+        *
+        *     8 - single warp
+        *     9 - two warps
+        *    10 - four warps
+        *
+        * and so on.  Not quite sure where it tapers out (ie. how many
+        * warps share an SFU unit).  But 10 seems like a reasonable #
+        * to choose:
+        */
+       if (soft && is_sfu(assigner))
+               return 10;
+
         /* handled via sync flags: */
         if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
                 return 0;
@@ -109,7 +122,7 @@ count_instruction(struct ir3_instruction *n)
          * be eliminated later in resolve_jumps().. really should do that
          * earlier so we don't have this constraint.
          */
-       return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR));
+       return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
  }
  
  /**
@@ -126,8 +139,8 @@ count_instruction(struct ir3_instruction *n)
   *    find the worst case (shortest) distance (only possible after
   *    individual blocks are all scheduled)
   */
-unsigned
-ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
+static unsigned
+distance(struct ir3_block *block, struct ir3_instruction *instr,
                 unsigned maxd, bool pred)
  {
         unsigned d = 0;
@@ -162,7 +175,7 @@ ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
                         struct ir3_block *pred = (struct ir3_block *)entry->key;
                         unsigned n;
  
-                       n = ir3_distance(pred, instr, min, pred);
+                       n = distance(pred, instr, min, pred);
  
                         min = MIN2(min, n);
                 }
@@ -184,27 +197,40 @@ delay_calc_srcn(struct ir3_block *block,
         unsigned delay = 0;
  
         if (is_meta(assigner)) {
-               struct ir3_register *src;
-               foreach_src (src, assigner) {
+               foreach_src_n (src, n, assigner) {
                         unsigned d;
  
                         if (!src->instr)
                                 continue;
  
                         d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);
+
+                       /* A (rptN) instruction executes in consecutive cycles so
+                        * its outputs are written in successive cycles.  And
+                        * likewise for its (r)'d (incremented) inputs, they are
+                        * read on successive cycles.
+                        *
+                        * So we need to adjust the delay for (rptN)'s assigners
+                        * and consumers accordingly.
+                        *
+                        * Note that the dst of a (rptN) instruction is implicitly
+                        * (r) (the assigner case), although that is not the case
+                        * for src registers.  There is exactly one case, bary.f,
+                        * which has a vecN (collect) src that is not (r)'d.
+                        */
+                       if ((assigner->opc == OPC_META_SPLIT) && src->instr->repeat) {
+                               /* (rptN) assigner case: */
+                               d -= MIN2(d, src->instr->repeat - assigner->split.off);
+                       } else if ((assigner->opc == OPC_META_COLLECT) && consumer->repeat &&
+                                       (consumer->regs[srcn]->flags & IR3_REG_R)) {
+                               d -= MIN2(d, n);
+                       }
+
                         delay = MAX2(delay, d);
                 }
         } else {
-               if (soft) {
-                       if (is_sfu(assigner)) {
-                               delay = 4;
-                       } else {
-                               delay = ir3_delayslots(assigner, consumer, srcn);
-                       }
-               } else {
-                       delay = ir3_delayslots(assigner, consumer, srcn);
-               }
-               delay -= ir3_distance(block, assigner, delay, pred);
+               delay = ir3_delayslots(assigner, consumer, srcn, soft);
+               delay -= distance(block, assigner, delay, pred);
         }
  
         return delay;
@@ -314,7 +340,6 @@ ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
                 bool soft, bool pred)
  {
         unsigned delay = 0;
-       struct ir3_register *src;
  
         foreach_src_n (src, i, instr) {
                 unsigned d = 0;