freedreno/ir3: update SFU delay
authorRob Clark <robdclark@chromium.org>
Tue, 25 Feb 2020 18:44:26 +0000 (10:44 -0800)
committerMarge Bot <eric+marge@anholt.net>
Fri, 28 Feb 2020 16:53:41 +0000 (16:53 +0000)
1) empirically, 10 seems like a more accurate # than 4
2) push "soft" delay handling into ir3_delayslots(), as
   we should also be using it to calculate the costs
   that the schedulers use

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3989>

src/freedreno/ir3/ir3.h
src/freedreno/ir3/ir3_delay.c
src/freedreno/ir3/ir3_depth.c
src/freedreno/ir3/ir3_postsched.c

index 21fd8c602b98561839a1b46be4a323835d4b91c9..b66d8e2d6fd14897e2d05f9d0a352b49740e51ac 100644 (file)
@@ -1157,7 +1157,7 @@ void ir3_print_instr(struct ir3_instruction *instr);
 
 /* delay calculation: */
 int ir3_delayslots(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n);
+               struct ir3_instruction *consumer, unsigned n, bool soft);
 unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
                bool soft, bool pred);
 void ir3_remove_nops(struct ir3 *ir);
index 0b796a4183a9b2f945a16b950d3cf676edd7ee2b..5839128a4c63ef4b838d77398a29d510fe89d589 100644 (file)
@@ -69,7 +69,7 @@ ignore_dep(struct ir3_instruction *assigner,
  */
 int
 ir3_delayslots(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n)
+               struct ir3_instruction *consumer, unsigned n, bool soft)
 {
        if (ignore_dep(assigner, consumer, n))
                return 0;
@@ -85,6 +85,20 @@ ir3_delayslots(struct ir3_instruction *assigner,
        if (writes_addr(assigner))
                return 6;
 
+       /* On a6xx, the number of delay slots it takes to get a SFU result
+        * back (ie. using nop's instead of (ss)) is:
+        *
+        *     8 - single warp
+        *     9 - two warps
+        *    10 - four warps
+        *
+        * and so on.  Not quite sure where it tapers out (ie. how many
+        * warps share an SFU unit).  But 10 seems like a reasonable #
+        * to choose:
+        */
+       if (soft && is_sfu(assigner))
+               return 10;
+
        /* handled via sync flags: */
        if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
                return 0;
@@ -195,15 +209,7 @@ delay_calc_srcn(struct ir3_block *block,
                        delay = MAX2(delay, d);
                }
        } else {
-               if (soft) {
-                       if (is_sfu(assigner)) {
-                               delay = 4;
-                       } else {
-                               delay = ir3_delayslots(assigner, consumer, srcn);
-                       }
-               } else {
-                       delay = ir3_delayslots(assigner, consumer, srcn);
-               }
+               delay = ir3_delayslots(assigner, consumer, srcn, soft);
                delay -= distance(block, assigner, delay, pred);
        }
 
index 135d4365d2ea5b3123705480e65ff093cf32a675..6bb946871e5b49d47d6059f12b3ca660a4db034a 100644 (file)
@@ -89,7 +89,7 @@ ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep)
                if (i == 0)
                        continue;
 
-               sd = ir3_delayslots(src, instr, i) + src->depth;
+               sd = ir3_delayslots(src, instr, i, true) + src->depth;
                sd += boost;
 
                instr->depth = MAX2(instr->depth, sd);
index 4290e8822499544057408090a5d65183935a1cc2..47a8e52fdeb884a5c657a7f4621bd5360aaee24b 100644 (file)
@@ -380,7 +380,7 @@ calculate_deps(struct ir3_postsched_deps_state *state,
 
                                struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
                                if (dep && (state->direction == F)) {
-                                       unsigned d = ir3_delayslots(dep->instr, node->instr, i);
+                                       unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
                                        node->delay = MAX2(node->delay, d);
                                }
                        }