From: Eric Anholt <eric@anholt.net>
Date: Mon, 1 Dec 2014 19:48:20 +0000 (-0800)
Subject: vc4: Pair up QPU instructions when scheduling.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=29c7cf2b2ba850cf467167548d53383e1338fd5c;p=mesa.git

vc4: Pair up QPU instructions when scheduling.

We've got two mostly-independent operations in each QPU instruction, so
try to pack two operations together.  This is fairly naive (doesn't track
reads and writes separately in instructions, doesn't convert ADD-based MOVs
into MUL-based MOVs, doesn't reorder across uniform loads), but does show
a decent improvement on shader-db-2.

total instructions in shared programs: 59583 -> 57651 (-3.24%)
instructions in affected programs:     47361 -> 45429 (-4.08%)
---

diff --git a/src/gallium/drivers/vc4/vc4_qpu.c b/src/gallium/drivers/vc4/vc4_qpu.c
index 723b3613665..54c79e9d4f1 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.c
+++ b/src/gallium/drivers/vc4/vc4_qpu.c
@@ -192,36 +192,58 @@ qpu_m_alu2(enum qpu_op_mul op,
         return inst;
 }
 
-static uint64_t
-merge_fields(uint64_t merge,
-             uint64_t add, uint64_t mul,
+static bool /* true if the field selected by mask could be merged, false on conflict */
+merge_fields(uint64_t *merge, /* in/out: combined instruction being built */
+             uint64_t a, uint64_t b, /* the two instructions being paired */
              uint64_t mask, uint64_t ignore)
 {
-        if ((add & mask) == ignore)
-                return (merge & ~mask) | (mul & mask);
-        else if ((mul & mask) == ignore)
-                return (merge & ~mask) | (add & mask);
-        else {
-                assert((add & mask) == (mul & mask));
-                return merge;
+        if ((a & mask) == ignore) { /* a holds the "ignore" value here: take b's bits */
+                *merge = (*merge & ~mask) | (b & mask);
+        } else if ((b & mask) == ignore) { /* b holds the "ignore" value here: take a's bits */
+                *merge = (*merge & ~mask) | (a & mask);
+        } else {
+                if ((a & mask) != (b & mask)) /* both set the field: only OK if identical */
+                        return false; /* *merge is not written on this path */
         }
+
+        return true;
 }
 
 uint64_t
-qpu_inst(uint64_t add, uint64_t mul)
+qpu_merge_inst(uint64_t a, uint64_t b) /* returns a and b combined into one instruction, or 0 if they cannot be paired */
 {
-        uint64_t merge = ((add & ~QPU_WADDR_MUL_MASK) |
-                          (mul & ~QPU_WADDR_ADD_MASK));
+        uint64_t merge = a | b; /* start from the bitwise union; merge_fields() rewrites or rejects shared fields below */
+        bool ok = true; /* cleared by the first field that conflicts; later merges are then short-circuited */
+
+        if (QPU_GET_FIELD(a, QPU_OP_ADD) != QPU_A_NOP && /* both have a non-NOP ADD op: cannot pair */
+            QPU_GET_FIELD(b, QPU_OP_ADD) != QPU_A_NOP)
+                return 0;
 
-        merge = merge_fields(merge, add, mul, QPU_SIG_MASK,
-                             QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG));
+        if (QPU_GET_FIELD(a, QPU_OP_MUL) != QPU_M_NOP && /* both have a non-NOP MUL op: cannot pair */
+            QPU_GET_FIELD(b, QPU_OP_MUL) != QPU_M_NOP)
+                return 0;
 
-        merge = merge_fields(merge, add, mul, QPU_RADDR_A_MASK,
-                             QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A));
-        merge = merge_fields(merge, add, mul, QPU_RADDR_B_MASK,
-                             QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B));
+        ok = ok && merge_fields(&merge, a, b, QPU_SIG_MASK, /* QPU_SIG_NONE on one side yields to the other side's signal */
+                                QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG));
 
-        return merge;
+        /* Misc fields that have to match exactly. */
+        ok = ok && merge_fields(&merge, a, b, QPU_SF | QPU_WS | QPU_PM,
+                                ~0); /* NOTE(review): ~0 presumably never equals a masked field, forcing the equality check -- confirm against the mask definitions */
+
+        ok = ok && merge_fields(&merge, a, b, QPU_RADDR_A_MASK, /* read addresses: QPU_R_NOP on one side yields to the other */
+                                QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A));
+        ok = ok && merge_fields(&merge, a, b, QPU_RADDR_B_MASK,
+                                QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B));
+
+        ok = ok && merge_fields(&merge, a, b, QPU_WADDR_ADD_MASK, /* write addresses: QPU_W_NOP on one side yields to the other */
+                                QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD));
+        ok = ok && merge_fields(&merge, a, b, QPU_WADDR_MUL_MASK,
+                                QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL));
+
+        if (ok)
+                return merge;
+        else
+                return 0; /* 0 is the "not mergeable" sentinel that callers test for */
 }
 
 uint64_t
diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index bf41f72c34b..eb06d1a0720 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -129,7 +129,7 @@ uint64_t qpu_a_alu2(enum qpu_op_add op, struct qpu_reg dst,
                     struct qpu_reg src0, struct qpu_reg src1);
 uint64_t qpu_m_alu2(enum qpu_op_mul op, struct qpu_reg dst,
                     struct qpu_reg src0, struct qpu_reg src1);
-uint64_t qpu_inst(uint64_t add, uint64_t mul);
+uint64_t qpu_merge_inst(uint64_t a, uint64_t b);
 uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val);
 uint64_t qpu_set_sig(uint64_t inst, uint32_t sig);
 uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond);
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index f309034fba7..8aa83741ff5 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -465,7 +465,8 @@ get_instruction_priority(uint64_t inst)
 
 static struct schedule_node *
 choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
-                               struct simple_node *schedule_list)
+                               struct simple_node *schedule_list,
+                               uint64_t prev_inst)
 {
         struct schedule_node *chosen = NULL;
         struct simple_node *node;
@@ -490,6 +491,15 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                 if (pixel_scoreboard_too_soon(scoreboard, inst))
                         continue;
 
+                /* If we're trying to pair with another instruction, check
+                 * that they're compatible.
+                 */
+                if (prev_inst != 0) {
+                        inst = qpu_merge_inst(prev_inst, inst);
+                        if (!inst)
+                                continue;
+                }
+
                 int prio = get_instruction_priority(inst);
 
                 /* Found a valid instruction.  If nothing better comes along,
@@ -570,6 +580,23 @@ compute_delay(struct schedule_node *n)
         }
 }
 
+static void /* drops node as a parent of each of its children, queueing children that have no parents left */
+mark_instruction_scheduled(struct simple_node *schedule_list,
+                           struct schedule_node *node)
+{
+        if (!node) /* NULL is accepted so callers can pass an unset chosen/merge node */
+                return;
+
+        for (int i = node->child_count - 1; i >= 0; i--) { /* walks the children array from last to first */
+                struct schedule_node *child =
+                        node->children[i];
+
+                child->parent_count--; /* one fewer parent outstanding for this child */
+                if (child->parent_count == 0) /* no parents remain: put the child on schedule_list */
+                        insert_at_head(schedule_list, &child->link);
+        }
+}
+
 static void
 schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
 {
@@ -598,7 +625,9 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
         while (!is_empty_list(schedule_list)) {
                 struct schedule_node *chosen =
                         choose_instruction_to_schedule(&scoreboard,
-                                                       schedule_list);
+                                                       schedule_list,
+                                                       0);
+                struct schedule_node *merge = NULL;
 
                 /* If there are no valid instructions to schedule, drop a NOP
                  * in.
@@ -610,12 +639,38 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
                         dump_state(schedule_list);
                         fprintf(stderr, "chose: ");
                         vc4_qpu_disasm(&inst, 1);
-                        fprintf(stderr, "\n\n");
+                        fprintf(stderr, "\n");
                 }
 
-                /* Schedule this instruction onto the QPU list. */
-                if (chosen)
+                /* Schedule this instruction onto the QPU list. Also try to
+                 * find an instruction to pair with it.
+                 */
+                if (chosen) {
                         remove_from_list(&chosen->link);
+
+                        merge = choose_instruction_to_schedule(&scoreboard,
+                                                               schedule_list,
+                                                               inst);
+                        if (merge) {
+                                remove_from_list(&merge->link);
+                                inst = qpu_merge_inst(inst, merge->inst->inst);
+                                assert(inst != 0);
+
+                                if (debug) {
+                                        fprintf(stderr, "merging: ");
+                                        vc4_qpu_disasm(&merge->inst->inst, 1);
+                                        fprintf(stderr, "\n");
+                                        fprintf(stderr, "resulting in: ");
+                                        vc4_qpu_disasm(&inst, 1);
+                                        fprintf(stderr, "\n");
+                                }
+                        }
+                }
+
+                if (debug) {
+                        fprintf(stderr, "\n");
+                }
+
                 qpu_serialize_one_inst(c, inst);
 
                 update_scoreboard_for_chosen(&scoreboard, inst);
@@ -625,18 +680,8 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
                  * be scheduled.  Update the children's unblocked time for this
                  * DAG edge as we do so.
                  */
-                if (chosen) {
-                        for (int i = chosen->child_count - 1; i >= 0; i--) {
-                                struct schedule_node *child =
-                                        chosen->children[i];
-
-                                child->parent_count--;
-                                if (child->parent_count == 0) {
-                                        insert_at_head(schedule_list,
-                                                       &child->link);
-                                }
-                        }
-                }
+                mark_instruction_scheduled(schedule_list, chosen);
+                mark_instruction_scheduled(schedule_list, merge);
 
                 scoreboard.tick++;
         }