From: Eric Anholt
Date: Mon, 1 Dec 2014 19:48:20 +0000 (-0800)
Subject: vc4: Pair up QPU instructions when scheduling.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=29c7cf2b2ba850cf467167548d53383e1338fd5c;p=mesa.git

vc4: Pair up QPU instructions when scheduling.

We've got two mostly-independent operations in each QPU instruction, so try
to pack two operations together. This is fairly naive (doesn't track read
and write separately in instructions, doesn't convert ADD-based MOVs into
MUL-based movs, doesn't reorder across uniform loads), but does show a
decent improvement on shader-db-2.

total instructions in shared programs: 59583 -> 57651 (-3.24%)
instructions in affected programs: 47361 -> 45429 (-4.08%)
---

diff --git a/src/gallium/drivers/vc4/vc4_qpu.c b/src/gallium/drivers/vc4/vc4_qpu.c
index 723b3613665..54c79e9d4f1 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.c
+++ b/src/gallium/drivers/vc4/vc4_qpu.c
@@ -192,36 +192,58 @@ qpu_m_alu2(enum qpu_op_mul op,
         return inst;
 }
 
-static uint64_t
-merge_fields(uint64_t merge,
-             uint64_t add, uint64_t mul,
+static bool
+merge_fields(uint64_t *merge,
+             uint64_t a, uint64_t b,
              uint64_t mask, uint64_t ignore)
 {
-        if ((add & mask) == ignore)
-                return (merge & ~mask) | (mul & mask);
-        else if ((mul & mask) == ignore)
-                return (merge & ~mask) | (add & mask);
-        else {
-                assert((add & mask) == (mul & mask));
-                return merge;
+        if ((a & mask) == ignore) {
+                *merge = (*merge & ~mask) | (b & mask);
+        } else if ((b & mask) == ignore) {
+                *merge = (*merge & ~mask) | (a & mask);
+        } else {
+                if ((a & mask) != (b & mask))
+                        return false;
         }
+
+        return true;
 }
 
 uint64_t
-qpu_inst(uint64_t add, uint64_t mul)
+qpu_merge_inst(uint64_t a, uint64_t b)
 {
-        uint64_t merge = ((add & ~QPU_WADDR_MUL_MASK) |
-                          (mul & ~QPU_WADDR_ADD_MASK));
+        uint64_t merge = a | b;
+        bool ok = true;
+
+        if (QPU_GET_FIELD(a, QPU_OP_ADD) != QPU_A_NOP &&
+            QPU_GET_FIELD(b, QPU_OP_ADD) != QPU_A_NOP)
+                return 0;
 
-        merge = merge_fields(merge, add, mul, QPU_SIG_MASK,
-                             QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG));
+        if (QPU_GET_FIELD(a, QPU_OP_MUL) != QPU_M_NOP &&
+            QPU_GET_FIELD(b, QPU_OP_MUL) != QPU_M_NOP)
+                return 0;
 
-        merge = merge_fields(merge, add, mul, QPU_RADDR_A_MASK,
-                             QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A));
-        merge = merge_fields(merge, add, mul, QPU_RADDR_B_MASK,
-                             QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B));
+        ok = ok && merge_fields(&merge, a, b, QPU_SIG_MASK,
+                                QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG));
 
-        return merge;
+        /* Misc fields that have to match exactly. */
+        ok = ok && merge_fields(&merge, a, b, QPU_SF | QPU_WS | QPU_PM,
+                                ~0);
+
+        ok = ok && merge_fields(&merge, a, b, QPU_RADDR_A_MASK,
+                                QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A));
+        ok = ok && merge_fields(&merge, a, b, QPU_RADDR_B_MASK,
+                                QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B));
+
+        ok = ok && merge_fields(&merge, a, b, QPU_WADDR_ADD_MASK,
+                                QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD));
+        ok = ok && merge_fields(&merge, a, b, QPU_WADDR_MUL_MASK,
+                                QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL));
+
+        if (ok)
+                return merge;
+        else
+                return 0;
 }
 
 uint64_t
diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index bf41f72c34b..eb06d1a0720 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -129,7 +129,7 @@ uint64_t qpu_a_alu2(enum qpu_op_add op, struct qpu_reg dst,
                     struct qpu_reg src0, struct qpu_reg src1);
 uint64_t qpu_m_alu2(enum qpu_op_mul op, struct qpu_reg dst,
                     struct qpu_reg src0, struct qpu_reg src1);
-uint64_t qpu_inst(uint64_t add, uint64_t mul);
+uint64_t qpu_merge_inst(uint64_t a, uint64_t b);
 uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val);
 uint64_t qpu_set_sig(uint64_t inst, uint32_t sig);
 uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond);
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index f309034fba7..8aa83741ff5 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -465,7 +465,8 @@ get_instruction_priority(uint64_t inst)
 
 static struct schedule_node *
 choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
-                               struct simple_node *schedule_list)
+                               struct simple_node *schedule_list,
+                               uint64_t prev_inst)
 {
         struct schedule_node *chosen = NULL;
         struct simple_node *node;
@@ -490,6 +491,15 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                 if (pixel_scoreboard_too_soon(scoreboard, inst))
                         continue;
 
+                /* If we're trying to pair with another instruction, check
+                 * that they're compatible.
+                 */
+                if (prev_inst != 0) {
+                        inst = qpu_merge_inst(prev_inst, inst);
+                        if (!inst)
+                                continue;
+                }
+
                 int prio = get_instruction_priority(inst);
 
                 /* Found a valid instruction. If nothing better comes along,
@@ -570,6 +580,23 @@ compute_delay(struct schedule_node *n)
         }
 }
 
+static void
+mark_instruction_scheduled(struct simple_node *schedule_list,
+                           struct schedule_node *node)
+{
+        if (!node)
+                return;
+
+        for (int i = node->child_count - 1; i >= 0; i--) {
+                struct schedule_node *child =
+                        node->children[i];
+
+                child->parent_count--;
+                if (child->parent_count == 0)
+                        insert_at_head(schedule_list, &child->link);
+        }
+}
+
 static void
 schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
 {
@@ -598,7 +625,9 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
         while (!is_empty_list(schedule_list)) {
                 struct schedule_node *chosen =
                         choose_instruction_to_schedule(&scoreboard,
-                                                       schedule_list);
+                                                       schedule_list,
+                                                       0);
+                struct schedule_node *merge = NULL;
 
                 /* If there are no valid instructions to schedule, drop a NOP
                  * in.
@@ -610,12 +639,38 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
                         dump_state(schedule_list);
                         fprintf(stderr, "chose: ");
                         vc4_qpu_disasm(&inst, 1);
-                        fprintf(stderr, "\n\n");
+                        fprintf(stderr, "\n");
                 }
 
-                /* Schedule this instruction onto the QPU list. */
-                if (chosen)
+                /* Schedule this instruction onto the QPU list. Also try to
+                 * find an instruction to pair with it.
+                 */
+                if (chosen) {
                         remove_from_list(&chosen->link);
+
+                        merge = choose_instruction_to_schedule(&scoreboard,
+                                                               schedule_list,
+                                                               inst);
+                        if (merge) {
+                                remove_from_list(&merge->link);
+                                inst = qpu_merge_inst(inst, merge->inst->inst);
+                                assert(inst != 0);
+
+                                if (debug) {
+                                        fprintf(stderr, "merging: ");
+                                        vc4_qpu_disasm(&merge->inst->inst, 1);
+                                        fprintf(stderr, "\n");
+                                        fprintf(stderr, "resulting in: ");
+                                        vc4_qpu_disasm(&inst, 1);
+                                        fprintf(stderr, "\n");
+                                }
+                        }
+                }
+
+                if (debug) {
+                        fprintf(stderr, "\n");
+                }
+
                 qpu_serialize_one_inst(c, inst);
 
                 update_scoreboard_for_chosen(&scoreboard, inst);
@@ -625,18 +680,8 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
                  * be scheduled. Update the children's unblocked time for this
                  * DAG edge as we do so.
                  */
-                if (chosen) {
-                        for (int i = chosen->child_count - 1; i >= 0; i--) {
-                                struct schedule_node *child =
-                                        chosen->children[i];
-
-                                child->parent_count--;
-                                if (child->parent_count == 0) {
-                                        insert_at_head(schedule_list,
-                                                       &child->link);
-                                }
-                        }
-                }
+                mark_instruction_scheduled(schedule_list, chosen);
+                mark_instruction_scheduled(schedule_list, merge);
 
                 scoreboard.tick++;
         }