From c341ab7ffbac822d3d3cbb3d3ae9d2a19ea3cc9a Mon Sep 17 00:00:00 2001 From: Jose Maria Casanova Crespo Date: Tue, 2 Jul 2019 18:31:09 +0200 Subject: [PATCH] v3d: add shader-db stat to count SFU stalls SFU operations have a latency of 2 cycles, so if their result is read by the instruction immediately following an SFU instruction, the GPU stalls for an extra cycle until the result is available. This adds the number of stalls to the shader-db debug mode output, together with the sum of instructions + stalls, to help evaluate scheduling optimizations that avoid generating SFU stalls. v2: Rename v3d_qpu_generates_sfu_stalls to v3d_qpu_instr_is_sfu (Eric) Reviewed-by: Eric Anholt --- src/broadcom/compiler/qpu_schedule.c | 45 ++++++++++++++++++++++++++++ src/broadcom/compiler/v3d_compiler.h | 1 + src/broadcom/compiler/vir.c | 7 +++-- src/broadcom/qpu/qpu_instr.c | 34 +++++++++++++-------- src/broadcom/qpu/qpu_instr.h | 1 + 5 files changed, 74 insertions(+), 14 deletions(-) diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index b8e04f6ea13..370881b00ad 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -440,6 +440,8 @@ struct choose_scoreboard { struct dag *dag; int tick; int last_magic_sfu_write_tick; + int last_stallable_sfu_reg; + int last_stallable_sfu_tick; int last_ldvary_tick; int last_uniforms_reset_tick; int last_thrsw_tick; @@ -531,6 +533,33 @@ pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); } +static bool +qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, + uint32_t waddr) { + + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && + inst->raddr_a == waddr) + return true; + + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && + !inst->sig.small_imm && (inst->raddr_b == waddr)) + return true; + + return false; +} + +static bool +mux_read_stalls(struct choose_scoreboard 
*scoreboard, + const struct v3d_qpu_instr *inst) +{ + return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && + qpu_instruction_uses_rf(inst, + scoreboard->last_stallable_sfu_reg); +} + static int get_instruction_priority(const struct v3d_qpu_instr *inst) { @@ -851,6 +880,16 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, scoreboard->last_magic_sfu_write_tick = scoreboard->tick; } +static void +update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) +{ + if (v3d_qpu_instr_is_sfu(inst)) { + scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr; + scoreboard->last_stallable_sfu_tick = scoreboard->tick; + } +} + static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) @@ -864,6 +903,9 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, if (inst->alu.add.magic_write) { update_scoreboard_for_magic_waddr(scoreboard, inst->alu.add.waddr); + } else { + update_scoreboard_for_sfu_stall_waddr(scoreboard, + inst); } } @@ -1298,6 +1340,8 @@ schedule_instructions(struct v3d_compile *c, fprintf(stderr, "\n"); } } + if (mux_read_stalls(scoreboard, inst)) + c->qpu_inst_stalled_count++; } /* Update the uniform index for the rewritten location -- @@ -1481,6 +1525,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) scoreboard.last_magic_sfu_write_tick = -10; scoreboard.last_uniforms_reset_tick = -10; scoreboard.last_thrsw_tick = -10; + scoreboard.last_stallable_sfu_tick = -10; if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index da32d47a28d..b61119f5615 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -613,6 +613,7 @@ struct v3d_compile { uint64_t *qpu_insts; uint32_t qpu_inst_count; uint32_t qpu_inst_size; + uint32_t qpu_inst_stalled_count; /* For the FS, the 
number of varying inputs not counting the * point/line varyings payload diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 04129fa522e..eed3fc18b12 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -947,7 +947,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, char *shaderdb; int ret = asprintf(&shaderdb, "%s shader: %d inst, %d threads, %d loops, " - "%d uniforms, %d max-temps, %d:%d spills:fills", + "%d uniforms, %d max-temps, %d:%d spills:fills, " + "%d sfu-stalls, %d inst-and-stalls", vir_get_stage_name(c), c->qpu_inst_count, c->threads, @@ -955,7 +956,9 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, c->num_uniforms, vir_get_max_temps(c), c->spills, - c->fills); + c->fills, + c->qpu_inst_stalled_count, + c->qpu_inst_count + c->qpu_inst_stalled_count); if (ret >= 0) { if (V3D_DEBUG & V3D_DEBUG_SHADERDB) fprintf(stderr, "SHADER-DB: %s\n", shaderdb); diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c index 66e53a6accd..09d06b3fa3e 100644 --- a/src/broadcom/qpu/qpu_instr.c +++ b/src/broadcom/qpu/qpu_instr.c @@ -645,19 +645,10 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) { - if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { - switch (inst->alu.add.op) { - case V3D_QPU_A_RECIP: - case V3D_QPU_A_RSQRT: - case V3D_QPU_A_EXP: - case V3D_QPU_A_LOG: - case V3D_QPU_A_SIN: - case V3D_QPU_A_RSQRT2: - return true; - default: - break; - } + if (v3d_qpu_instr_is_sfu(inst)) + return true; + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.magic_write && v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) { return true; @@ -672,6 +663,25 @@ v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) return false; } +bool +v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) +{ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + switch (inst->alu.add.op) { + case V3D_QPU_A_RECIP: + case V3D_QPU_A_RSQRT: + case 
V3D_QPU_A_EXP: + case V3D_QPU_A_LOG: + case V3D_QPU_A_SIN: + case V3D_QPU_A_RSQRT2: + return true; + default: + return false; + } + } + return false; +} + bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) { diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h index 968d0f6fd65..ad2d37b6051 100644 --- a/src/broadcom/qpu/qpu_instr.h +++ b/src/broadcom/qpu/qpu_instr.h @@ -447,6 +447,7 @@ bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_tmu_not_tmuc(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; -- 2.30.2