From 9bf0bdf7762afef04b923c043242356ccd3de092 Mon Sep 17 00:00:00 2001 From: Jose Maria Casanova Crespo Date: Tue, 9 Jul 2019 19:23:25 +0200 Subject: [PATCH] v3d: Avoid scheduling an instruction that stalls waiting for SFU retval MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit If we detect that a scheduling candidate will stall because it has a register source that is written by the SFU unit in the previous instruction, we reduce its priority so that any non-stalling operation is chosen instead. The latency of SFU operations is defined as 2, so they are scheduled earlier when other candidates have the same priority. Finally, we won't merge an instruction that stalls into a previously chosen one, as the result of the previous one would then have to wait for an extra cycle. Although shader-db results show that instructions are hurt with an increase of 0.35%, the sum of instructions + stalls is reduced by 0.52%, and the total of sfu-stalls is reduced by 63.51%. It also implies a small increase in the max-temps metric because SFU operations are scheduled earlier. total instructions in shared programs: 9102719 -> 9117851 (0.17%) instructions in affected programs: 4324628 -> 4339760 (0.35%) helped: 4162 HURT: 12128 helped stats (abs) min: 1 max: 10 x̄: 1.28 x̃: 1 helped stats (rel) min: 0.09% max: 4.76% x̄: 0.66% x̃: 0.51% HURT stats (abs) min: 1 max: 27 x̄: 1.69 x̃: 1 HURT stats (rel) min: 0.05% max: 7.69% x̄: 0.87% x̃: 0.68% 95% mean confidence interval for instructions value: 0.90 0.96 95% mean confidence interval for instructions %-change: 0.47% 0.50% Instructions are HURT. 
total max-temps in shared programs: 1327728 -> 1327812 (<.01%) max-temps in affected programs: 4730 -> 4814 (1.78%) helped: 61 HURT: 134 helped stats (abs) min: 1 max: 2 x̄: 1.08 x̃: 1 helped stats (rel) min: 2.70% max: 13.33% x̄: 4.89% x̃: 4.17% HURT stats (abs) min: 1 max: 3 x̄: 1.12 x̃: 1 HURT stats (rel) min: 1.54% max: 20.00% x̄: 6.10% x̃: 5.26% 95% mean confidence interval for max-temps value: 0.28 0.58 95% mean confidence interval for max-temps %-change: 1.80% 3.52% Max-temps are HURT. total sfu-stalls in shared programs: 99551 -> 36324 (-63.51%) sfu-stalls in affected programs: 95029 -> 31802 (-66.53%) helped: 25882 HURT: 0 helped stats (abs) min: 1 max: 27 x̄: 2.44 x̃: 2 helped stats (rel) min: 5.26% max: 100.00% x̄: 79.86% x̃: 100.00% 95% mean confidence interval for sfu-stalls value: -2.47 -2.42 95% mean confidence interval for sfu-stalls %-change: -80.18% -79.54% Sfu-stalls are helped. total inst-and-stalls in shared programs: 9202270 -> 9154175 (-0.52%) inst-and-stalls in affected programs: 5618516 -> 5570421 (-0.86%) helped: 22728 HURT: 855 helped stats (abs) min: 1 max: 31 x̄: 2.16 x̃: 1 helped stats (rel) min: 0.07% max: 16.67% x̄: 1.14% x̃: 0.92% HURT stats (abs) min: 1 max: 5 x̄: 1.25 x̃: 1 HURT stats (rel) min: 0.12% max: 5.26% x̄: 1.24% x̃: 0.86% 95% mean confidence interval for inst-and-stalls value: -2.07 -2.01 95% mean confidence interval for inst-and-stalls %-change: -1.07% -1.05% Inst-and-stalls are helped. 
v2: Rename v3d_qpu_generates_sfu_stalls to v3d_qpu_instr_is_sfu (Eric) Reviewed-by: Eric Anholt --- src/broadcom/compiler/qpu_schedule.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index 370881b00ad..c15218e267e 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -560,6 +560,11 @@ mux_read_stalls(struct choose_scoreboard *scoreboard, scoreboard->last_stallable_sfu_reg); } +/* We define a max schedule priority to allow negative priorities as a result of + * subtracting this max when an instruction stalls. So instructions that + * stall have lower priority than regular instructions. */ +#define MAX_SCHEDULE_PRIORITY 16 + static int get_instruction_priority(const struct v3d_qpu_instr *inst) { @@ -578,10 +583,6 @@ get_instruction_priority(const struct v3d_qpu_instr *inst) return next_score; next_score++; - /* XXX perf: We should schedule SFU ALU ops so that the reader is 2 - * instructions after the producer if possible, not just 1. - */ - /* Default score for things that aren't otherwise special. */ baseline_score = next_score; next_score++; @@ -591,6 +592,9 @@ get_instruction_priority(const struct v3d_qpu_instr *inst) return next_score; next_score++; + /* We should increase the maximum if we assert here */ + assert(next_score < MAX_SCHEDULE_PRIORITY); + return baseline_score; } @@ -845,6 +849,18 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo, int prio = get_instruction_priority(inst); + if (mux_read_stalls(scoreboard, inst)) { + /* Don't merge an instruction that stalls */ + if (prev_inst) + continue; + else { + /* Any instruction that doesn't stall will have + * higher scheduling priority */ + prio -= MAX_SCHEDULE_PRIORITY; + assert(prio < 0); + } + } + /* Found a valid instruction. If nothing better comes along, * this one works. 
*/ @@ -1005,6 +1021,9 @@ instruction_latency(struct schedule_node *before, struct schedule_node *after) after_inst)); } + if (v3d_qpu_instr_is_sfu(before_inst)) + return 2; + return latency; } -- 2.30.2