From 5278c64de58b545dfe3272b005b331fd5b71da68 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Fri, 18 Dec 2015 11:30:30 -0800 Subject: [PATCH] vc4: Fix latency handling for QPU texture scheduling. There's only high latency between a complete texture fetch setup and collecting its result, not between each step of setting up the texture fetch request. --- src/gallium/drivers/vc4/vc4_qpu_schedule.c | 82 +++++++++++++--------- 1 file changed, 50 insertions(+), 32 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c index 76cad2e03fe..09164b74932 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -50,7 +50,7 @@ struct schedule_node { uint32_t child_array_size; uint32_t parent_count; - /* Longest cycles + n->latency of any parent of this node. */ + /* Longest cycles + instruction_latency() of any parent of this node. */ uint32_t unblocked_time; /** @@ -624,6 +624,46 @@ dump_state(struct list_head *schedule_list) } } +static uint32_t waddr_latency(uint32_t waddr, uint64_t after) +{ + if (waddr < 32) + return 2; + + /* Apply some huge latency between texture fetch requests and getting + * their results back. + */ + if (waddr == QPU_W_TMU0_S) { + if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0) + return 100; + } + if (waddr == QPU_W_TMU1_S) { + if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1) + return 100; + } + + switch(waddr) { + case QPU_W_SFU_RECIP: + case QPU_W_SFU_RECIPSQRT: + case QPU_W_SFU_EXP: + case QPU_W_SFU_LOG: + return 3; + default: + return 1; + } +} + +static uint32_t +instruction_latency(struct schedule_node *before, struct schedule_node *after) +{ + uint64_t before_inst = before->inst->inst; + uint64_t after_inst = after->inst->inst; + + return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD), + after_inst), + waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL), + after_inst)); +} + /** Recursive computation of the delay member of a node. */ static void compute_delay(struct schedule_node *n) @@ -635,7 +675,8 @@ compute_delay(struct schedule_node *n) if (!n->children[i].node->delay) compute_delay(n->children[i].node); n->delay = MAX2(n->delay, - n->children[i].node->delay + n->latency); + n->children[i].node->delay + + instruction_latency(n, n->children[i].node)); } } } @@ -664,9 +705,14 @@ mark_instruction_scheduled(struct list_head *schedule_list, * immediately after (or paired with!) the thing reading the * destination. */ - int latency_from_previous = war_only ? 0 : node->latency; + uint32_t latency = 0; + if (!war_only) { + latency = instruction_latency(node, + node->children[i].node); + } + child->unblocked_time = MAX2(child->unblocked_time, - time + latency_from_previous); + time + latency); child->parent_count--; if (child->parent_count == 0) list_add(&child->link, schedule_list); @@ -799,33 +845,6 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list) return time; } -static uint32_t waddr_latency(uint32_t waddr) -{ - if (waddr < 32) - return 2; - - /* Some huge number, really. */ - if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B) - return 100; - - switch(waddr) { - case QPU_W_SFU_RECIP: - case QPU_W_SFU_RECIPSQRT: - case QPU_W_SFU_EXP: - case QPU_W_SFU_LOG: - return 3; - default: - return 1; - } -} - -static uint32_t -instruction_latency(uint64_t inst) -{ - return MAX2(waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD)), - waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL))); -} - uint32_t qpu_schedule_instructions(struct vc4_compile *c) { @@ -852,7 +871,6 @@ qpu_schedule_instructions(struct vc4_compile *c) struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node); n->inst = inst; - n->latency = instruction_latency(inst->inst); if (reads_uniform(inst->inst)) { n->uniform = next_uniform++; -- 2.30.2