From 87a88f2daabfe14b12d447b3d96b9f8938c5cf03 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Mon, 22 Aug 2016 14:58:28 -0700
Subject: [PATCH] vc4: Fix GPU hangs with >16 varying values.

Fixes glsl-routing in piglit and hangs in glbenchmark 2.0.2.
---
 src/gallium/drivers/vc4/vc4_qir.h      | 12 +++++
 src/gallium/drivers/vc4/vc4_qpu_emit.c | 75 +++++++++++++++++++-------
 2 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 12b081bc6ab..6b48959d257 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -494,6 +494,18 @@ struct vc4_compile {
         uint32_t qpu_inst_size;
         uint32_t num_inputs;
 
+        /**
+         * Number of inputs from num_inputs remaining to be queued to the read
+         * FIFO in the VS/CS.
+         */
+        uint32_t num_inputs_remaining;
+
+        /** Inputs from the last read setup not yet consumed in the VS/CS */
+        uint32_t num_inputs_in_fifo;
+
+        /** Next offset in the VPM to read from in the VS/CS */
+        uint32_t vpm_read_offset;
+
         uint32_t program_id;
         uint32_t variant_id;
 };
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 6a10e1b68de..82499296b5b 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -96,6 +96,60 @@ swap_file(struct qpu_reg *src)
         }
 }
 
+/**
+ * Called before every VPM read; queues a new VPM read setup when needed.
+ *
+ * VPM reads (vertex attribute input) and VPM writes (varyings output) from
+ * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
+ * VPM block. In the VS/CS (unlike in the FS), the block starts out
+ * uninitialized, and you need to emit setup to the block before any VPM
+ * reads/writes.
+ *
+ * VRI has a FIFO in each direction, with each FIFO able to hold four
+ * 32-bit-per-vertex values. VPM reads come through the read FIFO and VPM
+ * writes go through the write FIFO. The read/write setup values from QPU go
+ * through the write FIFO as well, with a sideband signal indicating that
+ * they're setup values. Once a read setup reaches the other side of the
+ * FIFO, the VPM block will start asynchronously reading vertex attributes and
+ * filling the read FIFO -- that way hopefully the QPU doesn't have to block
+ * on reads later.
+ *
+ * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
+ * time, which is 4 vec4s. If more than that is being read (since we support
+ * 8 vec4 vertex attributes), then multiple read setup writes need to be done.
+ *
+ * The existence of the FIFO makes it seem like you should be able to emit
+ * both setups for the 5-8 attribute cases and then do all the attribute
+ * reads. However, once the setup value makes it to the other end of the
+ * write FIFO, it will immediately update the VPM block's setup register.
+ * That updated setup register would be used for read FIFO fills from then on,
+ * breaking whatever remaining VPM values were supposed to be read into the
+ * read FIFO from the previous attribute set.
+ *
+ * As a result, we need to emit the read setup, pull every VPM read value from
+ * that setup, and only then emit the second setup if applicable.
+ */
+static void
+setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
+{
+        if (c->num_inputs_in_fifo) {
+                c->num_inputs_in_fifo--;
+                return;
+        }
+
+        c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);
+
+        queue(block,
+              qpu_load_imm_ui(qpu_vrsetup(),
+                              c->vpm_read_offset |
+                              0x00001a00 |
+                              ((c->num_inputs_in_fifo & 0xf) << 20)));
+        c->num_inputs_remaining -= c->num_inputs_in_fifo;
+        c->vpm_read_offset += c->num_inputs_in_fifo;
+
+        c->num_inputs_in_fifo--;
+}
+
 /**
  * This is used to resolve the fact that we might register-allocate two
  * different operands of an instruction to the same physical register file
@@ -268,6 +322,7 @@ vc4_generate_code_block(struct vc4_compile *c,
                                 assert(src[i].addr <= 47);
                                 break;
                         case QFILE_VPM:
+                                setup_for_vpm_read(c, block);
                                 assert((int)qinst->src[i].index >=
                                        last_vpm_read_index);
                                 (void)last_vpm_read_index;
@@ -484,31 +539,13 @@ void
 vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 {
         struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
-        uint32_t inputs_remaining = c->num_inputs;
-        uint32_t vpm_read_fifo_count = 0;
-        uint32_t vpm_read_offset = 0;
         struct qblock *start_block = list_first_entry(&c->blocks,
                                                       struct qblock, link);
 
         switch (c->stage) {
         case QSTAGE_VERT:
         case QSTAGE_COORD:
-                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
-                 * load up to 16 dwords (4 vec4s) per vertex.
-                 */
-                while (inputs_remaining) {
-                        uint32_t num_entries = MIN2(inputs_remaining, 16);
-                        queue(start_block,
-                              qpu_load_imm_ui(qpu_vrsetup(),
-                                              vpm_read_offset |
-                                              0x00001a00 |
-                                              ((num_entries & 0xf) << 20)));
-                        inputs_remaining -= num_entries;
-                        vpm_read_offset += num_entries;
-                        vpm_read_fifo_count++;
-                }
-                assert(vpm_read_fifo_count <= 4);
-
+                c->num_inputs_remaining = c->num_inputs;
                 queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                 break;
         case QSTAGE_FRAG:
-- 
2.30.2