From 755037173d19b65777a97f55455c1f64bf618264 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 10 Nov 2016 15:10:15 -0800 Subject: [PATCH] vc4: Add support for register allocation for threaded shaders. We have two major requirements: Make sure that only the bottom half of the physical reg space is used, and make sure that none of our values are live in an accumulator across a switch. --- src/gallium/drivers/vc4/vc4_context.h | 9 +- src/gallium/drivers/vc4/vc4_qir.h | 7 ++ .../drivers/vc4/vc4_register_allocate.c | 89 +++++++++++++++---- 3 files changed, 85 insertions(+), 20 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index aebbd81d2bf..3a8ae827b85 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -318,11 +318,12 @@ struct vc4_context { uint64_t next_compiled_program_id; struct ra_regs *regs; - unsigned int reg_class_any; - unsigned int reg_class_a_or_b_or_acc; + unsigned int reg_class_any[2]; + unsigned int reg_class_a_or_b[2]; + unsigned int reg_class_a_or_b_or_acc[2]; unsigned int reg_class_r0_r3; - unsigned int reg_class_r4_or_a; - unsigned int reg_class_a; + unsigned int reg_class_r4_or_a[2]; + unsigned int reg_class_a[2]; uint8_t prim_mode; diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 7f321c46868..03ac1f50128 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -533,6 +533,13 @@ struct vc4_compile { uint32_t program_id; uint32_t variant_id; + + /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH + * is used to hide texturing latency at the cost of limiting ourselves + * to the bottom half of physical reg space. 
+ */ + bool fs_threaded; + bool failed; }; diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index ad1bc218c08..d6bf71c9724 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -115,22 +115,33 @@ vc4_alloc_reg_set(struct vc4_context *vc4) vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true); - vc4->reg_class_any = ra_alloc_reg_class(vc4->regs); - vc4->reg_class_a_or_b_or_acc = ra_alloc_reg_class(vc4->regs); - vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs); - vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); + /* The physical regfiles split us into two classes, with [0] being the + * whole space and [1] being the bottom half (for threaded fragment + * shaders). + */ + for (int i = 0; i < 2; i++) { + vc4->reg_class_any[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a_or_b[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a_or_b_or_acc[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_r4_or_a[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a[i] = ra_alloc_reg_class(vc4->regs); + } vc4->reg_class_r0_r3 = ra_alloc_reg_class(vc4->regs); /* r0-r3 */ for (uint32_t i = ACC_INDEX; i < ACC_INDEX + 4; i++) { ra_class_add_reg(vc4->regs, vc4->reg_class_r0_r3, i); - ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i); } /* R4 gets a special class because it can't be written as a general * purpose register. (it's TMU_NOSWAP as a write address). 
*/ - ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, ACC_INDEX + 4); + for (int i = 0; i < 2; i++) { + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[i], + ACC_INDEX + 4); + } /* A/B */ for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i ++) { @@ -140,13 +151,28 @@ vc4_alloc_reg_set(struct vc4_context *vc4) if (vc4_regs[i].addr == 31) continue; - ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); - ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_any[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i); + + if (vc4_regs[i].addr < 16) { + ra_class_add_reg(vc4->regs, vc4->reg_class_any[1], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[1], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i); + } + /* A only */ if (((i - AB_INDEX) & 1) == 0) { - ra_class_add_reg(vc4->regs, vc4->reg_class_a, i); - ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[0], i); + + if (vc4_regs[i].addr < 16) { + ra_class_add_reg(vc4->regs, + vc4->reg_class_a[1], i); + ra_class_add_reg(vc4->regs, + vc4->reg_class_r4_or_a[1], i); + } } } @@ -252,6 +278,17 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) class_bits[inst->src[0].index] &= CLASS_BIT_R0_R3; break; + case QOP_THRSW: + /* All accumulators are invalidated across a thread + * switch. 
+ */ + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) + class_bits[i] &= ~(CLASS_BIT_R0_R3 | + CLASS_BIT_R4); + } + break; + default: break; } @@ -288,21 +325,38 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) switch (class_bits[i]) { case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3: - ra_set_node_class(g, node, vc4->reg_class_any); + ra_set_node_class(g, node, + vc4->reg_class_any[c->fs_threaded]); + break; + case CLASS_BIT_A | CLASS_BIT_B: + ra_set_node_class(g, node, + vc4->reg_class_a_or_b[c->fs_threaded]); break; case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R0_R3: - ra_set_node_class(g, node, vc4->reg_class_a_or_b_or_acc); + ra_set_node_class(g, node, + vc4->reg_class_a_or_b_or_acc[c->fs_threaded]); break; case CLASS_BIT_A | CLASS_BIT_R4: - ra_set_node_class(g, node, vc4->reg_class_r4_or_a); + ra_set_node_class(g, node, + vc4->reg_class_r4_or_a[c->fs_threaded]); break; case CLASS_BIT_A: - ra_set_node_class(g, node, vc4->reg_class_a); + ra_set_node_class(g, node, + vc4->reg_class_a[c->fs_threaded]); break; case CLASS_BIT_R0_R3: ra_set_node_class(g, node, vc4->reg_class_r0_r3); break; + default: + /* DDX/DDY used across a thread switch might get us + * here. + */ + if (c->fs_threaded) { + c->failed = true; + return NULL; + } + fprintf(stderr, "temp %d: bad class bits: 0x%x\n", i, class_bits[i]); abort(); @@ -323,8 +377,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) bool ok = ra_allocate(g); if (!ok) { - fprintf(stderr, "Failed to register allocate:\n"); - qir_dump(c); + if (!c->fs_threaded) { + fprintf(stderr, "Failed to register allocate:\n"); + qir_dump(c); + } + c->failed = true; return NULL; } -- 2.30.2