From 97e2aa31c6e735d5f6bd1d67a4dd8da2605aedc8 Mon Sep 17 00:00:00 2001 From: Henri Verbeet Date: Fri, 7 Jan 2011 17:06:11 +0100 Subject: [PATCH] r600g: Split ALU clauses based on used constant cache lines. --- src/gallium/drivers/r600/r600_asm.c | 146 +++++++++++++++++++++---- src/gallium/drivers/r600/r600_shader.c | 4 +- 2 files changed, 129 insertions(+), 21 deletions(-) diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index fd0e06e6c37..326724520b3 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -348,11 +348,12 @@ int check_read_slots(struct r600_bc *bc, struct r600_bc_alu *alu_first) } #endif +/* CB constants start at 512, and get translated to a kcache index when ALU + * clauses are constructed. Note that we handle kcache constants the same way + * as (the now gone) cfile constants, is that really required? */ static int is_const(int sel) { - if (sel > 255 && sel < 512) - return 1; - if (sel >= V_SQ_ALU_SRC_0 && sel <= V_SQ_ALU_SRC_LITERAL) + if (sel > 511 && sel < 4607) return 1; return 0; } @@ -413,6 +414,120 @@ static int check_and_set_bank_swizzle(struct r600_bc *bc, struct r600_bc_alu *al return 0; } +/* This code handles kcache lines as single blocks of 32 constants. We could + * probably do slightly better by recognizing that we actually have two + * consecutive lines of 16 constants, but the resulting code would also be + * somewhat more complicated. */ +static int r600_bc_alloc_kcache_lines(struct r600_bc *bc, struct r600_bc_alu *alu, int type) +{ + struct r600_bc_kcache *kcache = bc->cf_last->kcache; + unsigned int required_lines; + unsigned int free_lines = 0; + unsigned int cache_line[3]; + unsigned int count = 0; + unsigned int i, j; + int r; + + /* Collect required cache lines. */ + for (i = 0; i < 3; ++i) { + bool found = false; + unsigned int line; + + if (alu->src[i].sel < 512) + continue; + + line = ((alu->src[i].sel - 512) / 32) * 2; + + for (j = 0; j < count; ++j) { + if (cache_line[j] == line) { + found = true; + break; + } + } + + if (!found) + cache_line[count++] = line; + } + + /* This should never actually happen. */ + if (count >= 3) return -ENOMEM; + + for (i = 0; i < 2; ++i) { + if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) { + ++free_lines; + } + } + + /* Filter lines pulled in by previous intructions. Note that this is + * only for the required_lines count, we can't remove these from the + * cache_line array since we may have to start a new ALU clause. */ + for (i = 0, required_lines = count; i < count; ++i) { + for (j = 0; j < 2; ++j) { + if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 && + kcache[j].addr == cache_line[i]) { + --required_lines; + break; + } + } + } + + /* Start a new ALU clause if needed. */ + if (required_lines > free_lines) { + if ((r = r600_bc_add_cf(bc))) { + return r; + } + bc->cf_last->inst = (type << 3); + kcache = bc->cf_last->kcache; + } + + /* Setup the kcache lines. */ + for (i = 0; i < count; ++i) { + bool found = false; + + for (j = 0; j < 2; ++j) { + if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 && + kcache[j].addr == cache_line[i]) { + found = true; + break; + } + } + + if (found) continue; + + for (j = 0; j < 2; ++j) { + if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) { + kcache[j].bank = 0; + kcache[j].addr = cache_line[i]; + kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2; + break; + } + } + } + + /* Alter the src operands to refer to the kcache. */ + for (i = 0; i < 3; ++i) { + static const unsigned int base[] = {128, 160, 256, 288}; + unsigned int line; + + if (alu->src[i].sel < 512) + continue; + + alu->src[i].sel -= 512; + line = (alu->src[i].sel / 32) * 2; + + for (j = 0; j < 2; ++j) { + if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 && + kcache[j].addr == line) { + alu->src[i].sel &= 0x1f; + alu->src[i].sel += base[j]; + break; + } + } + } + + return 0; +} + int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type) { struct r600_bc_alu *nalu = r600_bc_alu(); @@ -434,6 +549,14 @@ int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int } bc->cf_last->inst = (type << 3); } + + /* Setup the kcache for this ALU instruction. This will start a new + * ALU clause if needed. */ + if ((r = r600_bc_alloc_kcache_lines(bc, nalu, type))) { + free(nalu); + return r; + } + if (!bc->cf_last->curr_bs_head) { bc->cf_last->curr_bs_head = nalu; LIST_INITHEAD(&nalu->bs_list); @@ -473,23 +596,6 @@ int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int bc->cf_last->ndw += 2; bc->ndw += 2; - /* The following configuration provides 64 128-bit constants. - * Each cacheline holds 16 128-bit constants and each - * kcache can lock 2 cachelines and there are 2 kcaches per - * ALU clause for a max of 64 constants. - * For supporting more than 64 constants, the code needs - * to be broken down into multiple ALU clauses. - */ - /* select the constant buffer (0-15) for each kcache */ - bc->cf_last->kcache[0].bank = 0; - bc->cf_last->kcache[1].bank = 0; - /* lock 2 cachelines per kcache; 4 total */ - bc->cf_last->kcache[0].mode = V_SQ_CF_KCACHE_LOCK_2; - bc->cf_last->kcache[1].mode = V_SQ_CF_KCACHE_LOCK_2; - /* set the cacheline offsets for each kcache */ - bc->cf_last->kcache[0].addr = 0; - bc->cf_last->kcache[1].addr = 2; - /* process cur ALU instructions for bank swizzle */ if (nalu->last) { check_and_set_bank_swizzle(bc, bc->cf_last->curr_bs_head); diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index bb5038c49b0..5c089f48896 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -540,7 +540,9 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] + ctx.info.file_count[TGSI_FILE_OUTPUT]; - ctx.file_offset[TGSI_FILE_CONSTANT] = 128; + /* Outside the GPR range. This will be translated to one of the + * kcache banks later. */ + ctx.file_offset[TGSI_FILE_CONSTANT] = 512; ctx.file_offset[TGSI_FILE_IMMEDIATE] = 253; ctx.temp_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] + -- 2.30.2