From d649bf51ec787021f7872e2a4c09fb2188c0891b Mon Sep 17 00:00:00 2001 From: Vadim Girlin Date: Fri, 20 Jan 2012 23:24:32 +0400 Subject: [PATCH] r600g: improve kcache line sets handling v2 Add support for multiple kcache banks (constant buffers). Lock the required lines only. Allow up to 4 kcache line sets in the alu clause by using ALU_EXTENDED on eg+. Signed-off-by: Vadim Girlin Signed-off-by: Dave Airlie --- src/gallium/drivers/r600/eg_asm.c | 17 ++ src/gallium/drivers/r600/eg_sq.h | 49 +++++- src/gallium/drivers/r600/r600_asm.c | 232 ++++++++++++++++++---------- src/gallium/drivers/r600/r600_asm.h | 4 +- 4 files changed, 216 insertions(+), 86 deletions(-) diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c index 877e162d841..e867ea4cae8 100644 --- a/src/gallium/drivers/r600/eg_asm.c +++ b/src/gallium/drivers/r600/eg_asm.c @@ -38,6 +38,23 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf) case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER: case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER: case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE: + /* prepend ALU_EXTENDED if we need more than 2 kcache sets */ + if (cf->eg_alu_extended) { + bc->bytecode[id++] = + S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(V_SQ_CF_INDEX_NONE) | + S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(V_SQ_CF_INDEX_NONE) | + S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(V_SQ_CF_INDEX_NONE) | + S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(V_SQ_CF_INDEX_NONE) | + S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(cf->kcache[2].bank) | + S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(cf->kcache[3].bank) | + S_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(cf->kcache[2].mode); + bc->bytecode[id++] = EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_EXTENDED | + S_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(cf->kcache[3].mode) | + S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(cf->kcache[2].addr) | + S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(cf->kcache[3].addr) | + S_SQ_CF_ALU_WORD1_EXT_BARRIER(1); + } + bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) | S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) | S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) | diff --git a/src/gallium/drivers/r600/eg_sq.h b/src/gallium/drivers/r600/eg_sq.h index 854c1b8911a..eba42d09a06 100644 --- a/src/gallium/drivers/r600/eg_sq.h +++ b/src/gallium/drivers/r600/eg_sq.h @@ -78,6 +78,10 @@ #define S_SQ_CF_ALU_WORD0_KCACHE_MODE0(x) (((x) & 0x3) << 30) #define G_SQ_CF_ALU_WORD0_KCACHE_MODE0(x) (((x) >> 30) & 0x3) #define C_SQ_CF_ALU_WORD0_KCACHE_MODE0 0x3FFFFFFF +#define V_SQ_CF_KCACHE_NOP 0x00000000 +#define V_SQ_CF_KCACHE_LOCK_1 0x00000001 +#define V_SQ_CF_KCACHE_LOCK_2 0x00000002 +#define V_SQ_CF_KCACHE_LOCK_LOOP_INDEX 0x00000003 #define P_SQ_CF_ALU_WORD1 #define S_SQ_CF_ALU_WORD1_KCACHE_MODE1(x) (((x) & 0x3) << 0) #define G_SQ_CF_ALU_WORD1_KCACHE_MODE1(x) (((x) >> 0) & 0x3) @@ -103,7 +107,50 @@ #define S_SQ_CF_ALU_WORD1_BARRIER(x) (((x) & 0x1) << 31) #define G_SQ_CF_ALU_WORD1_BARRIER(x) (((x) >> 31) & 0x1) #define C_SQ_CF_ALU_WORD1_BARRIER 0x7FFFFFFF -/* extended TODO */ + +#define P_SQ_CF_ALU_WORD0_EXT +#define S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(x) (((x) & 0x3) << 4) +#define G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(x) (((x) >> 4) & 0x3) +#define C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0 0xFFFFFFCF +#define V_SQ_CF_INDEX_NONE 0x00 +#define V_SQ_CF_INDEX_0 0x01 +#define V_SQ_CF_INDEX_1 0x02 +#define S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(x) (((x) & 0x3) << 6) +#define G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(x) (((x) >> 6) & 0x3) +#define C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1 0xFFFFFF3F +#define S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(x) (((x) & 0x3) << 8) +#define G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(x) (((x) >> 8) & 0x3) +#define C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2 0xFFFFFCFF +#define S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(x) (((x) & 0x3) << 10) +#define G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(x) (((x) >> 10) & 0x3) +#define C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3 0xFFFFF3FF +#define S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(x) (((x) & 0xF) << 22) +#define G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(x) (((x) >> 22) & 0xF) +#define C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2 0xFC3FFFFF +#define S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(x) (((x) & 0xF) << 26) +#define G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(x) (((x) >> 26) & 0xF) +#define C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3 0xC3FFFFFF +#define S_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(x) (((x) & 0x3) << 30) +#define G_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(x) (((x) >> 30) & 0x3) +#define C_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2 0x3FFFFFFF + +#define P_SQ_CF_ALU_WORD1_EXT +#define S_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(x) (((x) & 0x3) << 0) +#define G_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(x) (((x) >> 0) & 0x3) +#define C_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3 0xFFFFFFFC +#define S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(x) (((x) & 0xFF) << 2) +#define G_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(x) (((x) >> 2) & 0xFF) +#define C_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2 0xFFFFFC03 +#define S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(x) (((x) & 0xFF) << 10) +#define G_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(x) (((x) >> 10) & 0xFF) +#define C_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3 0xFFFC03FF +#define S_SQ_CF_ALU_WORD1_EXT_CF_INST(x) (((x) & 0xF) << 26) +#define G_SQ_CF_ALU_WORD1_EXT_CF_INST(x) (((x) >> 26) & 0xF) +#define C_SQ_CF_ALU_WORD1_EXT_CF_INST 0xC3FFFFFF +#define S_SQ_CF_ALU_WORD1_EXT_BARRIER(x) (((x) & 0x1) << 31) +#define G_SQ_CF_ALU_WORD1_EXT_BARRIER(x) (((x) >> 31) & 0x1) +#define C_SQ_CF_ALU_WORD1_EXT_BARRIER 0x7FFFFFFF + /* done */ #define P_SQ_CF_ALLOC_EXPORT_WORD0 #define S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(x) (((x) & 0x1FFF) << 0) diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index 107c864f2a5..6fd3a9137f2 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -273,8 +273,14 @@ static int r600_bytecode_add_cf(struct r600_bytecode *bc) if (cf == NULL) return -ENOMEM; LIST_ADDTAIL(&cf->list, &bc->cf); - if (bc->cf_last) + if (bc->cf_last) { cf->id = bc->cf_last->id + 2; + if (bc->cf_last->eg_alu_extended) { + /* take into account extended alu size */ + cf->id += 2; + bc->ndw += 2; + } + } bc->cf_last = cf; bc->ncf++; bc->ndw += 2; @@ -1154,116 +1160,157 @@ static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu return 0; } -/* This code handles kcache lines as single blocks of 32 constants. We could - * probably do slightly better by recognizing that we actually have two - * consecutive lines of 16 constants, but the resulting code would also be - * somewhat more complicated. */ -static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type) +/* we'll keep kcache sets sorted by bank & addr */ +static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc, + struct r600_bytecode_kcache *kcache, + unsigned bank, unsigned line) { - struct r600_bytecode_kcache *kcache = bc->cf_last->kcache; - unsigned int required_lines; - unsigned int free_lines = 0; - unsigned int cache_line[3]; - unsigned int count = 0; - unsigned int i, j; - int r; + int i, kcache_banks = bc->chip_class >= EVERGREEN ? 4 : 2; - /* Collect required cache lines. */ - for (i = 0; i < 3; ++i) { - boolean found = false; - unsigned int line; + for (i = 0; i < kcache_banks; i++) { + if (kcache[i].mode) { + int d; - if (alu->src[i].sel < 512) - continue; + if (kcache[i].bank < bank) + continue; - line = ((alu->src[i].sel - 512) / 32) * 2; + if ((kcache[i].bank == bank && kcache[i].addr > line+1) || + kcache[i].bank > bank) { + /* try to insert new line */ + if (kcache[kcache_banks-1].mode) { + /* all sets are in use */ + return -ENOMEM; + } - for (j = 0; j < count; ++j) { - if (cache_line[j] == line) { - found = true; - break; + memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache)); + kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1; + kcache[i].bank = bank; + kcache[i].addr = line; + return 0; } - } - if (!found) - cache_line[count++] = line; - } + d = line - kcache[i].addr; - /* This should never actually happen. */ - if (count >= 3) return -ENOMEM; - - for (i = 0; i < 2; ++i) { - if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) { - ++free_lines; - } - } - - /* Filter lines pulled in by previous intructions. Note that this is - * only for the required_lines count, we can't remove these from the - * cache_line array since we may have to start a new ALU clause. */ - for (i = 0, required_lines = count; i < count; ++i) { - for (j = 0; j < 2; ++j) { - if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 && - kcache[j].addr == cache_line[i]) { - --required_lines; - break; - } + if (d == -1) { + kcache[i].addr--; + if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) { + /* we are prepending the line to the current set, + * discarding the existing second line, + * so we'll have to insert line+2 after it */ + line += 2; + continue; + } else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) { + kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2; + return 0; + } else { + /* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */ + return -ENOMEM; + } + } else if (d == 1) { + kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2; + return 0; + } else if (d == 0) + return 0; + } else { /* free kcache set - use it */ + kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1; + kcache[i].bank = bank; + kcache[i].addr = line; + return 0; } } + return -ENOMEM; +} - /* Start a new ALU clause if needed. */ - if (required_lines > free_lines) { - if ((r = r600_bytecode_add_cf(bc))) { - return r; - } - bc->cf_last->inst = type; - kcache = bc->cf_last->kcache; - } +static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc, + struct r600_bytecode_kcache *kcache, + struct r600_bytecode_alu *alu) +{ + int i, r; - /* Setup the kcache lines. */ - for (i = 0; i < count; ++i) { - boolean found = false; + for (i = 0; i < 3; i++) { + unsigned bank, line, sel = alu->src[i].sel; - for (j = 0; j < 2; ++j) { - if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 && - kcache[j].addr == cache_line[i]) { - found = true; - break; - } - } + if (sel < 512) + continue; - if (found) continue; + bank = alu->src[i].kc_bank; + line = (sel-512)>>4; - for (j = 0; j < 2; ++j) { - if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) { - kcache[j].bank = 0; - kcache[j].addr = cache_line[i]; - kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2; - break; - } - } + if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line))) + return r; } + return 0; +} + +static int r600_bytecode_assign_kcache_banks(struct r600_bytecode *bc, + struct r600_bytecode_alu *alu, + struct r600_bytecode_kcache * kcache) +{ + int i, j; /* Alter the src operands to refer to the kcache. */ for (i = 0; i < 3; ++i) { static const unsigned int base[] = {128, 160, 256, 288}; - unsigned int line; + unsigned int line, sel = alu->src[i].sel, found = 0; - if (alu->src[i].sel < 512) + if (sel < 512) continue; - alu->src[i].sel -= 512; - line = (alu->src[i].sel / 32) * 2; + sel -= 512; + line = sel>>4; - for (j = 0; j < 2; ++j) { - if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 && - kcache[j].addr == line) { - alu->src[i].sel &= 0x1f; - alu->src[i].sel += base[j]; - break; + for (j = 0; j < 4 && !found; ++j) { + switch (kcache[j].mode) { + case V_SQ_CF_KCACHE_NOP: + case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX: + R600_ERR("unexpected kcache line mode\n"); + return -ENOMEM; + default: + if (kcache[j].bank == alu->src[i].kc_bank && + kcache[j].addr <= line && + line < kcache[j].addr + kcache[j].mode) { + alu->src[i].sel = sel - (kcache[j].addr<<4); + alu->src[i].sel += base[j]; + found=1; + } } } } + return 0; +} + +static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type) +{ + struct r600_bytecode_kcache kcache_sets[4]; + struct r600_bytecode_kcache *kcache = kcache_sets; + int r; + + memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache)); + + if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) { + /* can't alloc, need to start new clause */ + if ((r = r600_bytecode_add_cf(bc))) { + return r; + } + bc->cf_last->inst = type; + + /* retry with the new clause */ + kcache = bc->cf_last->kcache; + if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) { + /* can't alloc again- should never happen */ + return r; + } + } else { + /* update kcache sets */ + memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache)); + } + + /* if we actually used more than 2 kcache sets - use ALU_EXTENDED on eg+ */ + if (kcache[2].mode != V_SQ_CF_KCACHE_NOP) { + if (bc->chip_class < EVERGREEN) + return -ENOMEM; + bc->cf_last->eg_alu_extended = 1; + } return 0; } @@ -1933,6 +1980,8 @@ int r600_bytecode_build(struct r600_bytecode *bc) if (r) return r; r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral); + r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache); + switch(bc->chip_class) { case EVERGREEN: /* eg alu is same encoding as r700 */ case CAYMAN: @@ -2028,6 +2077,8 @@ int r600_bytecode_build(struct r600_bytecode *bc) if (r) return r; r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral); + r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache); + switch(bc->chip_class) { case R600: r = r600_bytecode_alu_build(bc, alu, addr); @@ -2168,6 +2219,19 @@ void r600_bytecode_dump(struct r600_bytecode *bc) case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER: case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER: case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE: + if (cf->eg_alu_extended) { + fprintf(stderr, "%04d %08X ALU_EXT0 ", id, bc->bytecode[id]); + fprintf(stderr, "KCACHE_BANK2:%X ", cf->kcache[2].bank); + fprintf(stderr, "KCACHE_BANK3:%X ", cf->kcache[3].bank); + fprintf(stderr, "KCACHE_MODE2:%X\n", cf->kcache[2].mode); + id++; + fprintf(stderr, "%04d %08X ALU_EXT1 ", id, bc->bytecode[id]); + fprintf(stderr, "KCACHE_MODE3:%X ", cf->kcache[3].mode); + fprintf(stderr, "KCACHE_ADDR2:%X ", cf->kcache[2].addr); + fprintf(stderr, "KCACHE_ADDR3:%X\n", cf->kcache[3].addr); + id++; + } + fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]); fprintf(stderr, "ADDR:%d ", cf->addr); fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode); diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index 00f7e591ac3..a68b299c713 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -32,6 +32,7 @@ struct r600_bytecode_alu_src { unsigned neg; unsigned abs; unsigned rel; + unsigned kc_bank; uint32_t value; }; @@ -144,8 +145,9 @@ struct r600_bytecode_cf { unsigned cond; unsigned pop_count; unsigned cf_addr; /* control flow addr */ - struct r600_bytecode_kcache kcache[2]; + struct r600_bytecode_kcache kcache[4]; unsigned r6xx_uses_waterfall; + unsigned eg_alu_extended; struct list_head alu; struct list_head tex; struct list_head vtx; -- 2.30.2