r600g: improve kcache line sets handling v2
author Vadim Girlin <vadimgirlin@gmail.com>
Fri, 20 Jan 2012 19:24:32 +0000 (23:24 +0400)
committer Dave Airlie <airlied@redhat.com>
Sat, 21 Jan 2012 12:43:14 +0000 (12:43 +0000)
Add support for multiple kcache banks (constant buffers).
Lock the required lines only.
Allow up to 4 kcache line sets in the alu clause by using ALU_EXTENDED on eg+.

Signed-off-by: Vadim Girlin <vadimgirlin@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
src/gallium/drivers/r600/eg_asm.c
src/gallium/drivers/r600/eg_sq.h
src/gallium/drivers/r600/r600_asm.c
src/gallium/drivers/r600/r600_asm.h

index 877e162d841b2ad090ef0da6b710c56538a13411..e867ea4cae891113300aec4c7cfe5f0415e4c8a9 100644 (file)
@@ -38,6 +38,23 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
        case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
        case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
        case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
+               /* prepend ALU_EXTENDED if we need more than 2 kcache sets */
+               if (cf->eg_alu_extended) {
+                       bc->bytecode[id++] =
+                               S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(V_SQ_CF_INDEX_NONE) |
+                               S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(V_SQ_CF_INDEX_NONE) |
+                               S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(V_SQ_CF_INDEX_NONE) |
+                               S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(V_SQ_CF_INDEX_NONE) |
+                               S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(cf->kcache[2].bank) |
+                               S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(cf->kcache[3].bank) |
+                               S_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(cf->kcache[2].mode);
+                       bc->bytecode[id++] = EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_EXTENDED |
+                               S_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(cf->kcache[3].mode) |
+                               S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(cf->kcache[2].addr) |
+                               S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(cf->kcache[3].addr) |
+                               S_SQ_CF_ALU_WORD1_EXT_BARRIER(1);
+               }
+
                bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
                        S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
                        S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
index 854c1b8911a1adac002532f5a768897645bdbd62..eba42d09a0693a8f4bd5a5e8fbc571f11d5c4357 100644 (file)
 #define   S_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((x) & 0x3) << 30)
 #define   G_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((x) >> 30) & 0x3)
 #define   C_SQ_CF_ALU_WORD0_KCACHE_MODE0                             0x3FFFFFFF
+#define     V_SQ_CF_KCACHE_NOP                                       0x00000000
+#define     V_SQ_CF_KCACHE_LOCK_1                                    0x00000001
+#define     V_SQ_CF_KCACHE_LOCK_2                                    0x00000002
+#define     V_SQ_CF_KCACHE_LOCK_LOOP_INDEX                           0x00000003
 #define P_SQ_CF_ALU_WORD1
 #define   S_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) & 0x3) << 0)
 #define   G_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) >> 0) & 0x3)
 #define   S_SQ_CF_ALU_WORD1_BARRIER(x)                               (((x) & 0x1) << 31)
 #define   G_SQ_CF_ALU_WORD1_BARRIER(x)                               (((x) >> 31) & 0x1)
 #define   C_SQ_CF_ALU_WORD1_BARRIER                                  0x7FFFFFFF
-/* extended TODO */
+/* Fields of the two dwords of the CF ALU_EXTENDED instruction, which
+ * eg_bytecode_cf_build() writes ahead of the regular ALU words when
+ * cf->eg_alu_extended is set (they carry kcache sets 2 and 3).
+ * S_* places a value into its field, G_* extracts it, C_* is the mask
+ * that clears the field. */
+#define P_SQ_CF_ALU_WORD0_EXT
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(x)           (((x) & 0x3) << 4)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(x)           (((x) >> 4) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0              0xFFFFFFCF
+#define            V_SQ_CF_INDEX_NONE                                       0x00
+#define            V_SQ_CF_INDEX_0                                          0x01
+#define            V_SQ_CF_INDEX_1                                          0x02
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(x)           (((x) & 0x3) << 6)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(x)           (((x) >> 6) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1              0xFFFFFF3F
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(x)           (((x) & 0x3) << 8)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(x)           (((x) >> 8) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2              0xFFFFFCFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(x)           (((x) & 0x3) << 10)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(x)           (((x) >> 10) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3              0xFFFFF3FF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(x)                      (((x) & 0xF) << 22)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(x)                      (((x) >> 22) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2                         0xFC3FFFFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(x)                      (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(x)                      (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3                         0xC3FFFFFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(x)                      (((x) & 0x3) << 30)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(x)                      (((x) >> 30) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2                         0x3FFFFFFF
+
+#define P_SQ_CF_ALU_WORD1_EXT
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(x)                      (((x) & 0x3) << 0)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(x)                      (((x) >> 0) & 0x3)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3                         0xFFFFFFFC
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(x)                      (((x) & 0xFF) << 2)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(x)                      (((x) >> 2) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2                         0xFFFFFC03
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(x)                      (((x) & 0xFF) << 10)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(x)                      (((x) >> 10) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3                         0xFFFC03FF
+#define   S_SQ_CF_ALU_WORD1_EXT_CF_INST(x)                           (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD1_EXT_CF_INST(x)                           (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD1_EXT_CF_INST                              0xC3FFFFFF
+#define   S_SQ_CF_ALU_WORD1_EXT_BARRIER(x)                           (((x) & 0x1) << 31)
+#define   G_SQ_CF_ALU_WORD1_EXT_BARRIER(x)                           (((x) >> 31) & 0x1)
+#define   C_SQ_CF_ALU_WORD1_EXT_BARRIER                              0x7FFFFFFF
+
 /* done */
 #define P_SQ_CF_ALLOC_EXPORT_WORD0
 #define   S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(x)                   (((x) & 0x1FFF) << 0)
index 107c864f2a5142e47428aa59e59dd612de600fa7..6fd3a9137f2214fd82ea68a59aaf17f15482ccba 100644 (file)
@@ -273,8 +273,14 @@ static int r600_bytecode_add_cf(struct r600_bytecode *bc)
        if (cf == NULL)
                return -ENOMEM;
        LIST_ADDTAIL(&cf->list, &bc->cf);
-       if (bc->cf_last)
+       if (bc->cf_last) {
                cf->id = bc->cf_last->id + 2;
+               if (bc->cf_last->eg_alu_extended) {
+                       /* take into account extended alu size */
+                       cf->id += 2;
+                       bc->ndw += 2;
+               }
+       }
        bc->cf_last = cf;
        bc->ncf++;
        bc->ndw += 2;
@@ -1154,116 +1160,157 @@ static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu
        return 0;
 }
 
-/* This code handles kcache lines as single blocks of 32 constants. We could
- * probably do slightly better by recognizing that we actually have two
- * consecutive lines of 16 constants, but the resulting code would also be
- * somewhat more complicated. */
-static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
+/* we'll keep kcache sets sorted by bank & addr
+ *
+ * Make sure constant cache line `line` of bank `bank` is covered by one of
+ * the sets in `kcache`, by reusing a set that already covers it, growing a
+ * neighbouring set, inserting a new set in sorted position or taking a free
+ * set. Only the first 4 (EVERGREEN and newer) or 2 (older chips) entries of
+ * `kcache` are considered.
+ *
+ * Returns 0 when the line is covered, -ENOMEM when no set is available or
+ * an unsupported lock mode is met. `kcache` may already have been modified
+ * when -ENOMEM is returned.
+ */
+static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc,
+               struct r600_bytecode_kcache *kcache,
+               unsigned bank, unsigned line)
 {
-       struct r600_bytecode_kcache *kcache = bc->cf_last->kcache;
-       unsigned int required_lines;
-       unsigned int free_lines = 0;
-       unsigned int cache_line[3];
-       unsigned int count = 0;
-       unsigned int i, j;
-       int r;
+       int i, kcache_banks = bc->chip_class >= EVERGREEN ? 4 : 2; /* number of kcache sets that may be used */
 
-       /* Collect required cache lines. */
-       for (i = 0; i < 3; ++i) {
-               boolean found = false;
-               unsigned int line;
+       for (i = 0; i < kcache_banks; i++) {
+               if (kcache[i].mode) {
+                       int d;
 
-               if (alu->src[i].sel < 512)
-                       continue;
+                       if (kcache[i].bank < bank) /* sets of lower banks sort before the new line */
+                               continue;
 
-               line = ((alu->src[i].sel - 512) / 32) * 2;
+                       if ((kcache[i].bank == bank && kcache[i].addr > line+1) ||
+                                       kcache[i].bank > bank) {
+                               /* try to insert new line: set i sorts after it and
+                                * cannot be grown to cover it */
+                               if (kcache[kcache_banks-1].mode) {
+                                       /* all sets are in use */
+                                       return -ENOMEM;
+                               }
 
-               for (j = 0; j < count; ++j) {
-                       if (cache_line[j] == line) {
-                               found = true;
-                               break;
+                               memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache)); /* shift sets i.. up by one slot to make room */
+                               kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
+                               kcache[i].bank = bank;
+                               kcache[i].addr = line;
+                               return 0;
                         }
-               }
 
-               if (!found)
-                       cache_line[count++] = line;
-       }
+                       d = line - kcache[i].addr; /* same bank here: signed distance from the first line of set i */
 
-       /* This should never actually happen. */
-       if (count >= 3) return -ENOMEM;
-
-       for (i = 0; i < 2; ++i) {
-               if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
-                       ++free_lines;
-               }
-       }
-
-       /* Filter lines pulled in by previous intructions. Note that this is
-        * only for the required_lines count, we can't remove these from the
-        * cache_line array since we may have to start a new ALU clause. */
-       for (i = 0, required_lines = count; i < count; ++i) {
-               for (j = 0; j < 2; ++j) {
-                       if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-                           kcache[j].addr == cache_line[i]) {
-                               --required_lines;
-                               break;
-                       }
+                       if (d == -1) { /* line directly precedes set i: move the set start down */
+                               kcache[i].addr--;
+                               if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) {
+                                       /* we are prepending the line to the current set,
+                                        * discarding the existing second line,
+                                        * so we'll have to insert line+2 after it */
+                                       line += 2;
+                                       continue;
+                               } else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) {
+                                       kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
+                                       return 0;
+                               } else {
+                                       /* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
+                                       return -ENOMEM;
+                               }
+                       } else if (d == 1) { /* line directly follows the first line of set i */
+                               kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
+                               return 0;
+                       } else if (d == 0) /* line is the first line of set i; for d > 1 we move on to the next set */
+                               return 0;
+               } else { /* free kcache set - use it */
+                       kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
+                       kcache[i].bank = bank;
+                       kcache[i].addr = line;
+                       return 0;
                 }
         }
+       return -ENOMEM; /* every usable set was visited without covering the line */
+}
 
-       /* Start a new ALU clause if needed. */
-       if (required_lines > free_lines) {
-               if ((r = r600_bytecode_add_cf(bc))) {
-                       return r;
-               }
-               bc->cf_last->inst = type;
-               kcache = bc->cf_last->kcache;
-       }
+static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc,
+               struct r600_bytecode_kcache *kcache,
+               struct r600_bytecode_alu *alu)
+{      /* Reserve, in the given kcache sets, a line for every source operand
+        * of `alu` whose sel is 512 or above. Returns 0 on success or the
+        * first non-zero result of r600_bytecode_alloc_kcache_line(). */
+       int i, r;
 
-       /* Setup the kcache lines. */
-       for (i = 0; i < count; ++i) {
-               boolean found = false;
+       for (i = 0; i < 3; i++) {
+               unsigned bank, line, sel = alu->src[i].sel;
 
-               for (j = 0; j < 2; ++j) {
-                       if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-                           kcache[j].addr == cache_line[i]) {
-                               found = true;
-                               break;
-                       }
-               }
+               if (sel < 512) /* operand needs no kcache line */
+                       continue;
 
-               if (found) continue;
+               bank = alu->src[i].kc_bank;
+               line = (sel-512)>>4; /* 16 consecutive sel values share one line */
 
-               for (j = 0; j < 2; ++j) {
-                       if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
-                               kcache[j].bank = 0;
-                               kcache[j].addr = cache_line[i];
-                               kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
-                               break;
-                       }
-               }
+               if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line)))
+                       return r;
         }
+       return 0;
+}
+
+static int r600_bytecode_assign_kcache_banks(struct r600_bytecode *bc,
+               struct r600_bytecode_alu *alu,
+               struct r600_bytecode_kcache * kcache)
+{      /* Rewrite the sel of every source operand of `alu` that is 512 or
+        * above so that it addresses the kcache set locking its line.
+        * Returns 0, or -ENOMEM when a set with mode NOP or LOCK_LOOP_INDEX
+        * is reached before a matching set was found. `bc` is not used. */
+       int i, j;
 
         /* Alter the src operands to refer to the kcache. */
         for (i = 0; i < 3; ++i) {
                 static const unsigned int base[] = {128, 160, 256, 288}; /* sel value of the first constant of kcache set 0..3 */
-               unsigned int line;
+               unsigned int line, sel = alu->src[i].sel, found = 0;
 
-               if (alu->src[i].sel < 512)
+               if (sel < 512)
                         continue;
 
-               alu->src[i].sel -= 512;
-               line = (alu->src[i].sel / 32) * 2;
+               sel -= 512;
+               line = sel>>4; /* 16 consecutive sel values share one line */
 
-               for (j = 0; j < 2; ++j) {
-                       if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-                           kcache[j].addr == line) {
-                               alu->src[i].sel &= 0x1f;
-                               alu->src[i].sel += base[j];
-                               break;
+               for (j = 0; j < 4 && !found; ++j) { /* stops at the first matching set; if none of the 4 matches, src sel is left unchanged */
+                       switch (kcache[j].mode) {
+                       case V_SQ_CF_KCACHE_NOP:
+                       case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX:
+                               R600_ERR("unexpected kcache line mode\n");
+                               return -ENOMEM;
+                       default:
+                               /* for LOCK_1/LOCK_2 the mode value (1 or 2) doubles
+                                * as the number of lines locked by the set */
+                               if (kcache[j].bank == alu->src[i].kc_bank &&
+                                               kcache[j].addr <= line &&
+                                               line < kcache[j].addr + kcache[j].mode) {
+                                       alu->src[i].sel = sel - (kcache[j].addr<<4); /* offset from the start of the set */
+                                       alu->src[i].sel += base[j];
+                                       found=1;
+                           }
                         }
                 }
         }
+       return 0;
+}
+/* Lock the kcache lines needed by the source operands of `alu` in the
+ * current clause (bc->cf_last). If they do not fit, a new clause is added
+ * with r600_bytecode_add_cf(), its inst is set to `type`, and the
+ * allocation is retried there. When the third kcache set ends up in use,
+ * the clause is flagged eg_alu_extended; on chips older than EVERGREEN
+ * that case returns -ENOMEM instead.
+ *
+ * Returns 0 on success, otherwise the error from r600_bytecode_add_cf(),
+ * from the retried allocation, or -ENOMEM as described above.
+ */
+static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
+{
+       struct r600_bytecode_kcache kcache_sets[4];
+       struct r600_bytecode_kcache *kcache = kcache_sets;
+       int r;
+
+       memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache)); /* work on a copy so a failed attempt leaves the clause untouched */
+
+       if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
+               /* can't alloc, need to start new clause */
+               if ((r = r600_bytecode_add_cf(bc))) {
+                       return r;
+               }
+               bc->cf_last->inst = type;
+
+               /* retry with the new clause, this time directly on its own sets.
+                * NOTE(review): relies on the new clause's kcache sets starting
+                * out as NOP - confirm where the cf is allocated */
+               kcache = bc->cf_last->kcache;
+               if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
+                       /* can't alloc again- should never happen */
+                       return r;
+               }
+       } else {
+               /* update kcache sets */
+               memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache));
+       }
+
+       /* if we actually used more than 2 kcache sets - use ALU_EXTENDED on eg+ */
+       if (kcache[2].mode != V_SQ_CF_KCACHE_NOP) {
+               if (bc->chip_class < EVERGREEN)
+                       return -ENOMEM;
+               bc->cf_last->eg_alu_extended = 1;
+       }
 
         return 0;
 }
@@ -1933,6 +1980,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
                                        if (r)
                                                return r;
                                        r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
+                                       r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
+
                                        switch(bc->chip_class) {
                                        case EVERGREEN: /* eg alu is same encoding as r700 */
                                        case CAYMAN:
@@ -2028,6 +2077,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
                                        if (r)
                                                return r;
                                        r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
+                                       r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
+
                                        switch(bc->chip_class) {
                                        case R600:
                                                r = r600_bytecode_alu_build(bc, alu, addr);
@@ -2168,6 +2219,19 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
                        case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
                        case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
                        case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
+                               if (cf->eg_alu_extended) {
+                                       fprintf(stderr, "%04d %08X ALU_EXT0 ", id, bc->bytecode[id]);
+                                       fprintf(stderr, "KCACHE_BANK2:%X ", cf->kcache[2].bank);
+                                       fprintf(stderr, "KCACHE_BANK3:%X ", cf->kcache[3].bank);
+                                       fprintf(stderr, "KCACHE_MODE2:%X\n", cf->kcache[2].mode);
+                                       id++;
+                                       fprintf(stderr, "%04d %08X ALU_EXT1 ", id, bc->bytecode[id]);
+                                       fprintf(stderr, "KCACHE_MODE3:%X ", cf->kcache[3].mode);
+                                       fprintf(stderr, "KCACHE_ADDR2:%X ", cf->kcache[2].addr);
+                                       fprintf(stderr, "KCACHE_ADDR3:%X\n", cf->kcache[3].addr);
+                                       id++;
+                               }
+
                                fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
                                fprintf(stderr, "ADDR:%d ", cf->addr);
                                fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
index 00f7e591ac308508fb7aea3b3966d23a2a26dc04..a68b299c71396011314bac2e1399577c0e20848a 100644 (file)
@@ -32,6 +32,7 @@ struct r600_bytecode_alu_src {
        unsigned                        neg;
        unsigned                        abs;
        unsigned                        rel;
+       unsigned                        kc_bank;
        uint32_t                        value;
 };
 
@@ -144,8 +145,9 @@ struct r600_bytecode_cf {
        unsigned                        cond;
        unsigned                        pop_count;
        unsigned                        cf_addr; /* control flow addr */
-       struct r600_bytecode_kcache             kcache[2];
+       struct r600_bytecode_kcache             kcache[4];
        unsigned                        r6xx_uses_waterfall;
+       unsigned                        eg_alu_extended;
        struct list_head                alu;
        struct list_head                tex;
        struct list_head                vtx;