r600g: don't reserve more stack space than required v5
authorVadim Girlin <vadimgirlin@gmail.com>
Tue, 2 Apr 2013 15:33:40 +0000 (19:33 +0400)
committerVadim Girlin <vadimgirlin@gmail.com>
Tue, 2 Apr 2013 15:34:14 +0000 (19:34 +0400)
Reduced stack size allows to run more threads in some cases,
improving performance for the shaders that use stack (that is, for the
shaders with control flow instructions). E.g. with unigine-based apps.

v4: implement exact computation taking into account wavefront size
v5: add cases for RV620, RS880

Signed-off-by: Vadim Girlin <vadimgirlin@gmail.com>
src/gallium/drivers/r600/r600_asm.c
src/gallium/drivers/r600/r600_asm.h
src/gallium/drivers/r600/r600_shader.c

index 65c705d0aa820f1562c8d00b5d24812d2236a934..c88b48dc96b953c7d523e611ca13cd906ebeb2cc 100644 (file)
@@ -87,6 +87,40 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void)
        return tex;
 }
 
+static unsigned stack_entry_size(enum radeon_family chip) {
+       /* Wavefront size:
+        *   64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/
+        *       Aruba/Sumo/Sumo2/redwood/juniper
+        *   32: R630/R730/R710/Palm/Cedar
+        *   16: R610/Rs780
+        *
+        * Stack row size:
+        *      Wavefront Size                        16  32  48  64
+        *      Columns per Row (R6xx/R7xx/R8xx only)  8   8   4   4
+        *      Columns per Row (R9xx+)                8   4   4   4 */
+
+       switch (chip) {
+       /* FIXME: are some chips missing here? */
+       /* wavefront size 16 */
+       case CHIP_RV610:
+       case CHIP_RS780:
+       case CHIP_RV620:
+       case CHIP_RS880:
+       /* wavefront size 32 */
+       case CHIP_RV630:
+       case CHIP_RV635:
+       case CHIP_RV730:
+       case CHIP_RV710:
+       case CHIP_PALM:
+       case CHIP_CEDAR:
+               return 8;
+
+       /* wavefront size 64 */
+       default:
+               return 4;
+       }
+}
+
 void r600_bytecode_init(struct r600_bytecode *bc,
                        enum chip_class chip_class,
                        enum radeon_family family,
@@ -104,6 +138,7 @@ void r600_bytecode_init(struct r600_bytecode *bc,
        LIST_INITHEAD(&bc->cf);
        bc->chip_class = chip_class;
        bc->msaa_texture_mode = msaa_texture_mode;
+       bc->stack.entry_size = stack_entry_size(family);
 }
 
 int r600_bytecode_add_cf(struct r600_bytecode *bc)
@@ -1522,8 +1557,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
        unsigned addr;
        int i, r;
 
-       if (bc->callstack[0].max > 0)
-               bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
+       bc->nstack = bc->stack.max_entries;
+
        if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
                bc->nstack = 1;
        }
@@ -1824,8 +1859,8 @@ void r600_bytecode_disasm(struct r600_bytecode *bc)
                chip = '6';
                break;
        }
-       fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n",
-               bc->ndw, bc->ngpr);
+       fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n",
+               bc->ndw, bc->ngpr, bc->nstack);
        fprintf(stderr, "shader %d -- %c\n", index++, chip);
 
        LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
index c1aa3bae4e3aa4598127f8de923029b0e78f23c9..c052ceabfc7713b2ddb5fe99eefd3feec98a16f0 100644 (file)
@@ -173,16 +173,25 @@ struct r600_cf_stack_entry {
 };
 
 #define SQ_MAX_CALL_DEPTH 0x00000020
-struct r600_cf_callstack {
-       unsigned                        fc_sp_before_entry;
-       int                             sub_desc_index;
-       int                             current;
-       int                             max;
-};
 
 #define AR_HANDLE_NORMAL 0
 #define AR_HANDLE_RV6XX 1 /* except RV670 */
 
+struct r600_stack_info {
+       /* current level of non-WQM PUSH operations
+        * (PUSH, PUSH_ELSE, ALU_PUSH_BEFORE) */
+       int push;
+       /* current level of WQM PUSH operations
+        * (PUSH, PUSH_ELSE, PUSH_WQM) */
+       int push_wqm;
+       /* current loop level */
+       int loop;
+
+       /* required depth */
+       int max_entries;
+       /* subentries per entry */
+       int entry_size;
+};
 
 struct r600_bytecode {
        enum chip_class                 chip_class;
@@ -199,8 +208,7 @@ struct r600_bytecode {
        uint32_t                        *bytecode;
        uint32_t                        fc_sp;
        struct r600_cf_stack_entry      fc_stack[32];
-       unsigned                        call_sp;
-       struct r600_cf_callstack        callstack[SQ_MAX_CALL_DEPTH];
+       struct r600_stack_info          stack;
        unsigned        ar_loaded;
        unsigned        ar_reg;
        unsigned        ar_chan;
index e74ed54443d7053456dbc0c365c99a455aa3534b..82885d1370e84497a4eed551f9a9eb45d732b347 100644 (file)
@@ -245,7 +245,7 @@ struct r600_shader_tgsi_instruction {
 
 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
-static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
+static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
 static int tgsi_else(struct r600_shader_ctx *ctx);
 static int tgsi_endif(struct r600_shader_ctx *ctx);
@@ -419,7 +419,7 @@ static void llvm_if(struct r600_shader_ctx *ctx)
 {
        r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
        fc_pushlevel(ctx, FC_IF);
-       callstack_check_depth(ctx, FC_PUSH_VPM, 0);
+       callstack_push(ctx, FC_PUSH_VPM);
 }
 
 static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx)
@@ -5551,63 +5551,107 @@ static int pops(struct r600_shader_ctx *ctx, int pops)
        return 0;
 }
 
-static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
+static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
+                                              unsigned reason)
+{
+       struct r600_stack_info *stack = &ctx->bc->stack;
+       unsigned elements, entries;
+
+       unsigned entry_size = stack->entry_size;
+
+       elements = (stack->loop + stack->push_wqm ) * entry_size;
+       elements += stack->push;
+
+       switch (ctx->bc->chip_class) {
+       case R600:
+       case R700:
+               /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
+                * the stack must be reserved to hold the current active/continue
+                * masks */
+               if (reason == FC_PUSH_VPM) {
+                       elements += 2;
+               }
+               break;
+
+       case CAYMAN:
+               /* r9xx: any stack operation on empty stack consumes 2 additional
+                * elements */
+               elements += 2;
+
+               /* fallthrough */
+               /* FIXME: do the two elements added above cover the cases for the
+                * r8xx+ below? */
+
+       case EVERGREEN:
+               /* r8xx+: 2 extra elements are not always required, but one extra
+                * element must be added for each of the following cases:
+                * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
+                *    stack usage.
+                *    (Currently we don't use ALU_ELSE_AFTER.)
+                * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
+                *    PUSH instruction executed.
+                *
+                *    NOTE: it seems we also need to reserve additional element in some
+                *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
+                *    then STACK_SIZE should be 2 instead of 1 */
+               if (reason == FC_PUSH_VPM) {
+                       elements += 1;
+               }
+               break;
+
+       default:
+               assert(0);
+               break;
+       }
+
+       /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
+        * for all chips, so we use 4 in the final formula, not the real entry_size
+        * for the chip */
+       entry_size = 4;
+
+       entries = (elements + (entry_size - 1)) / entry_size;
+
+       if (entries > stack->max_entries)
+               stack->max_entries = entries;
+}
+
+static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
 {
        switch(reason) {
        case FC_PUSH_VPM:
-               ctx->bc->callstack[ctx->bc->call_sp].current--;
+               --ctx->bc->stack.push;
+               assert(ctx->bc->stack.push >= 0);
                break;
        case FC_PUSH_WQM:
+               --ctx->bc->stack.push_wqm;
+               assert(ctx->bc->stack.push_wqm >= 0);
+               break;
        case FC_LOOP:
-               ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
+               --ctx->bc->stack.loop;
+               assert(ctx->bc->stack.loop >= 0);
                break;
-       case FC_REP:
-               /* TOODO : for 16 vp asic should -= 2; */
-               ctx->bc->callstack[ctx->bc->call_sp].current --;
+       default:
+               assert(0);
                break;
        }
 }
 
-static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
+static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
 {
-       if (check_max_only) {
-               int diff;
-               switch (reason) {
-               case FC_PUSH_VPM:
-                       diff = 1;
-                       break;
-               case FC_PUSH_WQM:
-                       diff = 4;
-                       break;
-               default:
-                       assert(0);
-                       diff = 0;
-               }
-               if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
-                   ctx->bc->callstack[ctx->bc->call_sp].max) {
-                       ctx->bc->callstack[ctx->bc->call_sp].max =
-                               ctx->bc->callstack[ctx->bc->call_sp].current + diff;
-               }
-               return;
-       }
        switch (reason) {
        case FC_PUSH_VPM:
-               ctx->bc->callstack[ctx->bc->call_sp].current++;
+               ++ctx->bc->stack.push;
                break;
        case FC_PUSH_WQM:
+               ++ctx->bc->stack.push_wqm;
        case FC_LOOP:
-               ctx->bc->callstack[ctx->bc->call_sp].current += 4;
-               break;
-       case FC_REP:
-               ctx->bc->callstack[ctx->bc->call_sp].current++;
+               ++ctx->bc->stack.loop;
                break;
+       default:
+               assert(0);
        }
 
-       if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
-           ctx->bc->callstack[ctx->bc->call_sp].max) {
-               ctx->bc->callstack[ctx->bc->call_sp].max =
-                       ctx->bc->callstack[ctx->bc->call_sp].current;
-       }
+       callstack_update_max_depth(ctx, reason);
 }
 
 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
@@ -5694,7 +5738,7 @@ static int tgsi_if(struct r600_shader_ctx *ctx)
 
        fc_pushlevel(ctx, FC_IF);
 
-       callstack_check_depth(ctx, FC_PUSH_VPM, 0);
+       callstack_push(ctx, FC_PUSH_VPM);
        return 0;
 }
 
@@ -5724,7 +5768,7 @@ static int tgsi_endif(struct r600_shader_ctx *ctx)
        }
        fc_poplevel(ctx);
 
-       callstack_decrease_current(ctx, FC_PUSH_VPM);
+       callstack_pop(ctx, FC_PUSH_VPM);
        return 0;
 }
 
@@ -5737,7 +5781,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
        fc_pushlevel(ctx, FC_LOOP);
 
        /* check stack depth */
-       callstack_check_depth(ctx, FC_LOOP, 0);
+       callstack_push(ctx, FC_LOOP);
        return 0;
 }
 
@@ -5766,7 +5810,7 @@ static int tgsi_endloop(struct r600_shader_ctx *ctx)
        }
        /* XXX add LOOPRET support */
        fc_poplevel(ctx);
-       callstack_decrease_current(ctx, FC_LOOP);
+       callstack_pop(ctx, FC_LOOP);
        return 0;
 }
 
@@ -5789,7 +5833,6 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
 
        fc_set_mid(ctx, fscp);
 
-       callstack_check_depth(ctx, FC_PUSH_VPM, 1);
        return 0;
 }