r600g: don't reserve more stack space than required v5

author Vadim Girlin <vadimgirlin@gmail.com>

Tue, 2 Apr 2013 15:33:40 +0000 (19:33 +0400)

committer Vadim Girlin <vadimgirlin@gmail.com>

Tue, 2 Apr 2013 15:34:14 +0000 (19:34 +0400)
author Vadim Girlin <vadimgirlin@gmail.com>
Tue, 2 Apr 2013 15:33:40 +0000 (19:33 +0400)
committer Vadim Girlin <vadimgirlin@gmail.com>
Tue, 2 Apr 2013 15:34:14 +0000 (19:34 +0400)
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c

index 65c705d0aa820f1562c8d00b5d24812d2236a934..c88b48dc96b953c7d523e611ca13cd906ebeb2cc 100644 (file)
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -87,6 +87,40 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void)
         return tex;
  }
  
+static unsigned stack_entry_size(enum radeon_family chip) {
+       /* Wavefront size:
+        *   64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/
+        *       Aruba/Sumo/Sumo2/redwood/juniper
+        *   32: RV630/RV635/RV730/RV710/Palm/Cedar
+        *   16: RV610/RV620/RS780/RS880
+        *
+        * Stack row size:
+        *      Wavefront Size                        16  32  48  64
+        *      Columns per Row (R6xx/R7xx/R8xx only)  8   8   4   4
+        *      Columns per Row (R9xx+)                8   4   4   4 */
+
+       switch (chip) {
+       /* FIXME: are some chips missing here? */
+       /* wavefront size 16 */
+       case CHIP_RV610:
+       case CHIP_RS780:
+       case CHIP_RV620:
+       case CHIP_RS880:
+       /* wavefront size 32 */
+       case CHIP_RV630:
+       case CHIP_RV635:
+       case CHIP_RV730:
+       case CHIP_RV710:
+       case CHIP_PALM:
+       case CHIP_CEDAR:
+               return 8;
+
+       /* wavefront size 64 */
+       default:
+               return 4;
+       }
+}
+
  void r600_bytecode_init(struct r600_bytecode *bc,
                         enum chip_class chip_class,
                         enum radeon_family family,
@@ -104,6 +138,7 @@ void r600_bytecode_init(struct r600_bytecode *bc,
         LIST_INITHEAD(&bc->cf);
         bc->chip_class = chip_class;
         bc->msaa_texture_mode = msaa_texture_mode;
+       bc->stack.entry_size = stack_entry_size(family);
  }
  
  int r600_bytecode_add_cf(struct r600_bytecode *bc)
@@ -1522,8 +1557,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
         unsigned addr;
         int i, r;
  
-       if (bc->callstack[0].max > 0)
-               bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
+       bc->nstack = bc->stack.max_entries;
+
         if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
                 bc->nstack = 1;
         }
@@ -1824,8 +1859,8 @@ void r600_bytecode_disasm(struct r600_bytecode *bc)
                 chip = '6';
                 break;
         }
-       fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n",
-               bc->ndw, bc->ngpr);
+       fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n",
+               bc->ndw, bc->ngpr, bc->nstack);
         fprintf(stderr, "shader %d -- %c\n", index++, chip);
  
         LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h

index c1aa3bae4e3aa4598127f8de923029b0e78f23c9..c052ceabfc7713b2ddb5fe99eefd3feec98a16f0 100644 (file)
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -173,16 +173,25 @@ struct r600_cf_stack_entry {
  };
  
  #define SQ_MAX_CALL_DEPTH 0x00000020
-struct r600_cf_callstack {
-       unsigned                        fc_sp_before_entry;
-       int                             sub_desc_index;
-       int                             current;
-       int                             max;
-};
  
  #define AR_HANDLE_NORMAL 0
  #define AR_HANDLE_RV6XX 1 /* except RV670 */
  
+struct r600_stack_info {
+       /* current level of non-WQM PUSH operations
+        * (PUSH, PUSH_ELSE, ALU_PUSH_BEFORE) */
+       int push;
+       /* current level of WQM PUSH operations
+        * (PUSH, PUSH_ELSE, PUSH_WQM) */
+       int push_wqm;
+       /* current loop level */
+       int loop;
+
+       /* required depth */
+       int max_entries;
+       /* subentries per entry */
+       int entry_size;
+};
  
  struct r600_bytecode {
         enum chip_class                 chip_class;
@@ -199,8 +208,7 @@ struct r600_bytecode {
         uint32_t                        *bytecode;
         uint32_t                        fc_sp;
         struct r600_cf_stack_entry      fc_stack[32];
-       unsigned                        call_sp;
-       struct r600_cf_callstack        callstack[SQ_MAX_CALL_DEPTH];
+       struct r600_stack_info          stack;
         unsigned        ar_loaded;
         unsigned        ar_reg;
         unsigned        ar_chan;
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c

index e74ed54443d7053456dbc0c365c99a455aa3534b..82885d1370e84497a4eed551f9a9eb45d732b347 100644 (file)
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -245,7 +245,7 @@ struct r600_shader_tgsi_instruction {
  
  static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
  static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
-static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
+static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
  static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
  static int tgsi_else(struct r600_shader_ctx *ctx);
  static int tgsi_endif(struct r600_shader_ctx *ctx);
@@ -419,7 +419,7 @@ static void llvm_if(struct r600_shader_ctx *ctx)
  {
         r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
         fc_pushlevel(ctx, FC_IF);
-       callstack_check_depth(ctx, FC_PUSH_VPM, 0);
+       callstack_push(ctx, FC_PUSH_VPM);
  }
  
  static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx)
@@ -5551,63 +5551,107 @@ static int pops(struct r600_shader_ctx *ctx, int pops)
         return 0;
  }
  
-static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
+static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
+                                              unsigned reason)
+{
+       struct r600_stack_info *stack = &ctx->bc->stack;
+       unsigned elements, entries;
+
+       unsigned entry_size = stack->entry_size;
+
+       elements = (stack->loop + stack->push_wqm ) * entry_size;
+       elements += stack->push;
+
+       switch (ctx->bc->chip_class) {
+       case R600:
+       case R700:
+               /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
+                * the stack must be reserved to hold the current active/continue
+                * masks */
+               if (reason == FC_PUSH_VPM) {
+                       elements += 2;
+               }
+               break;
+
+       case CAYMAN:
+               /* r9xx: any stack operation on empty stack consumes 2 additional
+                * elements */
+               elements += 2;
+
+               /* fallthrough */
+               /* FIXME: do the two elements added above cover the cases for the
+                * r8xx+ below? */
+
+       case EVERGREEN:
+               /* r8xx+: 2 extra elements are not always required, but one extra
+                * element must be added for each of the following cases:
+                * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
+                *    stack usage.
+                *    (Currently we don't use ALU_ELSE_AFTER.)
+                * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
+                *    PUSH instruction is executed.
+                *
+                *    NOTE: it seems we also need to reserve an additional element in some
+                *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
+                *    then STACK_SIZE should be 2 instead of 1 */
+               if (reason == FC_PUSH_VPM) {
+                       elements += 1;
+               }
+               break;
+
+       default:
+               assert(0);
+               break;
+       }
+
+       /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
+        * for all chips, so we use 4 in the final formula, not the real entry_size
+        * for the chip */
+       entry_size = 4;
+
+       entries = (elements + (entry_size - 1)) / entry_size;
+
+       if (entries > stack->max_entries)
+               stack->max_entries = entries;
+}
+
+static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
  {
         switch(reason) {
         case FC_PUSH_VPM:
-               ctx->bc->callstack[ctx->bc->call_sp].current--;
+               --ctx->bc->stack.push;
+               assert(ctx->bc->stack.push >= 0);
                 break;
         case FC_PUSH_WQM:
+               --ctx->bc->stack.push_wqm;
+               assert(ctx->bc->stack.push_wqm >= 0);
+               break;
         case FC_LOOP:
-               ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
+               --ctx->bc->stack.loop;
+               assert(ctx->bc->stack.loop >= 0);
                 break;
-       case FC_REP:
-               /* TOODO : for 16 vp asic should -= 2; */
-               ctx->bc->callstack[ctx->bc->call_sp].current --;
+       default:
+               assert(0);
                 break;
         }
  }
  
-static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
+static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
  {
-       if (check_max_only) {
-               int diff;
-               switch (reason) {
-               case FC_PUSH_VPM:
-                       diff = 1;
-                       break;
-               case FC_PUSH_WQM:
-                       diff = 4;
-                       break;
-               default:
-                       assert(0);
-                       diff = 0;
-               }
-               if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
-                   ctx->bc->callstack[ctx->bc->call_sp].max) {
-                       ctx->bc->callstack[ctx->bc->call_sp].max =
-                               ctx->bc->callstack[ctx->bc->call_sp].current + diff;
-               }
-               return;
-       }
         switch (reason) {
         case FC_PUSH_VPM:
-               ctx->bc->callstack[ctx->bc->call_sp].current++;
+               ++ctx->bc->stack.push;
                 break;
         case FC_PUSH_WQM:
+               ++ctx->bc->stack.push_wqm;
+               break; /* must not also bump stack.loop: callstack_pop() only undoes push_wqm */
         case FC_LOOP:
-               ctx->bc->callstack[ctx->bc->call_sp].current += 4;
-               break;
-       case FC_REP:
-               ctx->bc->callstack[ctx->bc->call_sp].current++;
+               ++ctx->bc->stack.loop;
                 break;
+       default:
+               assert(0);
         }
-
-       if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
-           ctx->bc->callstack[ctx->bc->call_sp].max) {
-               ctx->bc->callstack[ctx->bc->call_sp].max =
-                       ctx->bc->callstack[ctx->bc->call_sp].current;
-       }
+       callstack_update_max_depth(ctx, reason);
  }
  
  static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
@@ -5694,7 +5738,7 @@ static int tgsi_if(struct r600_shader_ctx *ctx)
  
         fc_pushlevel(ctx, FC_IF);
  
-       callstack_check_depth(ctx, FC_PUSH_VPM, 0);
+       callstack_push(ctx, FC_PUSH_VPM);
         return 0;
  }
  
@@ -5724,7 +5768,7 @@ static int tgsi_endif(struct r600_shader_ctx *ctx)
         }
         fc_poplevel(ctx);
  
-       callstack_decrease_current(ctx, FC_PUSH_VPM);
+       callstack_pop(ctx, FC_PUSH_VPM);
         return 0;
  }
  
@@ -5737,7 +5781,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
         fc_pushlevel(ctx, FC_LOOP);
  
         /* check stack depth */
-       callstack_check_depth(ctx, FC_LOOP, 0);
+       callstack_push(ctx, FC_LOOP);
         return 0;
  }
  
@@ -5766,7 +5810,7 @@ static int tgsi_endloop(struct r600_shader_ctx *ctx)
         }
         /* XXX add LOOPRET support */
         fc_poplevel(ctx);
-       callstack_decrease_current(ctx, FC_LOOP);
+       callstack_pop(ctx, FC_LOOP);
         return 0;
  }
  
@@ -5789,7 +5833,6 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
  
         fc_set_mid(ctx, fscp);
  
-       callstack_check_depth(ctx, FC_PUSH_VPM, 1);
         return 0;
  }
author	Vadim Girlin <vadimgirlin@gmail.com>
	Tue, 2 Apr 2013 15:33:40 +0000 (19:33 +0400)
committer	Vadim Girlin <vadimgirlin@gmail.com>
	Tue, 2 Apr 2013 15:34:14 +0000 (19:34 +0400)
src/gallium/drivers/r600/r600_asm.c		patch \| blob \| history
src/gallium/drivers/r600/r600_asm.h		patch \| blob \| history
src/gallium/drivers/r600/r600_shader.c		patch \| blob \| history