i965: Calculate thread_count in brw_alloc_stage_scratch
author Jordan Justen <jordan.l.justen@intel.com>
Tue, 31 Oct 2017 07:34:32 +0000 (00:34 -0700)
committer Jordan Justen <jordan.l.justen@intel.com>
Wed, 1 Nov 2017 06:36:54 +0000 (23:36 -0700)
Previously, thread_count was passed in from the stage after some
stage-specific calculations. Those stage-specific calculations were moved
into brw_alloc_stage_scratch, which will allow the shader cache to
also use the same calculations.

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_cs.c
src/mesa/drivers/dri/i965/brw_gs.c
src/mesa/drivers/dri/i965/brw_program.c
src/mesa/drivers/dri/i965/brw_tcs.c
src/mesa/drivers/dri/i965/brw_tes.c
src/mesa/drivers/dri/i965/brw_vs.c
src/mesa/drivers/dri/i965/brw_wm.c

index 0102f154248564a6a09be6708a10df940b8d22f9..3bee3e99ed27380a5968af975b5ab520715a5ead 100644 (file)
@@ -1345,8 +1345,7 @@ void brw_get_scratch_bo(struct brw_context *brw,
                        struct brw_bo **scratch_bo, int size);
 void brw_alloc_stage_scratch(struct brw_context *brw,
                              struct brw_stage_state *stage_state,
-                             unsigned per_thread_size,
-                             unsigned thread_count);
+                             unsigned per_thread_size);
 void brw_init_shader_time(struct brw_context *brw);
 int brw_get_shader_time_index(struct brw_context *brw,
                               struct gl_program *prog,
index 1a0e9f62c631e0556243f565780692b5a71d32f6..1d34a8a79d18cd18b58f218decd0c25a05c93788 100644 (file)
@@ -114,29 +114,7 @@ brw_codegen_cs_prog(struct brw_context *brw,
       }
    }
 
-   const unsigned subslices = MAX2(brw->screen->subslice_total, 1);
-
-   /* WaCSScratchSize:hsw
-    *
-    * Haswell's scratch space address calculation appears to be sparse
-    * rather than tightly packed.  The Thread ID has bits indicating
-    * which subslice, EU within a subslice, and thread within an EU
-    * it is.  There's a maximum of two slices and two subslices, so these
-    * can be stored with a single bit.  Even though there are only 10 EUs
-    * per subslice, this is stored in 4 bits, so there's an effective
-    * maximum value of 16 EUs.  Similarly, although there are only 7
-    * threads per EU, this is stored in a 3 bit number, giving an effective
-    * maximum value of 8 threads per EU.
-    *
-    * This means that we need to use 16 * 8 instead of 10 * 7 for the
-    * number of threads per subslice.
-    */
-   const unsigned scratch_ids_per_subslice =
-      devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads;
-
-   brw_alloc_stage_scratch(brw, &brw->cs.base,
-                           prog_data.base.total_scratch,
-                           scratch_ids_per_subslice * subslices);
+   brw_alloc_stage_scratch(brw, &brw->cs.base, prog_data.base.total_scratch);
 
    /* The param and pull_param arrays will be freed by the shader cache. */
    ralloc_steal(NULL, prog_data.base.param);
index e6e757ce686a1faefdc67521857eb185789492af..19eab2f5332757067a5ae905ce48e5f753bed4a9 100644 (file)
@@ -138,8 +138,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
 
    /* Scratch space is used for register spilling */
    brw_alloc_stage_scratch(brw, stage_state,
-                           prog_data.base.base.total_scratch,
-                           devinfo->max_gs_threads);
+                           prog_data.base.base.total_scratch);
 
    /* The param and pull_param arrays will be freed by the shader cache. */
    ralloc_steal(NULL, prog_data.base.base.param);
index 02d095b06f6651ddfee88bc51b330162601d7df1..7607bc388407304ce6f703d545890964c2aa7886 100644 (file)
@@ -328,19 +328,65 @@ brw_get_scratch_bo(struct brw_context *brw,
 void
 brw_alloc_stage_scratch(struct brw_context *brw,
                         struct brw_stage_state *stage_state,
-                        unsigned per_thread_size,
-                        unsigned thread_count)
+                        unsigned per_thread_size)
 {
-   if (stage_state->per_thread_scratch < per_thread_size) {
-      stage_state->per_thread_scratch = per_thread_size;
+   if (stage_state->per_thread_scratch >= per_thread_size)
+      return;
+
+   stage_state->per_thread_scratch = per_thread_size;
 
-      if (stage_state->scratch_bo)
-         brw_bo_unreference(stage_state->scratch_bo);
+   if (stage_state->scratch_bo)
+      brw_bo_unreference(stage_state->scratch_bo);
+
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   unsigned thread_count;
+   switch(stage_state->stage) {
+   case MESA_SHADER_VERTEX:
+      thread_count = devinfo->max_vs_threads;
+      break;
+   case MESA_SHADER_TESS_CTRL:
+      thread_count = devinfo->max_tcs_threads;
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      thread_count = devinfo->max_tes_threads;
+      break;
+   case MESA_SHADER_GEOMETRY:
+      thread_count = devinfo->max_gs_threads;
+      break;
+   case MESA_SHADER_FRAGMENT:
+      thread_count = devinfo->max_wm_threads;
+      break;
+   case MESA_SHADER_COMPUTE: {
+      const unsigned subslices = MAX2(brw->screen->subslice_total, 1);
+
+      /* WaCSScratchSize:hsw
+       *
+       * Haswell's scratch space address calculation appears to be sparse
+       * rather than tightly packed.  The Thread ID has bits indicating
+       * which subslice, EU within a subslice, and thread within an EU
+       * it is.  There's a maximum of two slices and two subslices, so these
+       * can be stored with a single bit.  Even though there are only 10 EUs
+       * per subslice, this is stored in 4 bits, so there's an effective
+       * maximum value of 16 EUs.  Similarly, although there are only 7
+       * threads per EU, this is stored in a 3 bit number, giving an effective
+       * maximum value of 8 threads per EU.
+       *
+       * This means that we need to use 16 * 8 instead of 10 * 7 for the
+       * number of threads per subslice.
+       */
+      const unsigned scratch_ids_per_subslice =
+         devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads;
 
-      stage_state->scratch_bo =
-         brw_bo_alloc(brw->bufmgr, "shader scratch space",
-                      per_thread_size * thread_count, 4096);
+      thread_count = scratch_ids_per_subslice * subslices;
+      break;
    }
+   default:
+      unreachable("Unsupported stage!");
+   }
+
+   stage_state->scratch_bo =
+      brw_bo_alloc(brw->bufmgr, "shader scratch space",
+                   per_thread_size * thread_count, 4096);
 }
 
 void brwInitFragProgFuncs( struct dd_function_table *functions )
index 5ac728ee68f90d139adfc2319a6116db49e082c8..685336eb973f7c1fdaffbd868af5706fea9b5d20 100644 (file)
@@ -259,8 +259,7 @@ brw_codegen_tcs_prog(struct brw_context *brw, struct brw_program *tcp,
 
    /* Scratch space is used for register spilling */
    brw_alloc_stage_scratch(brw, stage_state,
-                           prog_data.base.base.total_scratch,
-                           devinfo->max_tcs_threads);
+                           prog_data.base.base.total_scratch);
 
    /* The param and pull_param arrays will be freed by the shader cache. */
    ralloc_steal(NULL, prog_data.base.base.param);
index 21f44ae74afde1f88d0741691742fda6a9c464f3..225d2401c7e10cb45641ab6dc0b3e4bfecef8914 100644 (file)
@@ -129,8 +129,7 @@ brw_codegen_tes_prog(struct brw_context *brw,
 
    /* Scratch space is used for register spilling */
    brw_alloc_stage_scratch(brw, stage_state,
-                           prog_data.base.base.total_scratch,
-                           devinfo->max_tes_threads);
+                           prog_data.base.base.total_scratch);
 
    /* The param and pull_param arrays will be freed by the shader cache. */
    ralloc_steal(NULL, prog_data.base.base.param);
index d308bb83332b6ca4f2be804f5eddc7c4adc85354..de0ba79f3577589b3a5ea753dcfcba2852a4f0ef 100644 (file)
@@ -248,8 +248,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
 
    /* Scratch space is used for register spilling */
    brw_alloc_stage_scratch(brw, &brw->vs.base,
-                           prog_data.base.base.total_scratch,
-                           devinfo->max_vs_threads);
+                           prog_data.base.base.total_scratch);
 
    /* The param and pull_param arrays will be freed by the shader cache. */
    ralloc_steal(NULL, prog_data.base.base.param);
index 4144cd11ee43f641ef33cb9702e278b6e8f74ccf..34a3a1e5c16625447785afdd79da7243730b8e1e 100644 (file)
@@ -209,9 +209,7 @@ brw_codegen_wm_prog(struct brw_context *brw,
       }
    }
 
-   brw_alloc_stage_scratch(brw, &brw->wm.base,
-                           prog_data.base.total_scratch,
-                           devinfo->max_wm_threads);
+   brw_alloc_stage_scratch(brw, &brw->wm.base, prog_data.base.total_scratch);
 
    if (unlikely((INTEL_DEBUG & DEBUG_WM) && fp->program.is_arb_asm))
       fprintf(stderr, "\n");