iris: implement scratch space!
author Kenneth Graunke <kenneth@whitecape.org>
Thu, 8 Nov 2018 06:05:14 +0000 (22:05 -0800)
committer Kenneth Graunke <kenneth@whitecape.org>
Thu, 21 Feb 2019 18:26:09 +0000 (10:26 -0800)
We borrow the approach from anv rather than i965, as it works better
with pre-baked state that needs to contain scratch BO addresses.

Fixes a bunch of varying packing tests.

src/gallium/drivers/iris/iris_context.h
src/gallium/drivers/iris/iris_program.c
src/gallium/drivers/iris/iris_program_cache.c
src/gallium/drivers/iris/iris_state.c

index 3f68cb3552ca56673e71440ab34f557c30682175..54aa1a509e864f3bcb553a80a9e70f764193adde 100644 (file)
@@ -328,7 +328,7 @@ struct iris_vtable {
                                  uint64_t imm);
 
    unsigned (*derived_program_state_size)(enum iris_program_cache_id id);
-   void (*store_derived_program_state)(const struct gen_device_info *devinfo,
+   void (*store_derived_program_state)(struct iris_context *ice,
                                        enum iris_program_cache_id cache_id,
                                        struct iris_compiled_shader *shader);
    uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol,
@@ -394,6 +394,14 @@ struct iris_context {
       struct hash_table *cache;
 
       unsigned urb_size;
+
+      /**
+       * Scratch buffers for various sizes and stages.
+       *
+       * Indexed by the "Per-Thread Scratch Space" field's 4-bit encoding,
+       * and shader stage.
+       */
+      struct iris_bo *scratch_bos[1 << 4][MESA_SHADER_STAGES];
    } shaders;
 
    struct {
@@ -552,7 +560,9 @@ const struct shader_info *iris_get_shader_info(const struct iris_context *ice,
                                                gl_shader_stage stage);
 unsigned iris_get_shader_num_ubos(const struct iris_context *ice,
                                   gl_shader_stage stage);
-
+uint32_t iris_get_scratch_space(struct iris_context *ice,
+                                unsigned per_thread_scratch,
+                                gl_shader_stage stage);
 
 /* iris_program_cache.c */
 
index fc874c4c08f10a00f44a29b4d9a125de0b52a828..626098c6a7aca4fe4401c782db7b500520ae6d2a 100644 (file)
@@ -1072,6 +1072,56 @@ iris_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data,
       dst[8 * t] = t;
 }
 
+/**
+ * Return the "Scratch Space Base Pointer" for a per-thread size and stage.
+ *
+ * The BO is allocated on first use and cached in ice->shaders.scratch_bos,
+ * indexed by the size's "Per-Thread Scratch Space" encoding and the stage.
+ */
+uint32_t
+iris_get_scratch_space(struct iris_context *ice,
+                       unsigned per_thread_scratch,
+                       gl_shader_stage stage)
+{
+   struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen;
+   struct iris_bufmgr *bufmgr = screen->bufmgr;
+   const struct gen_device_info *devinfo = &screen->devinfo;
+
+   /* scratch_bos has 1 << 4 rows (4-bit encoding); the index must fit. */
+   unsigned encoded_size = ffs(per_thread_scratch) - 11;
+   assert(encoded_size < (1 << 4));
+   struct iris_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage];
+
+   /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
+    *
+    * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
+    *  allocate scratch space enough so that each slice has 4 slices
+    *  allowed."
+    *
+    * According to the other driver team, this applies to compute shaders
+    * as well.  This is not currently documented at all.
+    */
+   unsigned subslice_total = 4 * devinfo->num_slices;
+   assert(subslice_total >= screen->subslice_total);
+
+   if (!*bop) {
+      unsigned scratch_ids_per_subslice = devinfo->max_cs_threads;
+      uint32_t max_threads[] = {
+         [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
+         [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
+         [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
+         [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
+         [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
+         [MESA_SHADER_COMPUTE]   = scratch_ids_per_subslice * subslice_total,
+      };
+      uint32_t size = per_thread_scratch * max_threads[stage];
+
+      *bop = iris_bo_alloc(bufmgr, "scratch", size, IRIS_MEMZONE_SHADER);
+   }
+
+   return (*bop)->gtt_offset;
+}
+
 void
 iris_init_program_functions(struct pipe_context *ctx)
 {
index f5c6fc267945b35d6b79457b45716cc9bf923e88..9f500097d52ca1998067e591909a8f0a15fc498b 100644 (file)
@@ -241,8 +241,6 @@ iris_upload_shader(struct iris_context *ice,
                    struct brw_stage_prog_data *prog_data,
                    uint32_t *streamout)
 {
-   struct iris_screen *screen = (void *) ice->ctx.screen;
-   struct gen_device_info *devinfo = &screen->devinfo;
    struct hash_table *cache = ice->shaders.cache;
    struct iris_compiled_shader *shader =
       rzalloc_size(cache, sizeof(struct iris_compiled_shader) +
@@ -277,7 +275,7 @@ iris_upload_shader(struct iris_context *ice,
    ralloc_steal(shader, shader->streamout);
 
    /* Store the 3DSTATE shader packets and other derived state. */
-   ice->vtbl.store_derived_program_state(devinfo, cache_id, shader);
+   ice->vtbl.store_derived_program_state(ice, cache_id, shader);
 
    struct keybox *keybox = make_keybox(cache, cache_id, key, key_size);
    _mesa_hash_table_insert(ice->shaders.cache, keybox, shader);
index 3c668cac980e67194723a5b3f6ea0989b541af0a..3d2af86c5443c41668e62f09d803c720b12ddd30 100644 (file)
@@ -2981,8 +2981,6 @@ iris_populate_cs_key(const struct iris_context *ice,
    // XXX: these need to go in INIT_THREAD_DISPATCH_FIELDS
    pkt.SamplerCount =                                                     \
       DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
-   pkt.PerThreadScratchSpace = prog_data->total_scratch == 0 ? 0 :        \
-      ffs(stage_state->per_thread_scratch) - 11;                          \
 
 #endif
 
@@ -2997,7 +2995,7 @@ KSP(const struct iris_compiled_shader *shader)
 // prefetching of binding tables in A0 and B0 steppings.  XXX: Revisit
 // this WA on C0 stepping.
 
-#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix)                          \
+#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                   \
    pkt.KernelStartPointer = KSP(shader);                                  \
    pkt.BindingTableEntryCount = GEN_GEN == 11 ? 0 :                       \
       prog_data->binding_table.size_bytes / 4;                            \
@@ -3009,20 +3007,28 @@ KSP(const struct iris_compiled_shader *shader)
    pkt.prefix##URBEntryReadOffset = 0;                                    \
                                                                           \
    pkt.StatisticsEnable = true;                                           \
-   pkt.Enable           = true;
+   pkt.Enable           = true;                                           \
+                                                                          \
+   if (prog_data->total_scratch) {                                        \
+      uint32_t scratch_addr =                                             \
+         iris_get_scratch_space(ice, prog_data->total_scratch, stage);    \
+      pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;     \
+      pkt.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr);            \
+   }
 
 /**
  * Encode most of 3DSTATE_VS based on the compiled shader.
  */
 static void
-iris_store_vs_state(const struct gen_device_info *devinfo,
+iris_store_vs_state(struct iris_context *ice,
+                    const struct gen_device_info *devinfo,
                     struct iris_compiled_shader *shader)
 {
    struct brw_stage_prog_data *prog_data = shader->prog_data;
    struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
 
    iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
-      INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
+      INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
       vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
       vs.SIMD8DispatchEnable = true;
       vs.UserClipDistanceCullTestEnableBitmask =
@@ -3034,7 +3040,8 @@ iris_store_vs_state(const struct gen_device_info *devinfo,
  * Encode most of 3DSTATE_HS based on the compiled shader.
  */
 static void
-iris_store_tcs_state(const struct gen_device_info *devinfo,
+iris_store_tcs_state(struct iris_context *ice,
+                     const struct gen_device_info *devinfo,
                      struct iris_compiled_shader *shader)
 {
    struct brw_stage_prog_data *prog_data = shader->prog_data;
@@ -3042,7 +3049,7 @@ iris_store_tcs_state(const struct gen_device_info *devinfo,
    struct brw_tcs_prog_data *tcs_prog_data = (void *) prog_data;
 
    iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
-      INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
+      INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
 
       hs.InstanceCount = tcs_prog_data->instances - 1;
       hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
@@ -3054,7 +3061,8 @@ iris_store_tcs_state(const struct gen_device_info *devinfo,
  * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
  */
 static void
-iris_store_tes_state(const struct gen_device_info *devinfo,
+iris_store_tes_state(struct iris_context *ice,
+                     const struct gen_device_info *devinfo,
                      struct iris_compiled_shader *shader)
 {
    struct brw_stage_prog_data *prog_data = shader->prog_data;
@@ -3074,7 +3082,7 @@ iris_store_tes_state(const struct gen_device_info *devinfo,
    }
 
    iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
-      INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
+      INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
 
       ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
       ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
@@ -3091,7 +3099,8 @@ iris_store_tes_state(const struct gen_device_info *devinfo,
  * Encode most of 3DSTATE_GS based on the compiled shader.
  */
 static void
-iris_store_gs_state(const struct gen_device_info *devinfo,
+iris_store_gs_state(struct iris_context *ice,
+                    const struct gen_device_info *devinfo,
                     struct iris_compiled_shader *shader)
 {
    struct brw_stage_prog_data *prog_data = shader->prog_data;
@@ -3099,7 +3108,7 @@ iris_store_gs_state(const struct gen_device_info *devinfo,
    struct brw_gs_prog_data *gs_prog_data = (void *) prog_data;
 
    iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
-      INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
+      INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
 
       gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
       gs.OutputTopology = gs_prog_data->output_topology;
@@ -3138,7 +3147,8 @@ iris_store_gs_state(const struct gen_device_info *devinfo,
  * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
  */
 static void
-iris_store_fs_state(const struct gen_device_info *devinfo,
+iris_store_fs_state(struct iris_context *ice,
+                    const struct gen_device_info *devinfo,
                     struct iris_compiled_shader *shader)
 {
    struct brw_stage_prog_data *prog_data = shader->prog_data;
@@ -3193,6 +3203,14 @@ iris_store_fs_state(const struct gen_device_info *devinfo,
          KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
       ps.KernelStartPointer2 =
          KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
+
+      if (prog_data->total_scratch) {
+         uint32_t scratch_addr =
+            iris_get_scratch_space(ice, prog_data->total_scratch,
+                                   MESA_SHADER_FRAGMENT);
+         ps.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
+         ps.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr);
+      }
    }
 
    iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
@@ -3226,7 +3244,8 @@ iris_store_fs_state(const struct gen_device_info *devinfo,
  * This must match the data written by the iris_store_xs_state() functions.
  */
 static void
-iris_store_cs_state(const struct gen_device_info *devinfo,
+iris_store_cs_state(struct iris_context *ice,
+                    const struct gen_device_info *devinfo,
                     struct iris_compiled_shader *shader)
 {
    struct brw_stage_prog_data *prog_data = shader->prog_data;
@@ -3271,28 +3290,31 @@ iris_derived_program_state_size(enum iris_program_cache_id cache_id)
  * get most of the state packet without having to reconstruct it.
  */
 static void
-iris_store_derived_program_state(const struct gen_device_info *devinfo,
+iris_store_derived_program_state(struct iris_context *ice,
                                  enum iris_program_cache_id cache_id,
                                  struct iris_compiled_shader *shader)
 {
+   struct iris_screen *screen = (void *) ice->ctx.screen;
+   const struct gen_device_info *devinfo = &screen->devinfo;
+
    switch (cache_id) {
    case IRIS_CACHE_VS:
-      iris_store_vs_state(devinfo, shader);
+      iris_store_vs_state(ice, devinfo, shader);
       break;
    case IRIS_CACHE_TCS:
-      iris_store_tcs_state(devinfo, shader);
+      iris_store_tcs_state(ice, devinfo, shader);
       break;
    case IRIS_CACHE_TES:
-      iris_store_tes_state(devinfo, shader);
+      iris_store_tes_state(ice, devinfo, shader);
       break;
    case IRIS_CACHE_GS:
-      iris_store_gs_state(devinfo, shader);
+      iris_store_gs_state(ice, devinfo, shader);
       break;
    case IRIS_CACHE_FS:
-      iris_store_fs_state(devinfo, shader);
+      iris_store_fs_state(ice, devinfo, shader);
       break;
    case IRIS_CACHE_CS:
-      iris_store_cs_state(devinfo, shader);
+      iris_store_cs_state(ice, devinfo, shader);
    case IRIS_CACHE_BLORP:
       break;
    default:
@@ -4401,12 +4423,11 @@ iris_upload_compute_state(struct iris_context *ice,
 
       iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
          if (prog_data->total_scratch) {
-            /* Per Thread Scratch Space is in the range [0, 11] where
-             * 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
-             */
-            // XXX: vfe.ScratchSpaceBasePointer
-            //vfe.PerThreadScratchSpace =
-               //ffs(stage_state->per_thread_scratch) - 11;
+            uint32_t scratch_addr =
+               iris_get_scratch_space(ice, prog_data->total_scratch,
+                                      MESA_SHADER_COMPUTE);
+            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
+            vfe.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr);
          }
 
          vfe.MaximumNumberofThreads =