i965: Implement ARB_compute_variable_group_size
authorPlamena Manolova <plamena.n.manolova@gmail.com>
Mon, 12 Nov 2018 14:29:51 +0000 (16:29 +0200)
committerCaio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Fri, 10 Apr 2020 02:23:12 +0000 (19:23 -0700)
This patch implements ARB_compute_variable_group_size for i965. Since the
local group size is only known at dispatch time, we store it in a push
constant.
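
For reference, a minimal usage sketch of the extension this patch wires up
(names follow the ARB_compute_variable_group_size spec; compute_prog is a
hypothetical, already-linked program whose compute shader declares
"layout(local_size_variable) in;"):

    /* The group size is only known here, at dispatch time, which is why
     * the driver has to route it to the shader as a push constant. */
    glUseProgram(compute_prog);
    glDispatchComputeGroupSizeARB(64, 1, 1,  /* num_groups x/y/z  */
                                  8, 8, 1);  /* group_size x/y/z  */
    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);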

Additional changes made by Caio Marcelo de Oliveira Filho.

Signed-off-by: Plamena Manolova <plamena.manolova@intel.com>
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4504>

docs/features.txt
docs/relnotes/new_features.txt
src/mesa/drivers/dri/i965/brw_compute.c
src/mesa/drivers/dri/i965/brw_context.c
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_cs.c
src/mesa/drivers/dri/i965/brw_cs.h
src/mesa/drivers/dri/i965/gen6_constant_state.c
src/mesa/drivers/dri/i965/genX_state_upload.c
src/mesa/drivers/dri/i965/intel_extensions.c

index 4db525b9da861ad27f4f642dab8f08c5d8eabdbc..6394245a6df779dfcda9b0ceddc29434e47d87c0 100644 (file)
@@ -299,7 +299,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve
 
   GL_ARB_bindless_texture                               DONE (nvc0, radeonsi)
   GL_ARB_cl_event                                       not started
-  GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)
+  GL_ARB_compute_variable_group_size                    DONE (i965/gen7+, nvc0, radeonsi)
   GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+, radeonsi, virgl)
   GL_ARB_fragment_shader_interlock                      DONE (i965)
   GL_ARB_gpu_shader_int64                               DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)
index e31b7e6838563f670f90355ca788c29e2ae71699..416313997370c3460d9f99dc34e63e860134e72a 100644 (file)
@@ -1,3 +1,4 @@
+GL_ARB_compute_variable_group_size on i965.
 GL_EXT_texture_shadow_lod on radeonsi.
 GL_NV_copy_image on all gallium drivers.
 VK_KHR_shader_non_semantic_info on Intel, RADV.
index 09df08d7bc011c350fdbcef64e63df112bb7bd30..852db6dd64b192a6676849fb5da6e2f159b47c95 100644 (file)
@@ -101,6 +101,7 @@ brw_dispatch_compute(struct gl_context *ctx, const GLuint *num_groups) {
 
    brw->compute.num_work_groups_bo = NULL;
    brw->compute.num_work_groups = num_groups;
+   brw->compute.group_size = NULL;
    ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;
 
    brw_dispatch_compute_common(ctx);
@@ -120,6 +121,22 @@ brw_dispatch_compute_indirect(struct gl_context *ctx, GLintptr indirect)
    brw->compute.num_work_groups_bo = bo;
    brw->compute.num_work_groups_offset = indirect;
    brw->compute.num_work_groups = indirect_group_counts;
+   brw->compute.group_size = NULL;
+   ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;
+
+   brw_dispatch_compute_common(ctx);
+}
+
+static void
+brw_dispatch_compute_group_size(struct gl_context *ctx,
+                                const GLuint *num_groups,
+                                const GLuint *group_size)
+{
+   struct brw_context *brw = brw_context(ctx);
+
+   brw->compute.num_work_groups_bo = NULL;
+   brw->compute.num_work_groups = num_groups;
+   brw->compute.group_size = group_size;
    ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;
 
    brw_dispatch_compute_common(ctx);
@@ -130,4 +147,5 @@ brw_init_compute_functions(struct dd_function_table *functions)
 {
    functions->DispatchCompute = brw_dispatch_compute;
    functions->DispatchComputeIndirect = brw_dispatch_compute_indirect;
+   functions->DispatchComputeGroupSize = brw_dispatch_compute_group_size;
 }
index 5082657deacf3dc8f5156b7451f605a55a9ad5ad..e5b66a78d0c28e8771b5c9ce271e641d195ee00d 100644 (file)
@@ -843,6 +843,24 @@ brw_initialize_cs_context_constants(struct brw_context *brw)
    ctx->Const.MaxComputeWorkGroupSize[2] = max_invocations;
    ctx->Const.MaxComputeWorkGroupInvocations = max_invocations;
    ctx->Const.MaxComputeSharedMemorySize = 64 * 1024;
+
+   /* Constants used for ARB_compute_variable_group_size.  The compiler will
+    * use the maximum to decide which SIMD sizes can be used.  If we capped
+    * this at max_invocations, SIMD8 / SIMD16 could never be considered.
+    *
+    * TODO: To avoid the trade-off above between the lower maximum and
+    * always using SIMD32, keep all three shader variants (one per SIMD
+    * size) and select a suitable one at dispatch time.
+    */
+   if (devinfo->gen >= 7) {
+      const uint32_t max_var_invocations =
+         (max_threads >= 64 ? 8 : (max_threads >= 32 ? 16 : 32)) * max_threads;
+      assert(max_var_invocations >= 512);
+      ctx->Const.MaxComputeVariableGroupSize[0] = max_var_invocations;
+      ctx->Const.MaxComputeVariableGroupSize[1] = max_var_invocations;
+      ctx->Const.MaxComputeVariableGroupSize[2] = max_var_invocations;
+      ctx->Const.MaxComputeVariableGroupInvocations = max_var_invocations;
+   }
 }
 
 /**
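
To make the arithmetic above concrete, a small standalone C sketch (the
max_threads values are invented for illustration; the real one comes from
the device info):

    #include <assert.h>
    #include <stdint.h>

    /* Same formula as above: assume the narrowest SIMD width with which
     * the hardware thread count still reaches 512 invocations. */
    static uint32_t max_var_invocations(uint32_t max_threads)
    {
       return (max_threads >= 64 ? 8 : (max_threads >= 32 ? 16 : 32)) *
              max_threads;
    }

    int main(void)
    {
       assert(max_var_invocations(64) == 512); /* SIMD8  x 64 threads */
       assert(max_var_invocations(36) == 576); /* SIMD16 x 36 threads */
       assert(max_var_invocations(16) == 512); /* SIMD32 x 16 threads */
       return 0;
    }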
index 36dc53cc9d5b3b6baf9fe97aad613e9c08180e88..6f731f2371227a802e0a4931bb97d06a3288be3a 100644 (file)
@@ -925,6 +925,11 @@ struct brw_context
       struct brw_bo *num_work_groups_bo;
       GLintptr num_work_groups_offset;
       const GLuint *num_work_groups;
+      /**
+       * This is only used alongside ARB_compute_variable_group_size when the
+       * local work group size is variable; otherwise it is NULL.
+       */
+      const GLuint *group_size;
    } compute;
 
    struct {
index cf9340f15b6aaf15cab18ffaada0f7fbe578d2f5..5b4ff456a0dc92d98b94853d3c271249d06abcd2 100644 (file)
 #include "brw_program.h"
 #include "compiler/glsl/ir_uniform.h"
 
+uint32_t
+brw_cs_group_size(const struct brw_context *brw)
+{
+   assert(brw->cs.base.prog_data);
+   struct brw_cs_prog_data *cs_prog_data =
+      brw_cs_prog_data(brw->cs.base.prog_data);
+
+   if (brw->compute.group_size) {
+      /* With ARB_compute_variable_group_size the group size is set at
+       * dispatch time, so we can't use the one provided by the compiler.
+       */
+      return brw->compute.group_size[0] *
+             brw->compute.group_size[1] *
+             brw->compute.group_size[2];
+   } else {
+      return cs_prog_data->local_size[0] *
+             cs_prog_data->local_size[1] *
+             cs_prog_data->local_size[2];
+   }
+}
+
 static void
 assign_cs_binding_table_offsets(const struct gen_device_info *devinfo,
                                 const struct gl_program *prog,
@@ -58,6 +79,7 @@ brw_codegen_cs_prog(struct brw_context *brw,
    struct brw_cs_prog_data prog_data;
    bool start_busy = false;
    double start_time = 0;
+   struct gl_context *gl_ctx = &brw->ctx;
    nir_shader *nir = nir_shader_clone(mem_ctx, cp->program.nir);
 
    memset(&prog_data, 0, sizeof(prog_data));
@@ -88,6 +110,17 @@ brw_codegen_cs_prog(struct brw_context *brw,
    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
       st_index = brw_get_shader_time_index(brw, &cp->program, ST_CS, true);
 
+   /* If the work group size is variable, we set it to the maximum here,
+    * since the actual size is not known until the dispatch command is
+    * issued.
+    */
+   if (nir->info.cs.local_size_variable) {
+      prog_data.uses_variable_group_size = true;
+      nir->info.cs.max_variable_local_size =
+         gl_ctx->Const.MaxComputeVariableGroupInvocations;
+   } else {
+      prog_data.uses_variable_group_size = false;
+   }
+
    char *error_str;
    program = brw_compile_cs(brw->screen->compiler, brw, mem_ctx, key,
                             &prog_data, nir, st_index, NULL, &error_str);
index a0d43abaae4b29cce6cd3b772f6168a390baadc0..9b0262000b6440d331794870577216c6a10d4e5c 100644 (file)
@@ -29,6 +29,9 @@
 extern "C" {
 #endif
 
+uint32_t
+brw_cs_group_size(const struct brw_context *brw);
+
 void
 brw_upload_cs_prog(struct brw_context *brw);
 
index 50e34fc9c8f8664654cfccf4922daa85701b1909..a4d82884b0181b3b099dae903809245080974d29 100644 (file)
@@ -22,6 +22,7 @@
  */
 
 #include "brw_context.h"
+#include "brw_cs.h"
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_program.h"
@@ -62,6 +63,10 @@ brw_param_value(struct brw_context *brw,
          return f_as_u32(ctx->TessCtrlProgram.patch_default_inner_level[0]);
       } else if (param == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
          return f_as_u32(ctx->TessCtrlProgram.patch_default_inner_level[1]);
+      } else if (param >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
+                 param <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
+         unsigned i = param - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
+         return brw->compute.group_size[i];
       } else {
          unreachable("Invalid param builtin");
       }
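
These three builtins are what back the shader's gl_LocalGroupSizeARB value;
brw_param_value resolves them from the size recorded at dispatch, relying on
the X/Y/Z enum values being consecutive. A toy model of that lookup (enum
values invented; the real ones live in the driver headers):

    #include <assert.h>

    enum { WG_SIZE_X = 100, WG_SIZE_Y, WG_SIZE_Z };

    static unsigned resolve(const unsigned group_size[3], int param)
    {
       /* The offset from the X builtin selects the component. */
       return group_size[param - WG_SIZE_X];
    }

    int main(void)
    {
       const unsigned gs[3] = { 8, 8, 1 }; /* as supplied at dispatch */
       assert(resolve(gs, WG_SIZE_Y) == 8);
       assert(resolve(gs, WG_SIZE_Z) == 1);
       return 0;
    }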
@@ -303,8 +308,11 @@ brw_upload_cs_push_constants(struct brw_context *brw,
    /* XXX: Should this happen somewhere before to get our state flag set? */
    _mesa_load_state_parameters(ctx, prog->Parameters);
 
+   const unsigned threads =
+      DIV_ROUND_UP(brw_cs_group_size(brw), cs_prog_data->simd_size);
    const unsigned push_const_size =
-      brw_cs_push_const_total_size(cs_prog_data, cs_prog_data->threads);
+      brw_cs_push_const_total_size(cs_prog_data, threads);
+
    if (push_const_size == 0) {
       stage_state->push_const_size = 0;
       return;
@@ -330,7 +338,7 @@ brw_upload_cs_push_constants(struct brw_context *brw,
    }
 
    if (cs_prog_data->push.per_thread.size > 0) {
-      for (unsigned t = 0; t < cs_prog_data->threads; t++) {
+      for (unsigned t = 0; t < threads; t++) {
          unsigned dst =
             8 * (cs_prog_data->push.per_thread.regs * t +
                  cs_prog_data->push.cross_thread.regs);
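
A worked example of the thread count above (numbers invented): a
dispatch-time group of 8x8x1 = 64 invocations on a SIMD16 variant needs
DIV_ROUND_UP(64, 16) = 4 hardware threads, and the per-thread push constant
block is then replicated once per thread after the cross-thread block:

    #include <assert.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
       const unsigned group_size = 8 * 8 * 1; /* set at dispatch time */
       const unsigned simd_size  = 16;
       assert(DIV_ROUND_UP(group_size, simd_size) == 4);

       /* Groups that are not a multiple of the SIMD width round up:
        * a 10x1x1 group still occupies one full SIMD16 thread. */
       assert(DIV_ROUND_UP(10, simd_size) == 1);
       return 0;
    }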
index fed5eda8e483f4edc80df8667c55319b9c13bcf9..53c570742793ce5c75650c99bb43e02e520aca62 100644 (file)
@@ -37,6 +37,7 @@
 #include "genX_boilerplate.h"
 
 #include "brw_context.h"
+#include "brw_cs.h"
 #include "brw_draw.h"
 #include "brw_multisample_state.h"
 #include "brw_state.h"
@@ -4263,6 +4264,12 @@ genX(upload_cs_state)(struct brw_context *brw)
    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
+   const unsigned threads =
+      DIV_ROUND_UP(brw_cs_group_size(brw), cs_prog_data->simd_size);
+
+   if (!cs_prog_data->uses_variable_group_size)
+      assert(cs_prog_data->threads == threads);
+
    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
       brw_emit_buffer_surface_state(
          brw, &stage_state->surf_offset[
@@ -4353,13 +4360,13 @@ genX(upload_cs_state)(struct brw_context *brw)
       vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 2 : 0;
 
       const uint32_t vfe_curbe_allocation =
-         ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
+         ALIGN(cs_prog_data->push.per_thread.regs * threads +
                cs_prog_data->push.cross_thread.regs, 2);
       vfe.CURBEAllocationSize = vfe_curbe_allocation;
    }
 
    const unsigned push_const_size =
-      brw_cs_push_const_total_size(cs_prog_data, cs_prog_data->threads);
+      brw_cs_push_const_total_size(cs_prog_data, threads);
    if (push_const_size > 0) {
       brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
          curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
@@ -4378,7 +4385,7 @@ genX(upload_cs_state)(struct brw_context *brw)
                       DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
       .BindingTablePointer = stage_state->bind_bo_offset,
       .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
-      .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
+      .NumberofThreadsinGPGPUThreadGroup = threads,
       .SharedLocalMemorySize = encode_slm_size(GEN_GEN,
                                                prog_data->total_shared),
       .BarrierEnable = cs_prog_data->uses_barrier,
@@ -4484,9 +4491,9 @@ genX(emit_gpgpu_walker)(struct brw_context *brw)
    if (indirect)
       prepare_indirect_gpgpu_walker(brw);
 
+   const unsigned group_size = brw_cs_group_size(brw);
    const unsigned simd_size = prog_data->simd_size;
-   unsigned group_size = prog_data->local_size[0] *
-      prog_data->local_size[1] * prog_data->local_size[2];
+   unsigned thread_width_max = DIV_ROUND_UP(group_size, simd_size);
 
    uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
    const unsigned right_non_aligned = group_size & (simd_size - 1);
@@ -4499,7 +4506,7 @@ genX(emit_gpgpu_walker)(struct brw_context *brw)
       ggw.SIMDSize                     = prog_data->simd_size / 16;
       ggw.ThreadDepthCounterMaximum    = 0;
       ggw.ThreadHeightCounterMaximum   = 0;
-      ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
+      ggw.ThreadWidthCounterMaximum    = thread_width_max - 1;
       ggw.ThreadGroupIDXDimension      = num_groups[0];
       ggw.ThreadGroupIDYDimension      = num_groups[1];
       ggw.ThreadGroupIDZDimension      = num_groups[2];
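
To illustrate the walker math with invented numbers: a group of 40
invocations on a SIMD32 variant occupies two threads, the first with all 32
channels live and the last with only 40 mod 32 = 8, which is what the
right-execution-mask computation above accounts for:

    #include <assert.h>
    #include <stdint.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
       const unsigned group_size = 40, simd_size = 32;

       /* ThreadWidthCounterMaximum is derived from this, minus one. */
       assert(DIV_ROUND_UP(group_size, simd_size) == 2);

       /* All channels enabled in fully occupied threads... */
       const uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
       assert(right_mask == 0xffffffffu);

       /* ...and only the remainder is live in the last one. */
       assert((group_size & (simd_size - 1)) == 8);
       return 0;
    }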
index 2bf6708a152a5ad7b1a5e6e67281b9adaccb6c91..276f7aa3d079ef052b1d6ab196baaf9f0e4ce04f 100644 (file)
@@ -271,6 +271,7 @@ intelInitExtensions(struct gl_context *ctx)
             ctx->Extensions.ARB_ES3_1_compatibility =
                devinfo->gen >= 8 || devinfo->is_haswell;
             ctx->Extensions.NV_compute_shader_derivatives = true;
+            ctx->Extensions.ARB_compute_variable_group_size = true;
          }
 
          if (can_do_predicate_writes(brw->screen)) {