aco: Fix workgroup size calculation.

author Timur Kristóf <timur.kristof@gmail.com>

Thu, 12 Mar 2020 15:28:48 +0000 (16:28 +0100)

committer Marge Bot <eric+marge@anholt.net>

Mon, 30 Mar 2020 13:09:08 +0000 (13:09 +0000)
author Timur Kristóf <timur.kristof@gmail.com>
Thu, 12 Mar 2020 15:28:48 +0000 (16:28 +0100)
committer Marge Bot <eric+marge@anholt.net>
Mon, 30 Mar 2020 13:09:08 +0000 (13:09 +0000)
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp

index 254eb97d1510c2038e9155115c6557b61cf61ccb..09556d232b520f269a4ee51892cde47c2b436ef4 100644 (file)
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -403,17 +403,12 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
     }
  
     if (instr->format == Format::PSEUDO_BARRIER) {
-      uint32_t workgroup_size = UINT32_MAX;
-      if (ctx.program->stage & sw_cs) {
-         unsigned* bsize = ctx.program->info->cs.block_size;
-         workgroup_size = bsize[0] * bsize[1] * bsize[2];
-      }
        switch (instr->opcode) {
        case aco_opcode::p_memory_barrier_common:
           imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
           imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
           imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
-         if (workgroup_size > ctx.program->wave_size)
+         if (ctx.program->workgroup_size > ctx.program->wave_size)
              imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
           break;
        case aco_opcode::p_memory_barrier_atomic:
@@ -426,7 +421,7 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
           imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
           break;
        case aco_opcode::p_memory_barrier_shared:
-         if (workgroup_size > ctx.program->wave_size)
+         if (ctx.program->workgroup_size > ctx.program->wave_size)
              imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
           break;
        case aco_opcode::p_memory_barrier_gs_data:
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index 4ec971e4d6ceed93c055502a7b67a32a23cb0705..c2da6d6e2389a42febc22cc4ce17ef1f48c87a77 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -6827,22 +6827,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
           break;
        }
  
-      if (ctx->shader->info.stage == MESA_SHADER_COMPUTE) {
-         unsigned* bsize = ctx->program->info->cs.block_size;
-         unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
-         if (workgroup_size > ctx->program->wave_size)
-            bld.sopp(aco_opcode::s_barrier);
-      } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
-         /* For each patch provided during rendering, n TCS shader invocations will be processed,
-          * where n is the number of vertices in the output patch.
-          */
-         unsigned workgroup_size = ctx->tcs_num_patches * ctx->shader->info.tess.tcs_vertices_out;
-         if (workgroup_size > ctx->program->wave_size)
-            bld.sopp(aco_opcode::s_barrier);
-      } else {
-         /* We don't know the workgroup size, so always emit the s_barrier. */
+      if (ctx->program->workgroup_size > ctx->program->wave_size)
           bld.sopp(aco_opcode::s_barrier);
-      }
  
        break;
     }
@@ -9374,8 +9360,7 @@ static void write_tcs_tess_factors(isel_context *ctx)
     Builder bld(ctx->program, ctx->block);
  
     bld.barrier(aco_opcode::p_memory_barrier_shared);
-   unsigned workgroup_size = ctx->tcs_num_patches * ctx->shader->info.tess.tcs_vertices_out;
-   if (unlikely(ctx->program->chip_class != GFX6 && workgroup_size > ctx->program->wave_size))
+   if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
        bld.sopp(aco_opcode::s_barrier);
  
     Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp

index 75f1f9b48811411c76101a32896ef8c0e67a02e7..bd90dcae83d9b5c83b866577a6711510a2448897 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -1238,22 +1238,45 @@ setup_isel_context(Program* program,
        program->sgpr_limit = 104;
     }
  
-   calc_min_waves(program);
-   program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
-   program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
-
     isel_context ctx = {};
     ctx.program = program;
     ctx.args = args;
     ctx.options = args->options;
     ctx.stage = program->stage;
  
-   if (ctx.stage == tess_control_hs) {
+   /* TODO: Check if we need to adjust min_waves for unknown workgroup sizes. */
+   if (program->stage & (hw_vs | hw_fs)) {
+      /* PS and legacy VS have separate waves, no workgroups */
+      program->workgroup_size = program->wave_size;
+   } else if (program->stage == compute_cs) {
+      /* CS sets the workgroup size explicitly */
+      unsigned* bsize = program->info->cs.block_size;
+      program->workgroup_size = bsize[0] * bsize[1] * bsize[2];
+   } else if ((program->stage & hw_es) || program->stage == geometry_gs) {
+      /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 (not implemented in Mesa)  */
+      program->workgroup_size = program->wave_size;
+   } else if (program->stage & hw_gs) {
+      /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */
+      program->workgroup_size = UINT_MAX; /* TODO: set by VGT_GS_ONCHIP_CNTL, which is not plumbed to ACO */
+   } else if (program->stage == vertex_ls) {
+      /* Unmerged LS operates in workgroups */
+      program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but those are not plumbed to ACO for LS */
+   } else if (program->stage == tess_control_hs) {
+      /* Unmerged HS operates in workgroups, size is determined by the output vertices */
        setup_tcs_info(&ctx, shaders[0]);
-   } else if (ctx.stage == vertex_tess_control_hs) {
+      program->workgroup_size = ctx.tcs_num_patches * shaders[0]->info.tess.tcs_vertices_out;
+   } else if (program->stage == vertex_tess_control_hs) {
+      /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS invocations */
        setup_tcs_info(&ctx, shaders[1]);
+      program->workgroup_size = ctx.tcs_num_patches * MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices);
+   } else {
+      unreachable("Unsupported shader stage.");
     }
  
+   calc_min_waves(program);
+   program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
+   program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
+
     get_io_masks(&ctx, shader_count, shaders);
  
     unsigned scratch_size = 0;
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h

index 0be646d8b0f834c4153c4e8dbc0da7c220c6adfc..73a1d394eff509bfc08cd179745d66c999d6291c 100644 (file)
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1250,6 +1250,7 @@ public:
     uint16_t physical_sgprs;
     uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
     uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
+   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
  
     bool needs_vcc = false;
     bool needs_xnack_mask = false;
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp

index d4383cf588745abb4fdca4a2eb2461417879fd8d..e223d6d5f841070f1f3f33ad0cfa153c76256add 100644 (file)
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -289,11 +289,11 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
  
  unsigned calc_waves_per_workgroup(Program *program)
  {
-   unsigned workgroup_size = program->wave_size;
-   if (program->stage == compute_cs) {
-      unsigned* bsize = program->info->cs.block_size;
-      workgroup_size = bsize[0] * bsize[1] * bsize[2];
-   }
+   /* When workgroup size is not known, just go with wave_size */
+   unsigned workgroup_size = program->workgroup_size == UINT_MAX
+                             ? program->wave_size
+                             : program->workgroup_size;
+
     return align(workgroup_size, program->wave_size) / program->wave_size;
  }
  } /* end namespace */
author	Timur Kristóf <timur.kristof@gmail.com>
	Thu, 12 Mar 2020 15:28:48 +0000 (16:28 +0100)
committer	Marge Bot <eric+marge@anholt.net>
	Mon, 30 Mar 2020 13:09:08 +0000 (13:09 +0000)
src/amd/compiler/aco_insert_waitcnt.cpp		patch | blob | history
src/amd/compiler/aco_instruction_selection.cpp		patch | blob | history
src/amd/compiler/aco_instruction_selection_setup.cpp		patch | blob | history
src/amd/compiler/aco_ir.h		patch | blob | history
src/amd/compiler/aco_live_var_analysis.cpp		patch | blob | history