radv: Do not change scratch settings while shaders are active.

author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>

Thu, 31 Oct 2019 21:53:30 +0000 (22:53 +0100)

committer Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>

Wed, 20 Nov 2019 01:18:36 +0000 (01:18 +0000)
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Thu, 31 Oct 2019 21:53:30 +0000 (22:53 +0100)
committer Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Wed, 20 Nov 2019 01:18:36 +0000 (01:18 +0000)
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c

index 87320e6d8222a95864d6bad319ecb5a5bc7b2361..a1c5d2b99b6b26a4e78eda4eccc319db31477a4b 100644 (file)
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -332,8 +332,10 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
         }
  
         cmd_buffer->push_constant_stages = 0;
-       cmd_buffer->scratch_size_needed = 0;
-       cmd_buffer->compute_scratch_size_needed = 0;
+       cmd_buffer->scratch_size_per_wave_needed = 0;
+       cmd_buffer->scratch_waves_wanted = 0;
+       cmd_buffer->compute_scratch_size_per_wave_needed = 0;
+       cmd_buffer->compute_scratch_waves_wanted = 0;
         cmd_buffer->esgs_ring_size_needed = 0;
         cmd_buffer->gsvs_ring_size_needed = 0;
         cmd_buffer->tess_rings_needed = false;
@@ -1147,9 +1149,10 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
         radv_update_multisample_state(cmd_buffer, pipeline);
         radv_update_binning_state(cmd_buffer, pipeline);
  
-       cmd_buffer->scratch_size_needed =
-                                 MAX2(cmd_buffer->scratch_size_needed,
-                                      pipeline->max_waves * pipeline->scratch_bytes_per_wave);
+       cmd_buffer->scratch_size_per_wave_needed = MAX2(cmd_buffer->scratch_size_per_wave_needed,
+                                                       pipeline->scratch_bytes_per_wave);
+       cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted,
+                                               pipeline->max_waves);
  
         if (!cmd_buffer->state.emitted_pipeline ||
             cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
@@ -3678,9 +3681,10 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
         radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
         radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
  
-       cmd_buffer->compute_scratch_size_needed =
-                                 MAX2(cmd_buffer->compute_scratch_size_needed,
-                                      pipeline->max_waves * pipeline->scratch_bytes_per_wave);
+       cmd_buffer->compute_scratch_size_per_wave_needed = MAX2(cmd_buffer->compute_scratch_size_per_wave_needed,
+                                                               pipeline->scratch_bytes_per_wave);
+       cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted,
+                                                       pipeline->max_waves);
  
         radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
                            pipeline->shaders[MESA_SHADER_COMPUTE]->bo);
@@ -4009,10 +4013,14 @@ void radv_CmdExecuteCommands(
         for (uint32_t i = 0; i < commandBufferCount; i++) {
                 RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
  
-               primary->scratch_size_needed = MAX2(primary->scratch_size_needed,
-                                                   secondary->scratch_size_needed);
-               primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
-                                                           secondary->compute_scratch_size_needed);
+               primary->scratch_size_per_wave_needed = MAX2(primary->scratch_size_per_wave_needed,
+                                                            secondary->scratch_size_per_wave_needed);
+               primary->scratch_waves_wanted = MAX2(primary->scratch_waves_wanted,
+                                                    secondary->scratch_waves_wanted);
+               primary->compute_scratch_size_per_wave_needed = MAX2(primary->compute_scratch_size_per_wave_needed,
+                                                                    secondary->compute_scratch_size_per_wave_needed);
+               primary->compute_scratch_waves_wanted = MAX2(primary->compute_scratch_waves_wanted,
+                                                            secondary->compute_scratch_waves_wanted);
  
                 if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
                         primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c

index 15cc163c1401f619604aa20c495352d7de4d3c06..5917c80488391a09ed3d22a8aa15e4cbe99a198f 100644 (file)
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -3138,9 +3138,28 @@ radv_emit_tess_factor_ring(struct radv_queue *queue, struct radeon_cmdbuf *cs,
         }
  }
  
+static void
+radv_emit_graphics_scratch(struct radv_queue *queue, struct radeon_cmdbuf *cs,
+                           uint32_t size_per_wave, uint32_t waves,
+                           struct radeon_winsys_bo *scratch_bo)
+{
+       if (queue->queue_family_index != RADV_QUEUE_GENERAL)
+               return;
+
+       if (!scratch_bo)
+               return;
+
+       radv_cs_add_buffer(queue->device->ws, cs, scratch_bo);
+
+       radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
+                              S_0286E8_WAVES(waves) |
+                              S_0286E8_WAVESIZE(round_up_u32(size_per_wave, 1024)));
+}
+
  static void
  radv_emit_compute_scratch(struct radv_queue *queue, struct radeon_cmdbuf *cs,
-                         struct radeon_winsys_bo *compute_scratch_bo)
+                          uint32_t size_per_wave, uint32_t waves,
+                          struct radeon_winsys_bo *compute_scratch_bo)
  {
         uint64_t scratch_va;
  
@@ -3155,6 +3174,10 @@ radv_emit_compute_scratch(struct radv_queue *queue, struct radeon_cmdbuf *cs,
         radeon_emit(cs, scratch_va);
         radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
                         S_008F04_SWIZZLE_ENABLE(1));
+
+       radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
+                        S_00B860_WAVES(waves) |
+                        S_00B860_WAVESIZE(round_up_u32(size_per_wave, 1024)));
  }
  
  static void
@@ -3235,8 +3258,10 @@ radv_init_compute_state(struct radeon_cmdbuf *cs, struct radv_queue *queue)
  
  static VkResult
  radv_get_preamble_cs(struct radv_queue *queue,
-                     uint32_t scratch_size,
-                     uint32_t compute_scratch_size,
+                    uint32_t scratch_size_per_wave,
+                    uint32_t scratch_waves,
+                    uint32_t compute_scratch_size_per_wave,
+                    uint32_t compute_scratch_waves,
                      uint32_t esgs_ring_size,
                      uint32_t gsvs_ring_size,
                      bool needs_tess_rings,
@@ -3280,8 +3305,22 @@ radv_get_preamble_cs(struct radv_queue *queue,
         tess_offchip_ring_size = max_offchip_buffers *
                 queue->device->tess_offchip_block_dw_size * 4;
  
-       if (scratch_size <= queue->scratch_size &&
-           compute_scratch_size <= queue->compute_scratch_size &&
+       scratch_size_per_wave = MAX2(scratch_size_per_wave, queue->scratch_size_per_wave);
+       if (scratch_size_per_wave)
+               scratch_waves = MIN2(scratch_waves, UINT32_MAX / scratch_size_per_wave);
+       else
+               scratch_waves = 0;
+
+       compute_scratch_size_per_wave = MAX2(compute_scratch_size_per_wave, queue->compute_scratch_size_per_wave);
+       if (compute_scratch_size_per_wave)
+               compute_scratch_waves = MIN2(compute_scratch_waves, UINT32_MAX / compute_scratch_size_per_wave);
+       else
+               compute_scratch_waves = 0;
+
+       if (scratch_size_per_wave <= queue->scratch_size_per_wave &&
+           scratch_waves <= queue->scratch_waves &&
+           compute_scratch_size_per_wave <= queue->compute_scratch_size_per_wave &&
+           compute_scratch_waves <= queue->compute_scratch_waves &&
             esgs_ring_size <= queue->esgs_ring_size &&
             gsvs_ring_size <= queue->gsvs_ring_size &&
             !add_tess_rings && !add_gds && !add_sample_positions &&
@@ -3289,13 +3328,16 @@ radv_get_preamble_cs(struct radv_queue *queue,
                 *initial_full_flush_preamble_cs = queue->initial_full_flush_preamble_cs;
                 *initial_preamble_cs = queue->initial_preamble_cs;
                 *continue_preamble_cs = queue->continue_preamble_cs;
-               if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size &&
-                   !needs_tess_rings && !needs_gds && !needs_sample_positions)
+               if (!scratch_size_per_wave && !compute_scratch_size_per_wave &&
+                   !esgs_ring_size && !gsvs_ring_size && !needs_tess_rings &&
+                   !needs_gds && !needs_sample_positions)
                         *continue_preamble_cs = NULL;
                 return VK_SUCCESS;
         }
  
-       if (scratch_size > queue->scratch_size) {
+       uint32_t scratch_size = scratch_size_per_wave * scratch_waves;
+       uint32_t queue_scratch_size = queue->scratch_size_per_wave * queue->scratch_waves;
+       if (scratch_size > queue_scratch_size) {
                 scratch_bo = queue->device->ws->buffer_create(queue->device->ws,
                                                               scratch_size,
                                                               4096,
@@ -3307,7 +3349,9 @@ radv_get_preamble_cs(struct radv_queue *queue,
         } else
                 scratch_bo = queue->scratch_bo;
  
-       if (compute_scratch_size > queue->compute_scratch_size) {
+       uint32_t compute_scratch_size = compute_scratch_size_per_wave * compute_scratch_waves;
+       uint32_t compute_queue_scratch_size = queue->compute_scratch_size_per_wave * queue->compute_scratch_waves;
+       if (compute_scratch_size > compute_queue_scratch_size) {
                 compute_scratch_bo = queue->device->ws->buffer_create(queue->device->ws,
                                                                       compute_scratch_size,
                                                                       4096,
@@ -3475,7 +3519,10 @@ radv_get_preamble_cs(struct radv_queue *queue,
                 radv_emit_tess_factor_ring(queue, cs, hs_offchip_param,
                                            tess_factor_ring_size, tess_rings_bo);
                 radv_emit_global_shader_pointers(queue, cs, descriptor_bo);
-               radv_emit_compute_scratch(queue, cs, compute_scratch_bo);
+               radv_emit_compute_scratch(queue, cs, compute_scratch_size_per_wave,
+                                         compute_scratch_waves, compute_scratch_bo);
+               radv_emit_graphics_scratch(queue, cs, scratch_size_per_wave,
+                                          scratch_waves, scratch_bo);
  
                 if (gds_bo)
                         radv_cs_add_buffer(queue->device->ws, cs, gds_bo);
@@ -3528,15 +3575,17 @@ radv_get_preamble_cs(struct radv_queue *queue,
                 if (queue->scratch_bo)
                         queue->device->ws->buffer_destroy(queue->scratch_bo);
                 queue->scratch_bo = scratch_bo;
-               queue->scratch_size = scratch_size;
         }
+       queue->scratch_size_per_wave = scratch_size_per_wave;
+       queue->scratch_waves = scratch_waves;
  
         if (compute_scratch_bo != queue->compute_scratch_bo) {
                 if (queue->compute_scratch_bo)
                         queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
                 queue->compute_scratch_bo = compute_scratch_bo;
-               queue->compute_scratch_size = compute_scratch_size;
         }
+       queue->compute_scratch_size_per_wave = compute_scratch_size_per_wave;
+       queue->compute_scratch_waves = compute_scratch_waves;
  
         if (esgs_ring_bo != queue->esgs_ring_bo) {
                 if (queue->esgs_ring_bo)
@@ -3832,8 +3881,8 @@ radv_get_preambles(struct radv_queue *queue,
                     struct radeon_cmdbuf **initial_preamble_cs,
                     struct radeon_cmdbuf **continue_preamble_cs)
  {
-       uint32_t scratch_size = 0;
-       uint32_t compute_scratch_size = 0;
+       uint32_t scratch_size_per_wave = 0, waves_wanted = 0;
+       uint32_t compute_scratch_size_per_wave = 0, compute_waves_wanted = 0;
         uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
         bool tess_rings_needed = false;
         bool gds_needed = false;
@@ -3843,9 +3892,12 @@ radv_get_preambles(struct radv_queue *queue,
                 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
                                  cmd_buffers[j]);
  
-               scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed);
-               compute_scratch_size = MAX2(compute_scratch_size,
-                                           cmd_buffer->compute_scratch_size_needed);
+               scratch_size_per_wave = MAX2(scratch_size_per_wave, cmd_buffer->scratch_size_per_wave_needed);
+               waves_wanted = MAX2(waves_wanted, cmd_buffer->scratch_waves_wanted);
+               compute_scratch_size_per_wave = MAX2(compute_scratch_size_per_wave,
+                                                    cmd_buffer->compute_scratch_size_per_wave_needed);
+               compute_waves_wanted = MAX2(compute_waves_wanted,
+                                           cmd_buffer->compute_scratch_waves_wanted);
                 esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
                 gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
                 tess_rings_needed |= cmd_buffer->tess_rings_needed;
@@ -3853,11 +3905,12 @@ radv_get_preambles(struct radv_queue *queue,
                 sample_positions_needed |= cmd_buffer->sample_positions_needed;
         }
  
-       return radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
-                                     esgs_ring_size, gsvs_ring_size, tess_rings_needed,
-                                     gds_needed, sample_positions_needed,
-                                     initial_full_flush_preamble_cs,
-                                     initial_preamble_cs, continue_preamble_cs);
+       return radv_get_preamble_cs(queue, scratch_size_per_wave, waves_wanted,
+                                   compute_scratch_size_per_wave, compute_waves_wanted,
+                                   esgs_ring_size, gsvs_ring_size, tess_rings_needed,
+                                   gds_needed, sample_positions_needed,
+                                   initial_full_flush_preamble_cs,
+                                   initial_preamble_cs, continue_preamble_cs);
  }
  
  struct radv_deferred_queue_submission {
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c

index 4579bf10e454131a31ec7c26d224ead05c2b9fc7..b04b24a13c12d9584936fe93f14d8a5492ac2f02 100644 (file)
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -180,7 +180,8 @@ radv_pipeline_scratch_init(struct radv_device *device,
         unsigned min_waves = 1;
  
         for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
-               if (pipeline->shaders[i]) {
+               if (pipeline->shaders[i] &&
+                   pipeline->shaders[i]->config.scratch_bytes_per_wave) {
                         unsigned max_stage_waves = device->scratch_waves;
  
                         scratch_bytes_per_wave = MAX2(scratch_bytes_per_wave,
@@ -200,14 +201,6 @@ radv_pipeline_scratch_init(struct radv_device *device,
                 min_waves = MAX2(min_waves, round_up_u32(group_size, 64));
         }
  
-       if (scratch_bytes_per_wave)
-               max_waves = MIN2(max_waves, 0xffffffffu / scratch_bytes_per_wave);
-
-       if (scratch_bytes_per_wave && max_waves < min_waves) {
-               /* Not really true at this moment, but will be true on first
-                * execution. Avoid having hanging shaders. */
-               return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
-       }
         pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
         pipeline->max_waves = max_waves;
         return VK_SUCCESS;
@@ -4481,10 +4474,6 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
         if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 && !radv_pipeline_has_ngg(pipeline))
                 gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline, tess);
  
-       radeon_set_context_reg(ctx_cs, R_0286E8_SPI_TMPRING_SIZE,
-                              S_0286E8_WAVES(pipeline->max_waves) |
-                              S_0286E8_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
-
         radeon_set_context_reg(ctx_cs, R_028B54_VGT_SHADER_STAGES_EN, radv_compute_vgt_shader_stages_en(pipeline));
  
         if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) {
@@ -5072,10 +5061,6 @@ radv_compute_generate_pm4(struct radv_pipeline *pipeline)
                 radeon_set_sh_reg(&pipeline->cs, R_00B8A0_COMPUTE_PGM_RSRC3, compute_shader->config.rsrc3);
         }
  
-       radeon_set_sh_reg(&pipeline->cs, R_00B860_COMPUTE_TMPRING_SIZE,
-                         S_00B860_WAVES(pipeline->max_waves) |
-                         S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
-
         /* Calculate best compute resource limits. */
         threads_per_threadgroup = compute_shader->info.cs.block_size[0] *
                                   compute_shader->info.cs.block_size[1] *
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h

index ea434ec16f89df0c029f0fe6e8672ce247cc0737..e4ea4d25635f64715f212ced0e1cbcaa31ceacef 100644 (file)
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -712,8 +712,10 @@ struct radv_queue {
         int queue_idx;
         VkDeviceQueueCreateFlags flags;
  
-       uint32_t scratch_size;
-       uint32_t compute_scratch_size;
+       uint32_t scratch_size_per_wave;
+       uint32_t scratch_waves;
+       uint32_t compute_scratch_size_per_wave;
+       uint32_t compute_scratch_waves;
         uint32_t esgs_ring_size;
         uint32_t gsvs_ring_size;
         bool has_tess_rings;
@@ -1309,8 +1311,10 @@ struct radv_cmd_buffer {
  
         struct radv_cmd_buffer_upload upload;
  
-       uint32_t scratch_size_needed;
-       uint32_t compute_scratch_size_needed;
+       uint32_t scratch_size_per_wave_needed;
+       uint32_t scratch_waves_wanted;
+       uint32_t compute_scratch_size_per_wave_needed;
+       uint32_t compute_scratch_waves_wanted;
         uint32_t esgs_ring_size_needed;
         uint32_t gsvs_ring_size_needed;
         bool tess_rings_needed;
author	Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
	Thu, 31 Oct 2019 21:53:30 +0000 (22:53 +0100)
committer	Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
	Wed, 20 Nov 2019 01:18:36 +0000 (01:18 +0000)
src/amd/vulkan/radv_cmd_buffer.c		patch | blob | history
src/amd/vulkan/radv_device.c		patch | blob | history
src/amd/vulkan/radv_pipeline.c		patch | blob | history
src/amd/vulkan/radv_private.h		patch | blob | history