radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
pipeline->graphics.prim_restart_enable);
+ cmd_buffer->scratch_size_needed =
+ MAX2(cmd_buffer->scratch_size_needed,
+ pipeline->max_waves * pipeline->scratch_bytes_per_wave);
+
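+	/* SPI_TMPRING_SIZE.WAVESIZE is in units of 256 dwords (1 KiB), hence
+	 * the >> 10 on the per-wave byte count. */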
+ radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE,
+ S_0286E8_WAVES(pipeline->max_waves) |
+ S_0286E8_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
cmd_buffer->state.emitted_pipeline = pipeline;
}
free(up);
}
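+	/* Reset the scratch requirements for this recording; they grow again
+	 * as pipelines are bound. */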
+ cmd_buffer->scratch_size_needed = 0;
+ cmd_buffer->compute_scratch_size_needed = 0;
if (cmd_buffer->upload.upload_bo)
cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
cmd_buffer->upload.upload_bo, 8);
radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
+
+ cmd_buffer->compute_scratch_size_needed =
+ MAX2(cmd_buffer->compute_scratch_size_needed,
+ pipeline->max_waves * pipeline->scratch_bytes_per_wave);
+
-	/* change these once we have scratch support */
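+	/* Like SPI_TMPRING_SIZE, WAVESIZE here is in 256-dword (1 KiB)
+	 * units. */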
radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
- S_00B860_WAVES(32) | S_00B860_WAVESIZE(0));
+ S_00B860_WAVES(pipeline->max_waves) |
+ S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
radeon_emit(cmd_buffer->cs,
for (uint32_t i = 0; i < commandBufferCount; i++) {
RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
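+		/* Propagate the secondaries' scratch requirements to the
+		 * primary so that enough scratch is allocated when the
+		 * primary is submitted. */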
+ primary->scratch_size_needed = MAX2(primary->scratch_size_needed,
+ secondary->scratch_size_needed);
+ primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
+ secondary->compute_scratch_size_needed);
+
primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
}
}
}
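+	/* Spilling needs compiler-side support; it is assumed here that only
+	 * LLVM >= 4.0 can spill correctly for radv. */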
+#if HAVE_LLVM < 0x0400
+ device->llvm_supports_spill = false;
+#else
+ device->llvm_supports_spill = true;
+#endif
+
+ /* The maximum number of scratch waves. Scratch space isn't divided
+ * evenly between CUs. The number is only a function of the number of CUs.
+ * We can decrease the constant to decrease the scratch buffer size.
+ *
+	 * scratch_waves must be >= the maximum possible size of
+ * 1 threadgroup, so that the hw doesn't hang from being unable
+ * to start any.
+ *
+ * The recommended value is 4 per CU at most. Higher numbers don't
+ * bring much benefit, but they still occupy chip resources (think
+ * async compute). I've seen ~2% performance difference between 4 and 32.
+ */
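+	/* For example: a 36-CU part gets 32 * 36 = 1152 scratch waves, while
+	 * the MAX2 lower bound of 2048 / 64 = 32 waves only matters for very
+	 * small parts. The 2048 is assumed to match the maximum threadgroup
+	 * size we advertise. */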
+ uint32_t max_threads_per_block = 2048;
+ device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units,
+ max_threads_per_block / 64);
+
result = radv_device_init_meta(device);
if (result != VK_SUCCESS)
goto fail;
vk_free2(&device->alloc, pAllocator, module);
}
+
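+/* Factored out of radv_DestroyPipeline so that pipeline creation error
+ * paths below can free the shader variants too, not just the pipeline
+ * struct. */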
+static void
+radv_pipeline_destroy(struct radv_device *device,
+ struct radv_pipeline *pipeline,
+ const VkAllocationCallbacks* allocator)
+{
+ for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
+ if (pipeline->shaders[i])
+ radv_shader_variant_destroy(device, pipeline->shaders[i]);
+
+ vk_free2(&device->alloc, allocator, pipeline);
+}
+
void radv_DestroyPipeline(
VkDevice _device,
VkPipeline _pipeline,
if (!_pipeline)
return;
- for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
- if (pipeline->shaders[i])
- radv_shader_variant_destroy(device, pipeline->shaders[i]);
-
- vk_free2(&device->alloc, pAllocator, pipeline);
+ radv_pipeline_destroy(device, pipeline, pAllocator);
}
return variant;
}
+
+static VkResult
+radv_pipeline_scratch_init(struct radv_device *device,
+ struct radv_pipeline *pipeline)
+{
+ unsigned scratch_bytes_per_wave = 0;
+ unsigned max_waves = 0;
+ unsigned min_waves = 1;
+
+ for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
+ if (pipeline->shaders[i]) {
+ unsigned max_stage_waves = device->scratch_waves;
+
+ scratch_bytes_per_wave = MAX2(scratch_bytes_per_wave,
+ pipeline->shaders[i]->config.scratch_bytes_per_wave);
+
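+			/* A GCN CU has 4 SIMDs, each with a 256-entry VGPR
+			 * file, so a shader using num_vgprs VGPRs limits each
+			 * CU to roughly 4 * (256 / num_vgprs) waves in
+			 * flight. */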
+ max_stage_waves = MIN2(max_stage_waves,
+ 4 * device->physical_device->rad_info.num_good_compute_units *
+ (256 / pipeline->shaders[i]->config.num_vgprs));
+ max_waves = MAX2(max_waves, max_stage_waves);
+ }
+ }
+
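+	/* With 64-lane waves, one threadgroup needs group_size / 64 waves,
+	 * rounded up, and all of them must be able to run at once. */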
+ if (pipeline->shaders[MESA_SHADER_COMPUTE]) {
+ unsigned group_size = pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[0] *
+ pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[1] *
+ pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[2];
+ min_waves = MAX2(min_waves, round_up_u32(group_size, 64));
+ }
+
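+	/* Clamp so that max_waves * scratch_bytes_per_wave cannot overflow
+	 * 32 bits. */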
+ if (scratch_bytes_per_wave)
+ max_waves = MIN2(max_waves, 0xffffffffu / scratch_bytes_per_wave);
+
+ if (scratch_bytes_per_wave && max_waves < min_waves) {
+		/* Not really out of device memory yet, but the scratch
+		 * allocation would become unsatisfiable on first execution.
+		 * Fail now instead of letting shaders hang later. */
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ }
+ pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
+ pipeline->max_waves = max_waves;
+ return VK_SUCCESS;
+}
+
static uint32_t si_translate_blend_function(VkBlendOp op)
{
switch (op) {
const VkAllocationCallbacks *alloc)
{
struct radv_shader_module fs_m = {0};
+ VkResult result;
if (alloc == NULL)
alloc = &device->alloc;
radv_dump_pipeline_stats(device, pipeline);
}
- return VK_SUCCESS;
+ result = radv_pipeline_scratch_init(device, pipeline);
+ return result;
}
VkResult
result = radv_pipeline_init(pipeline, device, cache,
pCreateInfo, extra, pAllocator);
if (result != VK_SUCCESS) {
- vk_free2(&device->alloc, pAllocator, pipeline);
+ radv_pipeline_destroy(device, pipeline, pAllocator);
return result;
}
RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
RADV_FROM_HANDLE(radv_shader_module, module, pCreateInfo->stage.module);
struct radv_pipeline *pipeline;
+ VkResult result;
pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
pCreateInfo->stage.pSpecializationInfo,
pipeline->layout, NULL);
+
+ result = radv_pipeline_scratch_init(device, pipeline);
+ if (result != VK_SUCCESS) {
+ radv_pipeline_destroy(device, pipeline, pAllocator);
+ return result;
+ }
+
*pPipeline = radv_pipeline_to_handle(pipeline);
if (device->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) {