From: Jason Ekstrand Date: Wed, 24 Apr 2019 08:02:35 +0000 (-0500) Subject: anv: Implement VK_KHR_pipeline_executable_properties X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=commitdiff_plain;h=d787a2d05e20642a7ec52ce80a830c7795a6bdc0 anv: Implement VK_KHR_pipeline_executable_properties Reviewed-by: Lionel Landwerlin --- diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index c4e27433a49..9154c1ab5a7 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -1126,6 +1126,13 @@ void anv_GetPhysicalDeviceFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR: { + VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *features = + (VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *)ext; + features->pipelineExecutableInfo = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: { VkPhysicalDeviceProtectedMemoryFeatures *features = (void *)ext; features->protectedMemory = false; diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py index 2c2accbe566..7e7ce428987 100644 --- a/src/intel/vulkan/anv_extensions.py +++ b/src/intel/vulkan/anv_extensions.py @@ -101,6 +101,7 @@ EXTENSIONS = [ Extension('VK_KHR_maintenance2', 1, True), Extension('VK_KHR_maintenance3', 1, True), Extension('VK_KHR_multiview', 1, True), + Extension('VK_KHR_pipeline_executable_properties', 1, True), Extension('VK_KHR_push_descriptor', 1, True), Extension('VK_KHR_relaxed_block_layout', 1, True), Extension('VK_KHR_sampler_mirror_clamp_to_edge', 1, True), diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index fc492844cce..6d705b88e08 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -30,6 +30,7 @@ #include "util/mesa-sha1.h" #include "util/os_time.h" #include "common/gen_l3_config.h" +#include "common/gen_disasm.h" #include "anv_private.h" #include 
"compiler/brw_nir.h" #include "anv_nir.h" @@ -529,6 +530,7 @@ struct anv_pipeline_stage { uint32_t num_stats; struct brw_compile_stats stats[3]; + char *disasm[3]; VkPipelineCreationFeedbackEXT feedback; @@ -1063,6 +1065,77 @@ anv_pipeline_compile_fs(const struct brw_compiler *compiler, } } +static void +anv_pipeline_add_executable(struct anv_pipeline *pipeline, + struct anv_pipeline_stage *stage, + struct brw_compile_stats *stats, + uint32_t code_offset) +{ + char *disasm = NULL; + if (stage->code && + (pipeline->flags & + VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) { + char *stream_data = NULL; + size_t stream_size = 0; + FILE *stream = open_memstream(&stream_data, &stream_size); + + /* Creating this is far cheaper than it looks. It's perfectly fine to + * do it for every binary. + */ + struct gen_disasm *d = gen_disasm_create(&pipeline->device->info); + gen_disasm_disassemble(d, stage->code, code_offset, stream); + gen_disasm_destroy(d); + + fclose(stream); + + /* Copy it to a ralloc'd thing */ + disasm = ralloc_size(pipeline->mem_ctx, stream_size + 1); + memcpy(disasm, stream_data, stream_size); + disasm[stream_size] = 0; + + free(stream_data); + } + + pipeline->executables[pipeline->num_executables++] = + (struct anv_pipeline_executable) { + .stage = stage->stage, + .stats = *stats, + .disasm = disasm, + }; +} + +static void +anv_pipeline_add_executables(struct anv_pipeline *pipeline, + struct anv_pipeline_stage *stage, + struct anv_shader_bin *bin) +{ + if (stage->stage == MESA_SHADER_FRAGMENT) { + /* We pull the prog data and stats out of the anv_shader_bin because + * the anv_pipeline_stage may not be fully populated if we successfully + * looked up the shader in a cache. 
+ */ + const struct brw_wm_prog_data *wm_prog_data = + (const struct brw_wm_prog_data *)bin->prog_data; + struct brw_compile_stats *stats = bin->stats; + + if (wm_prog_data->dispatch_8) { + anv_pipeline_add_executable(pipeline, stage, stats++, 0); + } + + if (wm_prog_data->dispatch_16) { + anv_pipeline_add_executable(pipeline, stage, stats++, + wm_prog_data->prog_offset_16); + } + + if (wm_prog_data->dispatch_32) { + anv_pipeline_add_executable(pipeline, stage, stats++, + wm_prog_data->prog_offset_32); + } + } else { + anv_pipeline_add_executable(pipeline, stage, bin->stats, 0); + } +} + static VkResult anv_pipeline_compile_graphics(struct anv_pipeline *pipeline, struct anv_pipeline_cache *cache, @@ -1182,6 +1255,13 @@ anv_pipeline_compile_graphics(struct anv_pipeline *pipeline, VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT; } /* We found all our shaders in the cache. We're done. */ + for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) { + if (!stages[s].entrypoint) + continue; + + anv_pipeline_add_executables(pipeline, &stages[s], + pipeline->shaders[s]); + } goto done; } else if (found > 0) { /* We found some but not all of our shaders. 
This shouldn't happen @@ -1335,6 +1415,8 @@ anv_pipeline_compile_graphics(struct anv_pipeline *pipeline, goto fail; } + anv_pipeline_add_executables(pipeline, &stages[s], bin); + pipeline->shaders[s] = bin; ralloc_free(stage_ctx); @@ -1455,6 +1537,7 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline, &cache_hit); } + void *mem_ctx = ralloc_context(NULL); if (bin == NULL) { int64_t stage_start = os_time_get_nano(); @@ -1469,8 +1552,6 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline, .set = ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS, }; - void *mem_ctx = ralloc_context(NULL); - stage.nir = anv_pipeline_stage_get_nir(pipeline, cache, mem_ctx, &stage); if (stage.nir == NULL) { ralloc_free(mem_ctx); @@ -1511,11 +1592,13 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline, return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } - ralloc_free(mem_ctx); - stage.feedback.duration = os_time_get_nano() - stage_start; } + anv_pipeline_add_executables(pipeline, &stage, bin); + + ralloc_free(mem_ctx); + if (cache_hit) { stage.feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT; @@ -1823,6 +1906,7 @@ anv_pipeline_init(struct anv_pipeline *pipeline, * of various prog_data pointers. Make them NULL by default. */ memset(pipeline->shaders, 0, sizeof(pipeline->shaders)); + pipeline->num_executables = 0; result = anv_pipeline_compile_graphics(pipeline, cache, pCreateInfo); if (result != VK_SUCCESS) { @@ -1909,3 +1993,187 @@ anv_pipeline_init(struct anv_pipeline *pipeline, return VK_SUCCESS; } + +#define WRITE_STR(field, ...) 
({ \ + memset(field, 0, sizeof(field)); \ + UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \ + assert(i > 0 && i < sizeof(field)); \ +}) + +VkResult anv_GetPipelineExecutablePropertiesKHR( + VkDevice device, + const VkPipelineInfoKHR* pPipelineInfo, + uint32_t* pExecutableCount, + VkPipelineExecutablePropertiesKHR* pProperties) +{ + ANV_FROM_HANDLE(anv_pipeline, pipeline, pPipelineInfo->pipeline); + VK_OUTARRAY_MAKE(out, pProperties, pExecutableCount); + + for (uint32_t i = 0; i < pipeline->num_executables; i++) { + vk_outarray_append(&out, props) { + gl_shader_stage stage = pipeline->executables[i].stage; + props->stages = mesa_to_vk_shader_stage(stage); + + unsigned simd_width = pipeline->executables[i].stats.dispatch_width; + if (stage == MESA_SHADER_FRAGMENT) { + WRITE_STR(props->name, "%s%d %s", + simd_width ? "SIMD" : "vec", + simd_width ? simd_width : 4, + _mesa_shader_stage_to_string(stage)); + } else { + WRITE_STR(props->name, "%s", _mesa_shader_stage_to_string(stage)); + } + WRITE_STR(props->description, "%s%d %s shader", + simd_width ? "SIMD" : "vec", + simd_width ? simd_width : 4, + _mesa_shader_stage_to_string(stage)); + + /* The compiler gives us a dispatch width of 0 for vec4 but Vulkan + * wants a subgroup size of 1. 
+ */ + props->subgroupSize = MAX2(simd_width, 1); + } + } + + return vk_outarray_status(&out); +} + +VkResult anv_GetPipelineExecutableStatisticsKHR( + VkDevice device, + const VkPipelineExecutableInfoKHR* pExecutableInfo, + uint32_t* pStatisticCount, + VkPipelineExecutableStatisticKHR* pStatistics) +{ + ANV_FROM_HANDLE(anv_pipeline, pipeline, pExecutableInfo->pipeline); + VK_OUTARRAY_MAKE(out, pStatistics, pStatisticCount); + + assert(pExecutableInfo->executableIndex < pipeline->num_executables); + const struct anv_pipeline_executable *exe = + &pipeline->executables[pExecutableInfo->executableIndex]; + const struct brw_stage_prog_data *prog_data = + pipeline->shaders[exe->stage]->prog_data; + + vk_outarray_append(&out, stat) { + WRITE_STR(stat->name, "Instruction Count"); + WRITE_STR(stat->description, + "Number of GEN instructions in the final generated " + "shader executable."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.instructions; + } + + vk_outarray_append(&out, stat) { + WRITE_STR(stat->name, "Loop Count"); + WRITE_STR(stat->description, + "Number of loops (not unrolled) in the final generated " + "shader executable."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.loops; + } + + vk_outarray_append(&out, stat) { + WRITE_STR(stat->name, "Cycle Count"); + WRITE_STR(stat->description, + "Estimate of the number of EU cycles required to execute " + "the final generated executable. This is an estimate only " + "and may vary greatly from actual run-time performance."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.cycles; + } + + vk_outarray_append(&out, stat) { + WRITE_STR(stat->name, "Spill Count"); + WRITE_STR(stat->description, + "Number of scratch spill operations. This gives a rough " + "estimate of the cost incurred due to spilling temporary " + "values to memory. 
If this is non-zero, you may want to " + "adjust your shader to reduce register pressure."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.spills; + } + + vk_outarray_append(&out, stat) { + WRITE_STR(stat->name, "Fill Count"); + WRITE_STR(stat->description, + "Number of scratch fill operations. This gives a rough " + "estimate of the cost incurred due to spilling temporary " + "values to memory. If this is non-zero, you may want to " + "adjust your shader to reduce register pressure."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.fills; + } + + vk_outarray_append(&out, stat) { + WRITE_STR(stat->name, "Scratch Memory Size"); + WRITE_STR(stat->description, + "Number of bytes of scratch memory required by the " + "generated shader executable. If this is non-zero, you " + "may want to adjust your shader to reduce register " + "pressure."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->total_scratch; + } + + if (exe->stage == MESA_SHADER_COMPUTE) { + vk_outarray_append(&out, stat) { + WRITE_STR(stat->name, "Workgroup Memory Size"); + WRITE_STR(stat->description, + "Number of bytes of workgroup shared memory used by this " + "compute shader including any padding."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->total_shared; + } + } + + return vk_outarray_status(&out); +} + +static bool +write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir, + const char *data) +{ + ir->isText = VK_TRUE; + + size_t data_len = strlen(data) + 1; + + if (ir->pData == NULL) { + ir->dataSize = data_len; + return true; + } + + strncpy(ir->pData, data, ir->dataSize); + if (ir->dataSize < data_len) + return false; + + ir->dataSize = data_len; + return true; +} + +VkResult anv_GetPipelineExecutableInternalRepresentationsKHR( + VkDevice device, + const 
VkPipelineExecutableInfoKHR* pExecutableInfo, + uint32_t* pInternalRepresentationCount, + VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations) +{ + ANV_FROM_HANDLE(anv_pipeline, pipeline, pExecutableInfo->pipeline); + VK_OUTARRAY_MAKE(out, pInternalRepresentations, + pInternalRepresentationCount); + bool incomplete_text = false; + + assert(pExecutableInfo->executableIndex < pipeline->num_executables); + const struct anv_pipeline_executable *exe = + &pipeline->executables[pExecutableInfo->executableIndex]; + + if (exe->disasm) { + vk_outarray_append(&out, ir) { + WRITE_STR(ir->name, "GEN Assembly"); + WRITE_STR(ir->description, + "Final GEN assembly for the generated shader binary"); + + if (!write_ir_text(ir, exe->disasm)) + incomplete_text = true; + } + } + + return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out); +} diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 5682e838abf..85cf7ea9b6d 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2794,6 +2794,17 @@ anv_shader_bin_unref(struct anv_device *device, struct anv_shader_bin *shader) anv_shader_bin_destroy(device, shader); } +/* 5 possible simultaneous shader stages and FS may have up to 3 binaries */ +#define MAX_PIPELINE_EXECUTABLES 7 + +struct anv_pipeline_executable { + gl_shader_stage stage; + + struct brw_compile_stats stats; + + char *disasm; +}; + struct anv_pipeline { struct anv_device * device; struct anv_batch batch; @@ -2811,6 +2822,9 @@ struct anv_pipeline { struct anv_shader_bin * shaders[MESA_SHADER_STAGES]; + uint32_t num_executables; + struct anv_pipeline_executable executables[MAX_PIPELINE_EXECUTABLES]; + struct { const struct gen_l3_config * l3_config; uint32_t total_size; diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index e0e723c13a4..88f5b358faf 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -2233,6 +2233,7 @@ 
compute_pipeline_create( * of various prog_data pointers. Make them NULL by default. */ memset(pipeline->shaders, 0, sizeof(pipeline->shaders)); + pipeline->num_executables = 0; pipeline->needs_data_cache = false;