turnip: Add support for compute shaders.
author Eric Anholt <eric@anholt.net>
Wed, 27 Nov 2019 04:37:19 +0000 (20:37 -0800)
committer Eric Anholt <eric@anholt.net>
Thu, 5 Dec 2019 04:32:15 +0000 (20:32 -0800)
Since the compute shader shares hardware state with the FS, we have to
re-upload the pipeline state when switching between compute dispatches and
graphics draws.  We could potentially expose graphics and compute as separate
queues and then we wouldn't need this pipeline state management, but the
closed driver exposes a single queue, and consistency with it is probably good.
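
For illustration, here is a minimal, self-contained sketch of that dirty-bit
handoff (the names and printfs are illustrative only, not the driver's actual
enums or entry points): each draw clears its own dirty bits but leaves the
compute bit set, and each dispatch does the reverse, so the other bind point
re-emits its state the next time it runs.

    #include <stdint.h>
    #include <stdio.h>

    enum dirty_bits {
       DIRTY_GFX_PIPELINE     = 1u << 0,  /* graphics pipeline must be re-emitted */
       DIRTY_COMPUTE_PIPELINE = 1u << 1,  /* compute pipeline must be re-emitted */
    };

    static uint32_t dirty = DIRTY_GFX_PIPELINE | DIRTY_COMPUTE_PIPELINE;

    static void draw(void)
    {
       if (dirty & DIRTY_GFX_PIPELINE)
          printf("draw: re-emitting graphics pipeline state\n");
       /* Emitting graphics state clobbers the shared FS/CS hardware state,
        * so only the compute bit survives the draw. */
       dirty = DIRTY_COMPUTE_PIPELINE;
    }

    static void dispatch(void)
    {
       if (dirty & DIRTY_COMPUTE_PIPELINE)
          printf("dispatch: re-emitting compute pipeline state\n");
       /* Compute emission clobbers the shared state in the other direction. */
       dirty = DIRTY_GFX_PIPELINE;
    }

    int main(void)
    {
       draw();      /* re-emits graphics state */
       draw();      /* nothing to re-emit */
       dispatch();  /* re-emits compute state */
       draw();      /* graphics re-emitted because the dispatch clobbered it */
       return 0;
    }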

So far I'm emitting texture/ibo state as IBs that we jump to.  This is
kind of silly when we could just emit it directly in our CS, but that's a
refactor we can do later.
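
For reference, this is the sub-stream/IB pattern in question, using the helpers
that appear in the diff below (tu_cs_begin_sub_stream / tu_cs_end_sub_stream /
tu_cs_emit_ib).  It is only a sketch: error handling and the real packet
contents are elided, and emit_example_state is an illustrative name, not a
function in this patch.

    /* Build state packets in a sub-stream of the draw-state CS and return an
     * entry (iova + size) describing them.  The caller later emits an IB jump
     * to that entry instead of emitting the packets inline in its own CS. */
    static struct tu_cs_entry
    emit_example_state(struct tu_device *dev, struct tu_cs *draw_state)
    {
       struct tu_cs sub;
       tu_cs_begin_sub_stream(dev, draw_state, 16, &sub);

       /* ... tu_cs_emit_pkt7()/tu_cs_emit() the CP_LOAD_STATE6 packets ... */

       return tu_cs_end_sub_stream(draw_state, &sub);
    }

    /* At draw/dispatch time the entry is then jumped to:
     *    struct tu_cs_entry ib = emit_example_state(cmd->device, &cmd->draw_state);
     *    if (ib.size)
     *       tu_cs_emit_ib(cs, &ib);
     */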

Reviewed-by: Jonathan Marek <jonathan@marek.ca>
src/freedreno/vulkan/tu_cmd_buffer.c
src/freedreno/vulkan/tu_pipeline.c
src/freedreno/vulkan/tu_private.h

index a32495103f1682e62feae57d2b3bec3706073fb1..5c6eeb2c50fd3f6137b1ca7785939db9e253e3e4 100644
@@ -1809,7 +1809,8 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
       cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
       break;
    case VK_PIPELINE_BIND_POINT_COMPUTE:
-      tu_finishme("binding compute pipeline");
+      cmd->state.compute_pipeline = pipeline;
+      cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
       break;
    default:
       unreachable("unrecognized pipeline bind point");
@@ -2557,13 +2558,17 @@ tu6_emit_ibo(struct tu_device *device, struct tu_cs *draw_state,
    /* emit texture state: */
    tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6, 3);
    tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
-              CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
+              CP_LOAD_STATE6_0_STATE_TYPE(type == MESA_SHADER_COMPUTE ?
+                                          ST6_IBO : ST6_SHADER) |
               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-              CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) |
+              CP_LOAD_STATE6_0_STATE_BLOCK(type == MESA_SHADER_COMPUTE ?
+                                           SB6_CS_SHADER : SB6_IBO) |
               CP_LOAD_STATE6_0_NUM_UNIT(link->image_mapping.num_ibo));
    tu_cs_emit_qw(&cs, ibo_addr); /* SRC_ADDR_LO/HI */
 
-   tu_cs_emit_pkt4(&cs, REG_A6XX_SP_IBO_LO, 2);
+   tu_cs_emit_pkt4(&cs,
+                   type == MESA_SHADER_COMPUTE ?
+                   REG_A6XX_SP_CS_IBO_LO : REG_A6XX_SP_IBO_LO, 2);
    tu_cs_emit_qw(&cs, ibo_addr); /* SRC_ADDR_LO/HI */
 
    return tu_cs_end_sub_stream(draw_state, &cs);
@@ -2806,7 +2811,11 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
             }
       }
    }
-   cmd->state.dirty = 0;
+
+   /* Fragment shader state overwrites compute shader state, so flag the
+    * compute pipeline for re-emit.
+    */
+   cmd->state.dirty = TU_CMD_DIRTY_COMPUTE_PIPELINE;
 }
 
 static void
@@ -2989,9 +2998,156 @@ struct tu_dispatch_info
 };
 
 static void
-tu_dispatch(struct tu_cmd_buffer *cmd_buffer,
+tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline,
+                              const struct tu_dispatch_info *info)
+{
+   gl_shader_stage type = MESA_SHADER_COMPUTE;
+   const struct tu_program_descriptor_linkage *link =
+      &pipeline->program.link[type];
+   const struct ir3_const_state *const_state = &link->const_state;
+   uint32_t offset_dwords = const_state->offsets.driver_param;
+
+   if (link->constlen <= offset_dwords)
+      return;
+
+   if (!info->indirect) {
+      uint32_t driver_params[] = {
+         info->blocks[0],
+         info->blocks[1],
+         info->blocks[2],
+         pipeline->compute.local_size[0],
+         pipeline->compute.local_size[1],
+         pipeline->compute.local_size[2],
+      };
+      uint32_t num_consts = MIN2(const_state->num_driver_params,
+                                 link->constlen - offset_dwords);
+      uint32_t align_size = align(num_consts, 4);
+
+      /* push constants */
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + align_size);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset_dwords / 4) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(align_size / 4));
+      tu_cs_emit(cs, 0);
+      tu_cs_emit(cs, 0);
+      uint32_t i;
+      for (i = 0; i < num_consts; i++)
+         tu_cs_emit(cs, driver_params[i]);
+      for (; i < align_size; i++)
+         tu_cs_emit(cs, 0);
+   } else {
+      tu_finishme("Indirect driver params");
+   }
+}
+
+static void
+tu_dispatch(struct tu_cmd_buffer *cmd,
             const struct tu_dispatch_info *info)
 {
+   struct tu_cs *cs = &cmd->cs;
+   struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
+   struct tu_descriptor_state *descriptors_state =
+      &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
+
+   VkResult result = tu_cs_reserve_space(cmd->device, cs, 256);
+   if (result != VK_SUCCESS) {
+      cmd->record_result = result;
+      return;
+   }
+
+   if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
+      tu_cs_emit_ib(cs, &pipeline->program.state_ib);
+
+   struct tu_cs_entry ib;
+
+   ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
+   if (ib.size)
+      tu_cs_emit_ib(cs, &ib);
+
+   tu_emit_compute_driver_params(cs, pipeline, info);
+
+   bool needs_border;
+   ib = tu6_emit_textures(cmd->device, &cmd->draw_state, pipeline,
+                          descriptors_state, MESA_SHADER_COMPUTE,
+                          &needs_border);
+   if (ib.size)
+      tu_cs_emit_ib(cs, &ib);
+
+   if (needs_border)
+      tu6_emit_border_color(cmd, cs);
+
+   ib = tu6_emit_ibo(cmd->device, &cmd->draw_state, pipeline,
+                     descriptors_state, MESA_SHADER_COMPUTE);
+   if (ib.size)
+      tu_cs_emit_ib(cs, &ib);
+
+   /* track BOs */
+   if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
+      unsigned i;
+      for_each_bit(i, descriptors_state->valid) {
+         struct tu_descriptor_set *set = descriptors_state->sets[i];
+         for (unsigned j = 0; j < set->layout->buffer_count; ++j)
+            if (set->descriptors[j]) {
+               tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
+                              MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
+            }
+      }
+   }
+
+   /* Compute shader state overwrites fragment shader state, so we flag the
+    * graphics pipeline for re-emit.
+    */
+   cmd->state.dirty = TU_CMD_DIRTY_PIPELINE;
+
+   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
+   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(0x8));
+
+   const uint32_t *local_size = pipeline->compute.local_size;
+   const uint32_t *num_groups = info->blocks;
+   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_NDRANGE_0, 7);
+   tu_cs_emit(cs,
+              A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM(3) |
+              A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) |
+              A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) |
+              A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1));
+   tu_cs_emit(cs, A6XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0]));
+   tu_cs_emit(cs, 0);            /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */
+   tu_cs_emit(cs, A6XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1]));
+   tu_cs_emit(cs, 0);            /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */
+   tu_cs_emit(cs, A6XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2]));
+   tu_cs_emit(cs, 0);            /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_KERNEL_GROUP_X, 3);
+   tu_cs_emit(cs, 1);            /* HLSQ_CS_KERNEL_GROUP_X */
+   tu_cs_emit(cs, 1);            /* HLSQ_CS_KERNEL_GROUP_Y */
+   tu_cs_emit(cs, 1);            /* HLSQ_CS_KERNEL_GROUP_Z */
+
+   if (info->indirect) {
+      uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
+
+      tu_bo_list_add(&cmd->bo_list, info->indirect->bo,
+                     MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
+
+      tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
+      tu_cs_emit(cs, 0x00000000);
+      tu_cs_emit_qw(cs, iova);
+      tu_cs_emit(cs,
+                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
+                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
+                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
+   } else {
+      tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
+      tu_cs_emit(cs, 0x00000000);
+      tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
+      tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
+      tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
+   }
+
+   tu_cs_emit_wfi(cs);
+
+   tu6_emit_cache_flush(cmd, cs);
 }
 
 void
index 0e35d4e60dc5659deef9ed64a380208c1c40fba2..976bf60f81fd7e78f59bcd078375fc1b253343f5 100644
@@ -476,6 +476,52 @@ tu6_emit_fs_config(struct tu_cs *cs, const struct ir3_shader_variant *fs)
    tu_cs_emit(cs, fs->image_mapping.num_ibo);
 }
 
+static void
+tu6_emit_cs_config(struct tu_cs *cs, const struct ir3_shader_variant *v)
+{
+   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 1);
+   tu_cs_emit(cs, 0xff);
+
+   unsigned constlen = align(v->constlen, 4);
+   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL, 1);
+   tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_CONSTLEN(constlen) |
+              A6XX_HLSQ_CS_CNTL_ENABLED);
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CONFIG, 2);
+   tu_cs_emit(cs, A6XX_SP_CS_CONFIG_ENABLED |
+              A6XX_SP_CS_CONFIG_NIBO(v->image_mapping.num_ibo) |
+              A6XX_SP_CS_CONFIG_NTEX(v->num_samp) |
+              A6XX_SP_CS_CONFIG_NSAMP(v->num_samp));
+   tu_cs_emit(cs, v->instrlen);
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CTRL_REG0, 1);
+   tu_cs_emit(cs, A6XX_SP_CS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
+              A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(v->info.max_reg + 1) |
+              A6XX_SP_CS_CTRL_REG0_MERGEDREGS |
+              A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(v->branchstack) |
+              COND(v->need_pixlod, A6XX_SP_CS_CTRL_REG0_PIXLODENABLE));
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
+   tu_cs_emit(cs, 0x41);
+
+   uint32_t local_invocation_id =
+      ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
+   uint32_t work_group_id =
+      ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID);
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
+   tu_cs_emit(cs,
+              A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
+              A6XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) |
+              A6XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
+              A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
+   tu_cs_emit(cs, 0x2fc);             /* HLSQ_CS_UNKNOWN_B998 */
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_IBO_COUNT, 1);
+   tu_cs_emit(cs, v->image_mapping.num_ibo);
+}
+
 static void
 tu6_emit_vs_system_values(struct tu_cs *cs,
                           const struct ir3_shader_variant *vs)
@@ -1441,13 +1487,12 @@ tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4])
 }
 
 static VkResult
-tu_pipeline_builder_create_pipeline(struct tu_pipeline_builder *builder,
-                                    struct tu_pipeline **out_pipeline)
+tu_pipeline_create(struct tu_device *dev,
+                   const VkAllocationCallbacks *pAllocator,
+                   struct tu_pipeline **out_pipeline)
 {
-   struct tu_device *dev = builder->device;
-
    struct tu_pipeline *pipeline =
-      vk_zalloc2(&dev->alloc, builder->alloc, sizeof(*pipeline), 8,
+      vk_zalloc2(&dev->alloc, pAllocator, sizeof(*pipeline), 8,
                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (!pipeline)
       return VK_ERROR_OUT_OF_HOST_MEMORY;
@@ -1457,7 +1502,7 @@ tu_pipeline_builder_create_pipeline(struct tu_pipeline_builder *builder,
    /* reserve the space now such that tu_cs_begin_sub_stream never fails */
    VkResult result = tu_cs_reserve_space(dev, &pipeline->cs, 2048);
    if (result != VK_SUCCESS) {
-      vk_free2(&dev->alloc, builder->alloc, pipeline);
+      vk_free2(&dev->alloc, pAllocator, pipeline);
       return result;
    }
 
@@ -1813,7 +1858,8 @@ static VkResult
 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
                           struct tu_pipeline **pipeline)
 {
-   VkResult result = tu_pipeline_builder_create_pipeline(builder, pipeline);
+   VkResult result = tu_pipeline_create(builder->device, builder->alloc,
+                                        pipeline);
    if (result != VK_SUCCESS)
       return result;
 
@@ -1949,38 +1995,133 @@ tu_CreateGraphicsPipelines(VkDevice device,
    return final_result;
 }
 
+static void
+tu6_emit_compute_program(struct tu_cs *cs,
+                         struct tu_shader *shader,
+                         const struct tu_bo *binary_bo)
+{
+   const struct ir3_shader_variant *v = &shader->variants[0];
+
+   tu6_emit_cs_config(cs, v);
+
+   /* The compute program is the only one in the pipeline, so 0 offset. */
+   tu6_emit_shader_object(cs, MESA_SHADER_COMPUTE, v, binary_bo, 0);
+
+   tu6_emit_immediates(cs, v, CP_LOAD_STATE6_FRAG, SB6_CS_SHADER);
+}
+
 static VkResult
-tu_compute_pipeline_create(VkDevice _device,
+tu_compute_upload_shader(VkDevice device,
+                         struct tu_pipeline *pipeline,
+                         struct tu_shader *shader)
+{
+   TU_FROM_HANDLE(tu_device, dev, device);
+   struct tu_bo *bo = &pipeline->program.binary_bo;
+   struct ir3_shader_variant *v = &shader->variants[0];
+
+   uint32_t shader_size = sizeof(uint32_t) * v->info.sizedwords;
+   VkResult result =
+      tu_bo_init_new(dev, bo, shader_size);
+   if (result != VK_SUCCESS)
+      return result;
+
+   result = tu_bo_map(dev, bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   memcpy(bo->map, shader->binary, shader_size);
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+tu_compute_pipeline_create(VkDevice device,
                            VkPipelineCache _cache,
                            const VkComputePipelineCreateInfo *pCreateInfo,
                            const VkAllocationCallbacks *pAllocator,
                            VkPipeline *pPipeline)
 {
+   TU_FROM_HANDLE(tu_device, dev, device);
+   const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
+   VkResult result;
+
+   struct tu_pipeline *pipeline;
+
+   result = tu_pipeline_create(dev, pAllocator, &pipeline);
+   if (result != VK_SUCCESS)
+      return result;
+
+   struct tu_shader_compile_options options;
+   tu_shader_compile_options_init(&options, NULL);
+
+   struct tu_shader *shader =
+      tu_shader_create(dev, MESA_SHADER_COMPUTE, stage_info, pAllocator);
+   if (!shader) {
+      result = VK_ERROR_OUT_OF_HOST_MEMORY;
+      goto fail;
+   }
+
+   result = tu_shader_compile(dev, shader, NULL, &options, pAllocator);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   struct tu_program_descriptor_linkage *link = &pipeline->program.link[MESA_SHADER_COMPUTE];
+   struct ir3_shader_variant *v = &shader->variants[0];
+
+   link->ubo_state = v->shader->ubo_state;
+   link->const_state = v->shader->const_state;
+   link->constlen = v->constlen;
+   link->texture_map = shader->texture_map;
+   link->sampler_map = shader->sampler_map;
+   link->ubo_map = shader->ubo_map;
+   link->ssbo_map = shader->ssbo_map;
+   link->image_mapping = v->image_mapping;
+
+   result = tu_compute_upload_shader(device, pipeline, shader);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   for (int i = 0; i < 3; i++)
+      pipeline->compute.local_size[i] = v->shader->nir->info.cs.local_size[i];
+
+   struct tu_cs prog_cs;
+   tu_cs_begin_sub_stream(dev, &pipeline->cs, 512, &prog_cs);
+   tu6_emit_compute_program(&prog_cs, shader, &pipeline->program.binary_bo);
+   pipeline->program.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs);
+
+   *pPipeline = tu_pipeline_to_handle(pipeline);
    return VK_SUCCESS;
+
+fail:
+   if (shader)
+      tu_shader_destroy(dev, shader, pAllocator);
+   if (result != VK_SUCCESS) {
+      tu_pipeline_finish(pipeline, dev, pAllocator);
+      vk_free2(&dev->alloc, pAllocator, pipeline);
+   }
+
+   return result;
 }
 
 VkResult
-tu_CreateComputePipelines(VkDevice _device,
+tu_CreateComputePipelines(VkDevice device,
                           VkPipelineCache pipelineCache,
                           uint32_t count,
                           const VkComputePipelineCreateInfo *pCreateInfos,
                           const VkAllocationCallbacks *pAllocator,
                           VkPipeline *pPipelines)
 {
-   VkResult result = VK_SUCCESS;
+   VkResult final_result = VK_SUCCESS;
 
-   unsigned i = 0;
-   for (; i < count; i++) {
-      VkResult r;
-      r = tu_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i],
-                                     pAllocator, &pPipelines[i]);
-      if (r != VK_SUCCESS) {
-         result = r;
-      }
-      pPipelines[i] = VK_NULL_HANDLE;
+   for (uint32_t i = 0; i < count; i++) {
+      VkResult result = tu_compute_pipeline_create(device, pipelineCache,
+                                                   &pCreateInfos[i],
+                                                   pAllocator, &pPipelines[i]);
+      if (result != VK_SUCCESS)
+         final_result = result;
    }
 
-   return result;
+   return final_result;
 }
 
 void
index 813ff0cc1ea6b1298faad81f527281ead1fe365c..2532feed14e3d3531695aa8edb714b6829940c11 100644
@@ -825,8 +825,9 @@ struct tu_tiling_config
 enum tu_cmd_dirty_bits
 {
    TU_CMD_DIRTY_PIPELINE = 1 << 0,
-   TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 1,
-   TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 2,
+   TU_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 1,
+   TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 2,
+   TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 3,
 
    TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16,
    TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17,
@@ -839,6 +840,7 @@ struct tu_cmd_state
    uint32_t dirty;
 
    struct tu_pipeline *pipeline;
+   struct tu_pipeline *compute_pipeline;
 
    /* Vertex buffers */
    struct
@@ -1167,6 +1169,11 @@ struct tu_pipeline
    {
       struct tu_cs_entry state_ib;
    } blend;
+
+   struct
+   {
+      uint32_t local_size[3];
+   } compute;
 };
 
 void