radv: Flush in the initial preamble CS.
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Mon, 20 Feb 2017 08:26:00 +0000 (09:26 +0100)
committer Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Tue, 21 Feb 2017 08:19:58 +0000 (09:19 +0100)
Signed-off-by: Bas Nieuwenhuizen <basni@google.com>
Reviewed-by: Dave Airlie <airlied@redhat.com>
src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/si_cmd_buffer.c

index 8b8e4ddfb0b1c7636381c15586d16641f02eb822..9b58e416e3b571687f46890de20ac91dcee3bfe7 100644 (file)
@@ -792,8 +792,10 @@ radv_queue_finish(struct radv_queue *queue)
        if (queue->hw_ctx)
                queue->device->ws->ctx_destroy(queue->hw_ctx);
 
-       if (queue->preamble_cs)
-               queue->device->ws->cs_destroy(queue->preamble_cs);
+       if (queue->initial_preamble_cs)
+               queue->device->ws->cs_destroy(queue->initial_preamble_cs);
+       if (queue->continue_preamble_cs)
+               queue->device->ws->cs_destroy(queue->continue_preamble_cs);
        if (queue->descriptor_bo)
                queue->device->ws->buffer_destroy(queue->descriptor_bo);
        if (queue->scratch_bo)
@@ -939,6 +941,21 @@ VkResult radv_CreateDevice(
                        break;
                }
                device->ws->cs_finalize(device->empty_cs[family]);
+
+               device->flush_cs[family] = device->ws->cs_create(device->ws, family);
+               switch (family) {
+               case RADV_QUEUE_GENERAL:
+               case RADV_QUEUE_COMPUTE:
+                       si_cs_emit_cache_flush(device->flush_cs[family],
+                                              device->physical_device->rad_info.chip_class,
+                                              family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
+                                              RADV_CMD_FLAG_INV_ICACHE |
+                                              RADV_CMD_FLAG_INV_SMEM_L1 |
+                                              RADV_CMD_FLAG_INV_VMEM_L1 |
+                                              RADV_CMD_FLAG_INV_GLOBAL_L2);
+                       break;
+               }
+               device->ws->cs_finalize(device->flush_cs[family]);
        }
 
        if (getenv("RADV_TRACE_FILE")) {
@@ -995,6 +1012,8 @@ void radv_DestroyDevice(
                        vk_free(&device->alloc, device->queues[i]);
                if (device->empty_cs[i])
                        device->ws->cs_destroy(device->empty_cs[i]);
+               if (device->flush_cs[i])
+                       device->ws->cs_destroy(device->flush_cs[i]);
        }
        radv_device_finish_meta(device);
 
@@ -1192,25 +1211,25 @@ radv_get_preamble_cs(struct radv_queue *queue,
                      uint32_t compute_scratch_size,
                     uint32_t esgs_ring_size,
                     uint32_t gsvs_ring_size,
-                     struct radeon_winsys_cs **preamble_cs)
+                     struct radeon_winsys_cs **initial_preamble_cs,
+                     struct radeon_winsys_cs **continue_preamble_cs)
 {
        struct radeon_winsys_bo *scratch_bo = NULL;
        struct radeon_winsys_bo *descriptor_bo = NULL;
        struct radeon_winsys_bo *compute_scratch_bo = NULL;
        struct radeon_winsys_bo *esgs_ring_bo = NULL;
        struct radeon_winsys_bo *gsvs_ring_bo = NULL;
-       struct radeon_winsys_cs *cs = NULL;
-
-       if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) {
-               *preamble_cs = NULL;
-               return VK_SUCCESS;
-       }
+       struct radeon_winsys_cs *dest_cs[2] = {0};
 
        if (scratch_size <= queue->scratch_size &&
            compute_scratch_size <= queue->compute_scratch_size &&
            esgs_ring_size <= queue->esgs_ring_size &&
-           gsvs_ring_size <= queue->gsvs_ring_size) {
-               *preamble_cs = queue->preamble_cs;
+           gsvs_ring_size <= queue->gsvs_ring_size &&
+           queue->initial_preamble_cs) {
+               *initial_preamble_cs = queue->initial_preamble_cs;
+               *continue_preamble_cs = queue->continue_preamble_cs;
+               if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
+                       *continue_preamble_cs = NULL;
                return VK_SUCCESS;
        }
 
@@ -1282,94 +1301,113 @@ radv_get_preamble_cs(struct radv_queue *queue,
        } else
                descriptor_bo = queue->descriptor_bo;
 
-       cs = queue->device->ws->cs_create(queue->device->ws,
-                                         queue->queue_family_index ? RING_COMPUTE : RING_GFX);
-       if (!cs)
-               goto fail;
+       for(int i = 0; i < 2; ++i) {
+               struct radeon_winsys_cs *cs = NULL;
+               cs = queue->device->ws->cs_create(queue->device->ws,
+                                                 queue->queue_family_index ? RING_COMPUTE : RING_GFX);
+               if (!cs)
+                       goto fail;
 
+               dest_cs[i] = cs;
 
-       if (scratch_bo)
-               queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);
+               if (scratch_bo)
+                       queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);
 
-       if (esgs_ring_bo)
-               queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8);
+               if (esgs_ring_bo)
+                       queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8);
 
-       if (gsvs_ring_bo)
-               queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
+               if (gsvs_ring_bo)
+                       queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
 
-       if (descriptor_bo)
-               queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
+               if (descriptor_bo)
+                       queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
 
-       if (descriptor_bo != queue->descriptor_bo) {
-               uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
+               if (descriptor_bo != queue->descriptor_bo) {
+                       uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
 
-               if (scratch_bo) {
-                       uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo);
-                       uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
-                               S_008F04_SWIZZLE_ENABLE(1);
-                       map[0] = scratch_va;
-                       map[1] = rsrc1;
+                       if (scratch_bo) {
+                               uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo);
+                               uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+                                                S_008F04_SWIZZLE_ENABLE(1);
+                               map[0] = scratch_va;
+                               map[1] = rsrc1;
+                       }
+
+                       if (esgs_ring_bo || gsvs_ring_bo)
+                               fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
+
+                       queue->device->ws->buffer_unmap(descriptor_bo);
                }
 
-               if (esgs_ring_bo || gsvs_ring_bo)
-                       fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
+               if (esgs_ring_bo || gsvs_ring_bo) {
+                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+                       radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+                       radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+
+                       if (queue->device->physical_device->rad_info.chip_class >= CIK) {
+                               radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
+                               radeon_emit(cs, esgs_ring_size >> 8);
+                               radeon_emit(cs, gsvs_ring_size >> 8);
+                       } else {
+                               radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
+                               radeon_emit(cs, esgs_ring_size >> 8);
+                               radeon_emit(cs, gsvs_ring_size >> 8);
+                       }
+               }
 
-               queue->device->ws->buffer_unmap(descriptor_bo);
-       }
+               if (descriptor_bo) {
+                       uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
+                                          R_00B130_SPI_SHADER_USER_DATA_VS_0,
+                                          R_00B230_SPI_SHADER_USER_DATA_GS_0,
+                                          R_00B330_SPI_SHADER_USER_DATA_ES_0,
+                                          R_00B430_SPI_SHADER_USER_DATA_HS_0,
+                                          R_00B530_SPI_SHADER_USER_DATA_LS_0};
 
-       if (esgs_ring_bo || gsvs_ring_bo) {
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-               radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+                       uint64_t va = queue->device->ws->buffer_get_va(descriptor_bo);
 
-               if (queue->device->physical_device->rad_info.chip_class >= CIK) {
-                       radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
-                       radeon_emit(cs, esgs_ring_size >> 8);
-                       radeon_emit(cs, gsvs_ring_size >> 8);
-               } else {
-                       radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
-                       radeon_emit(cs, esgs_ring_size >> 8);
-                       radeon_emit(cs, gsvs_ring_size >> 8);
+                       for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
+                               radeon_set_sh_reg_seq(cs, regs[i], 2);
+                               radeon_emit(cs, va);
+                               radeon_emit(cs, va >> 32);
+                       }
                }
-       }
 
-       if (descriptor_bo) {
-               uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
-                                  R_00B130_SPI_SHADER_USER_DATA_VS_0,
-                                  R_00B230_SPI_SHADER_USER_DATA_GS_0,
-                                  R_00B330_SPI_SHADER_USER_DATA_ES_0,
-                                  R_00B430_SPI_SHADER_USER_DATA_HS_0,
-                                  R_00B530_SPI_SHADER_USER_DATA_LS_0};
+               if (compute_scratch_bo) {
+                       uint64_t scratch_va = queue->device->ws->buffer_get_va(compute_scratch_bo);
+                       uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+                                        S_008F04_SWIZZLE_ENABLE(1);
 
-               uint64_t va = queue->device->ws->buffer_get_va(descriptor_bo);
+                       queue->device->ws->cs_add_buffer(cs, compute_scratch_bo, 8);
 
-               for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
-                       radeon_set_sh_reg_seq(cs, regs[i], 2);
-                       radeon_emit(cs, va);
-                       radeon_emit(cs, va >> 32);
+                       radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
+                       radeon_emit(cs, scratch_va);
+                       radeon_emit(cs, rsrc1);
                }
-       }
 
-       if (compute_scratch_bo) {
-               uint64_t scratch_va = queue->device->ws->buffer_get_va(compute_scratch_bo);
-               uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
-                                S_008F04_SWIZZLE_ENABLE(1);
-
-               queue->device->ws->cs_add_buffer(cs, compute_scratch_bo, 8);
+               if (!i) {
+                       si_cs_emit_cache_flush(cs,
+                                              queue->device->physical_device->rad_info.chip_class,
+                                              queue->queue_family_index == RING_COMPUTE &&
+                                                queue->device->physical_device->rad_info.chip_class >= CIK,
+                                              RADV_CMD_FLAG_INV_ICACHE |
+                                              RADV_CMD_FLAG_INV_SMEM_L1 |
+                                              RADV_CMD_FLAG_INV_VMEM_L1 |
+                                              RADV_CMD_FLAG_INV_GLOBAL_L2);
+               }
 
-               radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
-               radeon_emit(cs, scratch_va);
-               radeon_emit(cs, rsrc1);
+               if (!queue->device->ws->cs_finalize(cs))
+                       goto fail;
        }
 
-       if (!queue->device->ws->cs_finalize(cs))
-               goto fail;
+       if (queue->initial_preamble_cs)
+                       queue->device->ws->cs_destroy(queue->initial_preamble_cs);
 
-       if (queue->preamble_cs)
-               queue->device->ws->cs_destroy(queue->preamble_cs);
+       if (queue->continue_preamble_cs)
+                       queue->device->ws->cs_destroy(queue->continue_preamble_cs);
 
-       queue->preamble_cs = cs;
+       queue->initial_preamble_cs = dest_cs[0];
+       queue->continue_preamble_cs = dest_cs[1];
 
        if (scratch_bo != queue->scratch_bo) {
                if (queue->scratch_bo)
@@ -1406,11 +1444,15 @@ radv_get_preamble_cs(struct radv_queue *queue,
                queue->descriptor_bo = descriptor_bo;
        }
 
-       *preamble_cs = cs;
+       *initial_preamble_cs = queue->initial_preamble_cs;
+       *continue_preamble_cs = queue->continue_preamble_cs;
+       if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
+                       *continue_preamble_cs = NULL;
        return VK_SUCCESS;
 fail:
-       if (cs)
-               queue->device->ws->cs_destroy(cs);
+       for (int i = 0; i < ARRAY_SIZE(dest_cs); ++i)
+               if (dest_cs[i])
+                       queue->device->ws->cs_destroy(dest_cs[i]);
        if (descriptor_bo && descriptor_bo != queue->descriptor_bo)
                queue->device->ws->buffer_destroy(descriptor_bo);
        if (scratch_bo && scratch_bo != queue->scratch_bo)
@@ -1439,7 +1481,7 @@ VkResult radv_QueueSubmit(
        uint32_t scratch_size = 0;
        uint32_t compute_scratch_size = 0;
        uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
-       struct radeon_winsys_cs *preamble_cs = NULL;
+       struct radeon_winsys_cs *initial_preamble_cs = NULL, *continue_preamble_cs = NULL;
        VkResult result;
        bool fence_emitted = false;
 
@@ -1458,13 +1500,16 @@ VkResult radv_QueueSubmit(
                }
        }
 
-       result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, esgs_ring_size, gsvs_ring_size, &preamble_cs);
+       result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
+                                     esgs_ring_size, gsvs_ring_size,
+                                     &initial_preamble_cs, &continue_preamble_cs);
        if (result != VK_SUCCESS)
                return result;
 
        for (uint32_t i = 0; i < submitCount; i++) {
                struct radeon_winsys_cs **cs_array;
-               bool can_patch = true;
+               bool has_flush = !submitCount;
+               bool can_patch = !has_flush;
                uint32_t advance;
 
                if (!pSubmits[i].commandBufferCount) {
@@ -1487,29 +1532,32 @@ VkResult radv_QueueSubmit(
                }
 
                cs_array = malloc(sizeof(struct radeon_winsys_cs *) *
-                                               pSubmits[i].commandBufferCount);
+                                               (pSubmits[i].commandBufferCount + has_flush));
+
+               if(has_flush)
+                       cs_array[0] = queue->device->flush_cs[queue->queue_family_index];
 
                for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
                        RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
                                         pSubmits[i].pCommandBuffers[j]);
                        assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
-                       cs_array[j] = cmd_buffer->cs;
+                       cs_array[j + has_flush] = cmd_buffer->cs;
                        if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
                                can_patch = false;
                }
 
-               for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) {
+               for (uint32_t j = 0; j < pSubmits[i].commandBufferCount + has_flush; j += advance) {
                        advance = MIN2(max_cs_submission,
-                                      pSubmits[i].commandBufferCount - j);
+                                      pSubmits[i].commandBufferCount + has_flush - j);
                        bool b = j == 0;
-                       bool e = j + advance == pSubmits[i].commandBufferCount;
+                       bool e = j + advance == pSubmits[i].commandBufferCount + has_flush;
 
                        if (queue->device->trace_bo)
                                *queue->device->trace_id_ptr = 0;
 
                        ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
-                                                       advance, preamble_cs, preamble_cs,
+                                                       advance, initial_preamble_cs, continue_preamble_cs,
                                                        (struct radeon_winsys_sem **)pSubmits[i].pWaitSemaphores,
                                                        b ? pSubmits[i].waitSemaphoreCount : 0,
                                                        (struct radeon_winsys_sem **)pSubmits[i].pSignalSemaphores,
index 5fabc409d82a2452de506d03dabd7143310c4a1d..27e57e58538f4cea49ba0454d11a16195c49caa7 100644 (file)
@@ -479,7 +479,8 @@ struct radv_queue {
        struct radeon_winsys_bo *compute_scratch_bo;
        struct radeon_winsys_bo *esgs_ring_bo;
        struct radeon_winsys_bo *gsvs_ring_bo;
-       struct radeon_winsys_cs *preamble_cs;
+       struct radeon_winsys_cs *initial_preamble_cs;
+       struct radeon_winsys_cs *continue_preamble_cs;
 };
 
 struct radv_device {
@@ -495,6 +496,7 @@ struct radv_device {
        struct radv_queue *queues[RADV_MAX_QUEUE_FAMILIES];
        int queue_count[RADV_MAX_QUEUE_FAMILIES];
        struct radeon_winsys_cs *empty_cs[RADV_MAX_QUEUE_FAMILIES];
+       struct radeon_winsys_cs *flush_cs[RADV_MAX_QUEUE_FAMILIES];
 
        uint64_t debug_flags;
 
@@ -764,6 +766,14 @@ void si_write_scissors(struct radeon_winsys_cs *cs, int first,
                       int count, const VkRect2D *scissors);
 uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
                                   bool instanced_or_indirect_draw, uint32_t draw_vertex_count);
+void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
+                            enum chip_class chip_class,
+                            bool is_mec,
+                            enum radv_cmd_flush_bits flush_bits);
+void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
+                            enum chip_class chip_class,
+                            bool is_mec,
+                            enum radv_cmd_flush_bits flush_bits);
 void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
 void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
                           uint64_t src_va, uint64_t dest_va,
index 1091c7bb221a8362e118212ee96c42192c60ed34..4709ef69a027b0ebb523e6382138243565754806 100644 (file)
@@ -689,7 +689,7 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
 
 }
 
-static void
+void
 si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                        enum chip_class chip_class,
                        bool is_mec,