radv: emit geometry ring size and pointers via preamble (v2)
authorDave Airlie <airlied@redhat.com>
Fri, 20 Jan 2017 01:06:52 +0000 (11:06 +1000)
committerDave Airlie <airlied@redhat.com>
Mon, 30 Jan 2017 23:30:19 +0000 (09:30 +1000)
This uses the scratch infrastructure to handle the esgs
and gsvs rings.

(this replaces the old code that did this with patching).

v2: use the correct ring sizes, and reset the needed sizes when the command buffer is reset (Bas)

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Signed-off-by: Dave Airlie <airlied@redhat.com>
src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_private.h

index 3d1542d4663ccf1c84726a2802f8cd5185d4c02d..9bc50ad0929c3cf44a3ab2026911782efbdebe4a 100644 (file)
@@ -1457,12 +1457,17 @@ static void  radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
 
        cmd_buffer->scratch_size_needed = 0;
        cmd_buffer->compute_scratch_size_needed = 0;
+       cmd_buffer->esgs_ring_size_needed = 0;
+       cmd_buffer->gsvs_ring_size_needed = 0;
+
        if (cmd_buffer->upload.upload_bo)
                cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
                                                      cmd_buffer->upload.upload_bo, 8);
        cmd_buffer->upload.offset = 0;
 
        cmd_buffer->record_fail = false;
+
+       cmd_buffer->ring_offsets_idx = -1;
 }
 
 VkResult radv_ResetCommandBuffer(
@@ -1649,6 +1654,7 @@ VkResult radv_EndCommandBuffer(
 
        if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
                si_emit_cache_flush(cmd_buffer);
+
        if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
            cmd_buffer->record_fail)
                return VK_ERROR_OUT_OF_DEVICE_MEMORY;
@@ -1735,6 +1741,20 @@ void radv_CmdBindPipeline(
                radv_dynamic_state_copy(&cmd_buffer->state.dynamic,
                                        &pipeline->dynamic_state,
                                        pipeline->dynamic_state_mask);
+
+               if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
+                       cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
+               if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
+                       cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
+
+               if (radv_pipeline_has_gs(pipeline)) {
+                       struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
+                                                                            AC_UD_SCRATCH_RING_OFFSETS);
+                       if (cmd_buffer->ring_offsets_idx == -1)
+                               cmd_buffer->ring_offsets_idx = loc->sgpr_idx;
+                       else if (loc->sgpr_idx != -1) /* verification only: every GS pipeline must use the same SGPR slot */
+                               assert(loc->sgpr_idx == cmd_buffer->ring_offsets_idx);
+               }
                break;
        default:
                assert(!"invalid bind point");
@@ -1887,6 +1907,17 @@ void radv_CmdExecuteCommands(
                primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
                                                            secondary->compute_scratch_size_needed);
 
+               if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
+                       primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
+               if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
+                       primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
+
+               if (secondary->ring_offsets_idx != -1) {
+                       if (primary->ring_offsets_idx == -1)
+                               primary->ring_offsets_idx = secondary->ring_offsets_idx;
+                       else
+                               assert(secondary->ring_offsets_idx == primary->ring_offsets_idx);
+               }
                primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
        }
 
index b7978bb16a23aff40caadec8b1d98e23fec89f9e..984bd75bb80ffdccf3f986deda9d4369f2dd1fc1 100644 (file)
@@ -764,6 +764,10 @@ radv_queue_finish(struct radv_queue *queue)
                queue->device->ws->buffer_destroy(queue->descriptor_bo);
        if (queue->scratch_bo)
                queue->device->ws->buffer_destroy(queue->scratch_bo);
+       if (queue->esgs_ring_bo)
+               queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
+       if (queue->gsvs_ring_bo)
+               queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
        if (queue->compute_scratch_bo)
                queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
 }
@@ -1046,24 +1050,118 @@ static void radv_dump_trace(struct radv_device *device,
        fclose(f);
 }
 
+/* Write the four geometry-ring buffer descriptors into map[4..19]; map[0..3]
+ * is left for the scratch descriptor and padding.  Either ring BO may be
+ * NULL (the caller only requires one of them); a missing ring gets VA 0. */
+static void
+fill_geom_rings(struct radv_queue *queue,
+               uint32_t *map,
+               uint32_t esgs_ring_size,
+               struct radeon_winsys_bo *esgs_ring_bo,
+               uint32_t gsvs_ring_size,
+               struct radeon_winsys_bo *gsvs_ring_bo)
+{
+       uint64_t esgs_va = esgs_ring_bo ? queue->device->ws->buffer_get_va(esgs_ring_bo) : 0;
+       uint64_t gsvs_va = gsvs_ring_bo ? queue->device->ws->buffer_get_va(gsvs_ring_bo) : 0;
+       uint32_t *desc = &map[4];
+
+       /* ES->GS ring, swizzled view (presumably the ES-side entry):
+          stride 0, num records = size, add tid, elsize4, index stride 64 */
+       desc[0] = esgs_va;
+       desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) |
+               S_008F04_STRIDE(0) |
+               S_008F04_SWIZZLE_ENABLE(true);
+       desc[2] = esgs_ring_size;
+       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+               S_008F0C_ELEMENT_SIZE(1) |
+               S_008F0C_INDEX_STRIDE(3) |
+               S_008F0C_ADD_TID_ENABLE(true);
+
+       desc += 4;
+       /* GS entry for ES->GS ring:
+          stride 0, num records = size, elsize0, index stride 0 */
+       desc[0] = esgs_va;
+       desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32)|
+               S_008F04_STRIDE(0) |
+               S_008F04_SWIZZLE_ENABLE(false);
+       desc[2] = esgs_ring_size;
+       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+               S_008F0C_ELEMENT_SIZE(0) |
+               S_008F0C_INDEX_STRIDE(0) |
+               S_008F0C_ADD_TID_ENABLE(false);
+
+       desc += 4;
+       /* VS entry for GS->VS ring:
+          stride 0, num records = size, elsize0, index stride 0 */
+       desc[0] = gsvs_va;
+       desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)|
+               S_008F04_STRIDE(0) |
+               S_008F04_SWIZZLE_ENABLE(false);
+       desc[2] = gsvs_ring_size;
+       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+               S_008F0C_ELEMENT_SIZE(0) |
+               S_008F0C_INDEX_STRIDE(0) |
+               S_008F0C_ADD_TID_ENABLE(false);
+       desc += 4;
+
+       /* GS->VS ring, swizzled view: stride gsvs_itemsize, num records 64
+          elsize 4, index stride 16 */
+       /* shader will patch stride and desc[2] */
+       desc[0] = gsvs_va;
+       desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)|
+               S_008F04_STRIDE(0) |
+               S_008F04_SWIZZLE_ENABLE(true);
+       desc[2] = 0;
+       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+               S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+               S_008F0C_ELEMENT_SIZE(1) |
+               S_008F0C_INDEX_STRIDE(1) |
+               S_008F0C_ADD_TID_ENABLE(true);
+}
+
 static VkResult
 radv_get_preamble_cs(struct radv_queue *queue,
                      uint32_t scratch_size,
                      uint32_t compute_scratch_size,
+                    uint32_t esgs_ring_size,
+                    uint32_t gsvs_ring_size,
                      struct radeon_winsys_cs **preamble_cs)
 {
        struct radeon_winsys_bo *scratch_bo = NULL;
        struct radeon_winsys_bo *descriptor_bo = NULL;
        struct radeon_winsys_bo *compute_scratch_bo = NULL;
+       struct radeon_winsys_bo *esgs_ring_bo = NULL;
+       struct radeon_winsys_bo *gsvs_ring_bo = NULL;
        struct radeon_winsys_cs *cs = NULL;
 
-       if (!scratch_size && !compute_scratch_size) {
+       if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) {
                *preamble_cs = NULL;
                return VK_SUCCESS;
        }
 
        if (scratch_size <= queue->scratch_size &&
-           compute_scratch_size <= queue->compute_scratch_size) {
+           compute_scratch_size <= queue->compute_scratch_size &&
+           esgs_ring_size <= queue->esgs_ring_size &&
+           gsvs_ring_size <= queue->gsvs_ring_size) {
                *preamble_cs = queue->preamble_cs;
                return VK_SUCCESS;
        }
@@ -1091,9 +1189,43 @@ radv_get_preamble_cs(struct radv_queue *queue,
        } else
                compute_scratch_bo = queue->compute_scratch_bo;
 
-       if (scratch_bo != queue->scratch_bo) {
+       if (esgs_ring_size > queue->esgs_ring_size) {
+               esgs_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
+                                                               esgs_ring_size,
+                                                               4096,
+                                                               RADEON_DOMAIN_VRAM,
+                                                               RADEON_FLAG_NO_CPU_ACCESS);
+               if (!esgs_ring_bo)
+                       goto fail;
+       } else {
+               esgs_ring_bo = queue->esgs_ring_bo;
+               esgs_ring_size = queue->esgs_ring_size;
+       }
+
+       if (gsvs_ring_size > queue->gsvs_ring_size) {
+               gsvs_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
+                                                               gsvs_ring_size,
+                                                               4096,
+                                                               RADEON_DOMAIN_VRAM,
+                                                               RADEON_FLAG_NO_CPU_ACCESS);
+               if (!gsvs_ring_bo)
+                       goto fail;
+       } else {
+               gsvs_ring_bo = queue->gsvs_ring_bo;
+               gsvs_ring_size = queue->gsvs_ring_size;
+       }
+
+       if (scratch_bo != queue->scratch_bo ||
+           esgs_ring_bo != queue->esgs_ring_bo ||
+           gsvs_ring_bo != queue->gsvs_ring_bo) {
+               uint32_t size = 0;
+               if (gsvs_ring_bo || esgs_ring_bo)
+                       size = 80; /* 2 dword + 2 padding + 4 dword * 4 */
+               else if (scratch_bo)
+                       size = 8; /* 2 dword */
+
                descriptor_bo = queue->device->ws->buffer_create(queue->device->ws,
-                                                                8,
+                                                                size,
                                                                 4096,
                                                                 RADEON_DOMAIN_VRAM,
                                                                 RADEON_FLAG_CPU_ACCESS);
@@ -1111,22 +1243,49 @@ radv_get_preamble_cs(struct radv_queue *queue,
        if (scratch_bo)
                queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);
 
+       if (esgs_ring_bo)
+               queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8);
+
+       if (gsvs_ring_bo)
+               queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
+
        if (descriptor_bo)
                queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
 
        if (descriptor_bo != queue->descriptor_bo) {
-               uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo);
-               uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
-                                S_008F04_SWIZZLE_ENABLE(1);
-
                uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
 
-               map[0] = scratch_va;
-               map[1] = rsrc1;
+               if (scratch_bo) {
+                       uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo);
+                       uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+                               S_008F04_SWIZZLE_ENABLE(1);
+                       map[0] = scratch_va;
+                       map[1] = rsrc1;
+               }
+
+               if (esgs_ring_bo || gsvs_ring_bo)
+                       fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
 
                queue->device->ws->buffer_unmap(descriptor_bo);
        }
 
+       if (esgs_ring_bo || gsvs_ring_bo) {
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+               radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+               radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+
+               if (queue->device->physical_device->rad_info.chip_class >= CIK) {
+                       radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
+                       radeon_emit(cs, esgs_ring_size >> 8);
+                       radeon_emit(cs, gsvs_ring_size >> 8);
+               } else {
+                       radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
+                       radeon_emit(cs, esgs_ring_size >> 8);
+                       radeon_emit(cs, gsvs_ring_size >> 8);
+               }
+       }
+
        if (descriptor_bo) {
                uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
                                   R_00B130_SPI_SHADER_USER_DATA_VS_0,
@@ -1178,6 +1337,20 @@ radv_get_preamble_cs(struct radv_queue *queue,
                queue->compute_scratch_size = compute_scratch_size;
        }
 
+       if (esgs_ring_bo != queue->esgs_ring_bo) {
+               if (queue->esgs_ring_bo)
+                       queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
+               queue->esgs_ring_bo = esgs_ring_bo;
+               queue->esgs_ring_size = esgs_ring_size;
+       }
+
+       if (gsvs_ring_bo != queue->gsvs_ring_bo) {
+               if (queue->gsvs_ring_bo)
+                       queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
+               queue->gsvs_ring_bo = gsvs_ring_bo;
+               queue->gsvs_ring_size = gsvs_ring_size;
+       }
+
        if (descriptor_bo != queue->descriptor_bo) {
                if (queue->descriptor_bo)
                        queue->device->ws->buffer_destroy(queue->descriptor_bo);
@@ -1196,6 +1369,10 @@ fail:
                queue->device->ws->buffer_destroy(scratch_bo);
        if (compute_scratch_bo && compute_scratch_bo != queue->compute_scratch_bo)
                queue->device->ws->buffer_destroy(compute_scratch_bo);
+       if (esgs_ring_bo && esgs_ring_bo != queue->esgs_ring_bo)
+               queue->device->ws->buffer_destroy(esgs_ring_bo);
+       if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo)
+               queue->device->ws->buffer_destroy(gsvs_ring_bo);
        return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 }
 
@@ -1213,6 +1390,7 @@ VkResult radv_QueueSubmit(
        uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX;
        uint32_t scratch_size = 0;
        uint32_t compute_scratch_size = 0;
+       uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
        struct radeon_winsys_cs *preamble_cs = NULL;
        VkResult result;
 
@@ -1226,10 +1404,12 @@ VkResult radv_QueueSubmit(
                        scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed);
                        compute_scratch_size = MAX2(compute_scratch_size,
                                                    cmd_buffer->compute_scratch_size_needed);
+                       esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
+                       gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
                }
        }
 
-       result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, &preamble_cs);
+       result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, esgs_ring_size, gsvs_ring_size, &preamble_cs);
        if (result != VK_SUCCESS)
                return result;
 
index 3a0318b9fc2c797ddd926a1feea3a3cf3cb93aa4..57aa9ead9b75e1f7e170a336a8a598bb29f0546c 100644 (file)
@@ -470,10 +470,14 @@ struct radv_queue {
 
        uint32_t scratch_size;
        uint32_t compute_scratch_size;
+       uint32_t esgs_ring_size;
+       uint32_t gsvs_ring_size;
 
        struct radeon_winsys_bo *scratch_bo;
        struct radeon_winsys_bo *descriptor_bo;
        struct radeon_winsys_bo *compute_scratch_bo;
+       struct radeon_winsys_bo *esgs_ring_bo;
+       struct radeon_winsys_bo *gsvs_ring_bo;
        struct radeon_winsys_cs *preamble_cs;
 };
 
@@ -742,6 +746,10 @@ struct radv_cmd_buffer {
 
        uint32_t scratch_size_needed;
        uint32_t compute_scratch_size_needed;
+       uint32_t esgs_ring_size_needed;
+       uint32_t gsvs_ring_size_needed;
+
+       int ring_offsets_idx; /* just used for verification */
 };
 
 struct radv_image;