From b4eb029062a944c428d6214447a852318e36016e Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 5 Oct 2018 18:04:56 +0200 Subject: [PATCH] radv: implement VK_EXT_transform_feedback This implementation should work and potential bugs can be fixed during the release candidates window anyway. Signed-off-by: Samuel Pitoiset Reviewed-by: Dave Airlie --- src/amd/common/sid.h | 1 + src/amd/vulkan/radv_cmd_buffer.c | 385 +++++++++++++++++++++++++++++- src/amd/vulkan/radv_device.c | 22 ++ src/amd/vulkan/radv_extensions.py | 1 + src/amd/vulkan/radv_pipeline.c | 19 ++ src/amd/vulkan/radv_private.h | 30 +++ src/amd/vulkan/radv_query.c | 111 ++++++++- src/amd/vulkan/radv_shader.c | 9 +- src/amd/vulkan/si_cmd_buffer.c | 6 + 9 files changed, 568 insertions(+), 16 deletions(-) diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h index 69b532177ac..d88ecf55806 100644 --- a/src/amd/common/sid.h +++ b/src/amd/common/sid.h @@ -119,6 +119,7 @@ #define STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE 1 #define STRMOUT_OFFSET_FROM_MEM 2 #define STRMOUT_OFFSET_NONE 3 +#define STRMOUT_DATA_TYPE(x) (((unsigned)(x) & 0x1) << 7) #define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x) & 0x3) << 8) #define PKT3_DRAW_INDEX_OFFSET_2 0x35 #define PKT3_WRITE_DATA 0x37 diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index e21aaa9535d..63a1fd6dff3 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -196,6 +196,23 @@ radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->state.dirty |= dest_mask; } +static void +radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline) +{ + struct radv_streamout_state *so = &cmd_buffer->state.streamout; + struct radv_shader_info *info; + + if (!pipeline->streamout_shader) + return; + + info = &pipeline->streamout_shader->info.info; + for (int i = 0; i < MAX_SO_BUFFERS; i++) + so->stride_in_dw[i] = info->so.strides[i]; + + so->enabled_stream_buffers_mask = 
info->so.enabled_stream_buffers_mask; +} + bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer) { return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE && @@ -1875,10 +1892,94 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER; } +static void +radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + struct radv_userdata_info *loc; + uint32_t base_reg; + + for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) { + if (!radv_get_shader(pipeline, stage)) + continue; + + loc = radv_lookup_user_sgpr(pipeline, stage, + AC_UD_STREAMOUT_BUFFERS); + if (loc->sgpr_idx == -1) + continue; + + base_reg = pipeline->user_data_0[stage]; + + radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, + base_reg + loc->sgpr_idx * 4, va, false); + } + + if (pipeline->gs_copy_shader) { + loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS]; + if (loc->sgpr_idx != -1) { + base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0; + + radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, + base_reg + loc->sgpr_idx * 4, va, false); + } + } +} + +static void +radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer) +{ + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) { + struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; + struct radv_streamout_state *so = &cmd_buffer->state.streamout; + unsigned so_offset; + void *so_ptr; + uint64_t va; + + /* Allocate some descriptor state for streamout buffers. 
*/ + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, + MAX_SO_BUFFERS * 16, 256, + &so_offset, &so_ptr)) + return; + + for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) { + struct radv_buffer *buffer = sb[i].buffer; + uint32_t *desc = &((uint32_t *)so_ptr)[i * 4]; + + if (!(so->enabled_mask & (1 << i))) + continue; + + va = radv_buffer_get_va(buffer->bo) + buffer->offset; + + /* Set the descriptor. + * + * On VI, the format must be non-INVALID, otherwise + * the buffer will be considered not bound and store + * instructions will be no-ops. + */ + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); + desc[2] = 0xffffffff; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); + va += so_offset; + + radv_emit_streamout_buffers(cmd_buffer, va); + } + + cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER; +} + static void radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) { radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty); + radv_flush_streamout_descriptors(cmd_buffer); radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS); radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS); } @@ -1969,7 +2070,8 @@ static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | - VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT)) { + VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | + VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) { cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH; } } @@ -1993,6 +2095,8 @@ radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, for_each_bit(b, src_flags) { switch ((VkAccessFlagBits)(1 
<< b)) { case VK_ACCESS_SHADER_WRITE_BIT: + case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT: + case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: flush_bits |= RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2; break; case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: @@ -2062,6 +2166,7 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, switch ((VkAccessFlagBits)(1 << b)) { case VK_ACCESS_INDIRECT_COMMAND_READ_BIT: case VK_ACCESS_INDEX_READ_BIT: + case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: break; case VK_ACCESS_UNIFORM_READ_BIT: flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 | RADV_CMD_FLAG_INV_SMEM_L1; @@ -2716,6 +2821,7 @@ void radv_CmdBindPipeline( cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS; radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state); + radv_bind_streamout_state(cmd_buffer, pipeline); if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed) cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size; @@ -3138,12 +3244,13 @@ static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned in static void radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, - uint32_t vertex_count) + uint32_t vertex_count, + bool use_opaque) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating)); radeon_emit(cmd_buffer->cs, vertex_count); radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | - S_0287F0_USE_OPAQUE(0)); + S_0287F0_USE_OPAQUE(use_opaque)); } static void @@ -3247,6 +3354,12 @@ struct radv_draw_info { */ struct radv_buffer *count_buffer; uint64_t count_buffer_offset; + + /** + * Stream output parameters resource. 
+ */ + struct radv_buffer *strmout_buffer; + uint64_t strmout_buffer_offset; }; static void @@ -3257,6 +3370,27 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys *ws = cmd_buffer->device->ws; struct radeon_cmdbuf *cs = cmd_buffer->cs; + if (info->strmout_buffer) { + uint64_t va = radv_buffer_get_va(info->strmout_buffer->bo); + + va += info->strmout_buffer->offset + + info->strmout_buffer_offset; + + radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, + info->stride); + + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_REG) | + COPY_DATA_WR_CONFIRM); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2); + radeon_emit(cs, 0); /* unused */ + + radv_cs_add_buffer(ws, cs, info->strmout_buffer->bo); + } + if (info->indirect) { uint64_t va = radv_buffer_get_va(info->indirect->bo); uint64_t count_va = 0; @@ -3341,14 +3475,17 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer, } } else { if (!state->subpass->view_mask) { - radv_cs_emit_draw_packet(cmd_buffer, info->count); + radv_cs_emit_draw_packet(cmd_buffer, + info->count, + !!info->strmout_buffer); } else { unsigned i; for_each_bit(i, state->subpass->view_mask) { radv_emit_view_index(cmd_buffer, i); radv_cs_emit_draw_packet(cmd_buffer, - info->count); + info->count, + !!info->strmout_buffer); } } } @@ -3442,6 +3579,8 @@ static void radv_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info) { + struct radeon_info *rad_info = + &cmd_buffer->device->physical_device->rad_info; bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= CIK; bool pipeline_is_dirty = @@ -3511,6 +3650,16 @@ radv_draw(struct radv_cmd_buffer *cmd_buffer, } } + /* Workaround for a VGT hang when streamout is enabled. + * It must be done after drawing. 
+ */ + if (cmd_buffer->state.streamout.streamout_enabled && + (rad_info->family == CHIP_HAWAII || + rad_info->family == CHIP_TONGA || + rad_info->family == CHIP_FIJI)) { + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC; + } + assert(cmd_buffer->cs->cdw <= cdw_max); radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH); } @@ -4486,3 +4635,229 @@ void radv_CmdEndConditionalRenderingEXT( cmd_buffer->state.predication_type = -1; cmd_buffer->state.predication_va = 0; } + +/* VK_EXT_transform_feedback */ +void radv_CmdBindTransformFeedbackBuffersEXT( + VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer* pBuffers, + const VkDeviceSize* pOffsets, + const VkDeviceSize* pSizes) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; + uint8_t enabled_mask = 0; + + assert(firstBinding + bindingCount <= MAX_SO_BUFFERS); + for (uint32_t i = 0; i < bindingCount; i++) { + uint32_t idx = firstBinding + i; + /* pSizes may be NULL and entries may be VK_WHOLE_SIZE: both mean "up to the end of the buffer". */ + sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]); + sb[idx].offset = pOffsets[i]; + sb[idx].size = (!pSizes || pSizes[i] == VK_WHOLE_SIZE) ? sb[idx].buffer->size - sb[idx].offset : pSizes[i]; + + radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, + sb[idx].buffer->bo); + + enabled_mask |= 1 << idx; + } + /* Accumulate, so that slots bound by earlier calls stay enabled. */ + cmd_buffer->state.streamout.enabled_mask |= enabled_mask; + + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER; +} + +static void +radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_streamout_state *so = &cmd_buffer->state.streamout; + struct radeon_cmdbuf *cs = cmd_buffer->cs; + + radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2); + radeon_emit(cs, + S_028B94_STREAMOUT_0_EN(so->streamout_enabled) | + S_028B94_RAST_STREAM(0) | + S_028B94_STREAMOUT_1_EN(so->streamout_enabled) | + S_028B94_STREAMOUT_2_EN(so->streamout_enabled) | + S_028B94_STREAMOUT_3_EN(so->streamout_enabled)); + radeon_emit(cs, so->hw_enabled_mask & + 
so->enabled_stream_buffers_mask); +} + +static void +radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable) +{ + struct radv_streamout_state *so = &cmd_buffer->state.streamout; + bool old_streamout_enabled = so->streamout_enabled; + uint32_t old_hw_enabled_mask = so->hw_enabled_mask; + + so->streamout_enabled = enable; + + so->hw_enabled_mask = so->enabled_mask | + (so->enabled_mask << 4) | + (so->enabled_mask << 8) | + (so->enabled_mask << 12); + + if ((old_streamout_enabled != so->streamout_enabled) || + (old_hw_enabled_mask != so->hw_enabled_mask)) + radv_emit_streamout_enable(cmd_buffer); +} + +static void radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer) +{ + struct radeon_cmdbuf *cs = cmd_buffer->cs; + unsigned reg_strmout_cntl; + + /* The register is at different places on different ASICs. */ + if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) { + reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; + radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); + } else { + reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; + radeon_set_config_reg(cs, reg_strmout_cntl, 0); + } + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); + + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ + radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ + radeon_emit(cs, 0); + radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ + radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ + radeon_emit(cs, 4); /* poll interval */ +} + +void radv_CmdBeginTransformFeedbackEXT( + VkCommandBuffer commandBuffer, + uint32_t firstBuffer, + uint32_t bufferCount, + const VkBuffer* pCounterBuffers, + const VkDeviceSize* pCounterBufferOffsets) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_streamout_binding *sb = 
cmd_buffer->streamout_bindings; + struct radv_streamout_state *so = &cmd_buffer->state.streamout; + struct radeon_cmdbuf *cs = cmd_buffer->cs; + + radv_flush_vgt_streamout(cmd_buffer); + /* Every bound buffer is programmed below; pCounterBuffers[n] pairs with buffer firstBuffer + n. */ + assert(firstBuffer + bufferCount <= MAX_SO_BUFFERS); + for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) { + if (!(so->enabled_mask & (1 << i))) + continue; + + /* SI binds streamout buffers as shader resources. + * VGT only counts primitives and tells the shader through + * SGPRs what to do. + */ + radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2); + radeon_emit(cs, (sb[i].offset + sb[i].size) >> 2); /* BUFFER_SIZE (in DW) */ + radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */ + + if (pCounterBuffers && i >= firstBuffer && i - firstBuffer < bufferCount && pCounterBuffers[i - firstBuffer]) { + /* The array of counter buffers is optional, and so is the array of offsets. */ + RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[i - firstBuffer]); + uint64_t va = radv_buffer_get_va(buffer->bo); + + va += buffer->offset + (pCounterBufferOffsets ? pCounterBufferOffsets[i - firstBuffer] : 0); + + /* Append */ + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | + STRMOUT_DATA_TYPE(1) | /* offset in bytes */ + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, va); /* src address lo */ + radeon_emit(cs, va >> 32); /* src address hi */ + + radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); + } else { + /* Start from the beginning. 
*/ + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | + STRMOUT_DATA_TYPE(1) | /* offset in bytes */ + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, sb[i].offset >> 2); /* buffer offset in DW */ + radeon_emit(cs, 0); /* unused */ + } + } + + radv_set_streamout_enable(cmd_buffer, true); +} + +void radv_CmdEndTransformFeedbackEXT( + VkCommandBuffer commandBuffer, + uint32_t firstBuffer, + uint32_t bufferCount, + const VkBuffer* pCounterBuffers, + const VkDeviceSize* pCounterBufferOffsets) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_streamout_state *so = &cmd_buffer->state.streamout; + struct radeon_cmdbuf *cs = cmd_buffer->cs; + + radv_flush_vgt_streamout(cmd_buffer); + /* Every bound buffer is deactivated below; pCounterBuffers[n] pairs with buffer firstBuffer + n. */ + assert(firstBuffer + bufferCount <= MAX_SO_BUFFERS); + for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) { + if (!(so->enabled_mask & (1 << i))) + continue; + + if (pCounterBuffers && i >= firstBuffer && i - firstBuffer < bufferCount && pCounterBuffers[i - firstBuffer]) { + /* The array of counter buffers is optional, and so is the array of offsets. */ + RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[i - firstBuffer]); + uint64_t va = radv_buffer_get_va(buffer->bo); + + va += buffer->offset + (pCounterBufferOffsets ? pCounterBufferOffsets[i - firstBuffer] : 0); + + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | + STRMOUT_DATA_TYPE(1) | /* offset in bytes */ + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | + STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ + radeon_emit(cs, va); /* dst address lo */ + radeon_emit(cs, va >> 32); /* dst address hi */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + + radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); + } + + /* Deactivate transform feedback by zeroing the buffer size. + * The counters (primitives generated, primitives emitted) may + * be enabled even if there is no buffer bound. 
This ensures + * that the primitives-emitted query won't increment. + */ + radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); + } + + radv_set_streamout_enable(cmd_buffer, false); +} + +void radv_CmdDrawIndirectByteCountEXT( + VkCommandBuffer commandBuffer, + uint32_t instanceCount, + uint32_t firstInstance, + VkBuffer _counterBuffer, + VkDeviceSize counterBufferOffset, + uint32_t counterOffset, + uint32_t vertexStride) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer); + struct radv_draw_info info = {}; + + info.instance_count = instanceCount; + info.first_instance = firstInstance; + info.strmout_buffer = counterBuffer; + info.strmout_buffer_offset = counterBufferOffset; + info.stride = vertexStride; + + radv_draw(cmd_buffer, &info); +} diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 1c6be300b46..508fbb27721 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -840,6 +840,13 @@ void radv_GetPhysicalDeviceFeatures2( features->vertexAttributeInstanceRateZeroDivisor = VK_TRUE; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: { + VkPhysicalDeviceTransformFeedbackFeaturesEXT *features = + (VkPhysicalDeviceTransformFeedbackFeaturesEXT*)ext; + features->transformFeedback = true; + features->geometryStreams = true; + break; + } default: break; } @@ -1213,6 +1220,21 @@ void radv_GetPhysicalDeviceProperties2( }; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: { + VkPhysicalDeviceTransformFeedbackPropertiesEXT *properties = + (VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext; + properties->maxTransformFeedbackStreams = MAX_SO_STREAMS; + properties->maxTransformFeedbackBuffers = MAX_SO_BUFFERS; + properties->maxTransformFeedbackBufferSize = UINT32_MAX; + properties->maxTransformFeedbackStreamDataSize = 512; + 
properties->maxTransformFeedbackBufferDataSize = UINT32_MAX; + properties->maxTransformFeedbackBufferDataStride = 512; + properties->transformFeedbackQueries = true; + properties->transformFeedbackStreamsLinesTriangles = false; + properties->transformFeedbackRasterizationStreamSelect = false; + properties->transformFeedbackDraw = true; + break; + } default: break; } diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py index 0024bf0efe1..6bdf988d117 100644 --- a/src/amd/vulkan/radv_extensions.py +++ b/src/amd/vulkan/radv_extensions.py @@ -109,6 +109,7 @@ EXTENSIONS = [ Extension('VK_EXT_sampler_filter_minmax', 1, 'device->rad_info.chip_class >= CIK'), Extension('VK_EXT_shader_viewport_index_layer', 1, True), Extension('VK_EXT_shader_stencil_export', 1, True), + Extension('VK_EXT_transform_feedback', 1, True), Extension('VK_EXT_vertex_attribute_divisor', 3, True), Extension('VK_AMD_draw_indirect_count', 1, True), Extension('VK_AMD_gcn_shader', 1, True), diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index f69811559c1..4babb24a8eb 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3482,6 +3482,22 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline, } } +static struct radv_shader_variant * +radv_pipeline_get_streamout_shader(struct radv_pipeline *pipeline) +{ + int i; + + for (i = MESA_SHADER_GEOMETRY; i >= MESA_SHADER_VERTEX; i--) { + struct radv_shader_variant *shader = + radv_get_shader(pipeline, i); + + if (shader && shader->info.info.so.num_outputs > 0) + return shader; + } + + return NULL; +} + static VkResult radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device, @@ -3597,6 +3613,9 @@ radv_pipeline_init(struct radv_pipeline *pipeline, pipeline->graphics.vtx_emit_num = 2; } + /* Find the last vertex shader stage that eventually uses streamout. 
*/ + pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline); + result = radv_pipeline_scratch_init(device, pipeline); radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, prim, gs_out); diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 14ae0a44ff8..7e9e82e3158 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -843,6 +843,7 @@ enum radv_cmd_dirty_bits { RADV_CMD_DIRTY_INDEX_BUFFER = 1 << 11, RADV_CMD_DIRTY_FRAMEBUFFER = 1 << 12, RADV_CMD_DIRTY_VERTEX_BUFFER = 1 << 13, + RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1 << 14, }; enum radv_cmd_flush_bits { @@ -868,6 +869,7 @@ enum radv_cmd_flush_bits { /* Pipeline query controls. */ RADV_CMD_FLAG_START_PIPELINE_STATS = 1 << 13, RADV_CMD_FLAG_STOP_PIPELINE_STATS = 1 << 14, + RADV_CMD_FLAG_VGT_STREAMOUT_SYNC = 1 << 15, RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER = (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | @@ -880,6 +882,29 @@ struct radv_vertex_binding { VkDeviceSize offset; }; +struct radv_streamout_binding { + struct radv_buffer *buffer; + VkDeviceSize offset; + VkDeviceSize size; +}; + +struct radv_streamout_state { + /* Mask of bound streamout buffers. */ + uint8_t enabled_mask; + + /* External state that comes from the last vertex stage, it must be + * set explicitly when binding a new graphics pipeline. 
+ */ + uint16_t stride_in_dw[MAX_SO_BUFFERS]; + uint32_t enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */ + + /* State of VGT_STRMOUT_BUFFER_(CONFIG|END) */ + uint32_t hw_enabled_mask; + + /* State of VGT_STRMOUT_(CONFIG|EN) */ + bool streamout_enabled; +}; + struct radv_viewport_state { uint32_t count; VkViewport viewports[MAX_VIEWPORTS]; @@ -987,6 +1012,7 @@ struct radv_cmd_state { const struct radv_subpass * subpass; struct radv_dynamic_state dynamic; struct radv_attachment_state * attachments; + struct radv_streamout_state streamout; VkRect2D render_area; /* Index buffer */ @@ -1056,6 +1082,7 @@ struct radv_cmd_buffer { struct radeon_cmdbuf *cs; struct radv_cmd_state state; struct radv_vertex_binding vertex_bindings[MAX_VBS]; + struct radv_streamout_binding streamout_bindings[MAX_SO_BUFFERS]; uint32_t queue_family_index; uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE]; @@ -1353,6 +1380,9 @@ struct radv_pipeline { unsigned max_waves; unsigned scratch_bytes_per_wave; + + /* Not NULL if graphics pipeline uses streamout. 
*/ + struct radv_shader_variant *streamout_shader; }; static inline bool radv_pipeline_has_gs(const struct radv_pipeline *pipeline) diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c index 63a2b6a7d7c..5a326c9df54 100644 --- a/src/amd/vulkan/radv_query.c +++ b/src/amd/vulkan/radv_query.c @@ -789,6 +789,9 @@ VkResult radv_CreateQueryPool( case VK_QUERY_TYPE_TIMESTAMP: pool->stride = 8; break; + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + pool->stride = 32; + break; default: unreachable("creating unhandled query type"); } @@ -951,6 +954,44 @@ VkResult radv_GetQueryPoolResults( } break; } + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + volatile uint64_t const *src64 = (volatile uint64_t const *)src; + uint64_t num_primitives_written; + uint64_t primitive_storage_needed; + + /* SAMPLE_STREAMOUTSTATS stores this structure: + * { + * u64 NumPrimitivesWritten; + * u64 PrimitiveStorageNeeded; + * } + */ + available = 1; + for (int j = 0; j < 4; j++) { + if (!(src64[j] & 0x8000000000000000UL)) + available = 0; + } + + if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) { + result = VK_NOT_READY; + break; + } + + num_primitives_written = src64[3] - src64[1]; + primitive_storage_needed = src64[2] - src64[0]; + + if (flags & VK_QUERY_RESULT_64_BIT) { + *(uint64_t *)dest = num_primitives_written; + dest += 8; + *(uint64_t *)dest = primitive_storage_needed; + dest += 8; + } else { + *(uint32_t *)dest = num_primitives_written; + dest += 4; + *(uint32_t *)dest = primitive_storage_needed; + dest += 4; + } + break; + } default: unreachable("trying to get results of unhandled query type"); } @@ -1109,10 +1150,22 @@ void radv_CmdResetQueryPool( } } +static unsigned event_type_for_stream(unsigned stream) +{ + switch (stream) { + default: + case 0: return V_028A90_SAMPLE_STREAMOUTSTATS; + case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1; + case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2; + case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3; + } +} + 
static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va, VkQueryType query_type, - VkQueryControlFlags flags) + VkQueryControlFlags flags, + uint32_t index) { struct radeon_cmdbuf *cs = cmd_buffer->cs; switch (query_type) { @@ -1161,6 +1214,16 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cs, va); radeon_emit(cs, va >> 32); break; + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + radeon_check_space(cmd_buffer->device->ws, cs, 4); + + assert(index < MAX_SO_STREAMS); + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + break; default: unreachable("beginning unhandled query type"); } @@ -1169,7 +1232,7 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer, static void emit_end_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint64_t avail_va, - VkQueryType query_type) + VkQueryType query_type, uint32_t index) { struct radeon_cmdbuf *cs = cmd_buffer->cs; switch (query_type) { @@ -1215,16 +1278,27 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer, avail_va, 0, 1, cmd_buffer->gfx9_eop_bug_va); break; + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + radeon_check_space(cmd_buffer->device->ws, cs, 4); + + assert(index < MAX_SO_STREAMS); + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3)); + radeon_emit(cs, (va + 16)); + radeon_emit(cs, (va + 16) >> 32); + break; default: unreachable("ending unhandled query type"); } } -void radv_CmdBeginQuery( +void radv_CmdBeginQueryIndexedEXT( VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, - VkQueryControlFlags flags) + VkQueryControlFlags flags, + uint32_t index) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); @@ -1247,14 +1321,23 @@ void 
radv_CmdBeginQuery( va += pool->stride * query; - emit_begin_query(cmd_buffer, va, pool->type, flags); + emit_begin_query(cmd_buffer, va, pool->type, flags, index); } +void radv_CmdBeginQuery( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query, + VkQueryControlFlags flags) +{ + radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0); +} -void radv_CmdEndQuery( +void radv_CmdEndQueryIndexedEXT( VkCommandBuffer commandBuffer, VkQueryPool queryPool, - uint32_t query) + uint32_t query, + uint32_t index) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); @@ -1265,7 +1348,7 @@ void radv_CmdEndQuery( /* Do not need to add the pool BO to the list because the query must * currently be active, which means the BO is already in the list. */ - emit_end_query(cmd_buffer, va, avail_va, pool->type); + emit_end_query(cmd_buffer, va, avail_va, pool->type, index); /* * For multiview we have to emit a query for each bit in the mask, @@ -1282,12 +1365,20 @@ void radv_CmdEndQuery( for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) { va += pool->stride; avail_va += 4; - emit_begin_query(cmd_buffer, va, pool->type, 0); - emit_end_query(cmd_buffer, va, avail_va, pool->type); + emit_begin_query(cmd_buffer, va, pool->type, 0, 0); + emit_end_query(cmd_buffer, va, avail_va, pool->type, 0); } } } +void radv_CmdEndQuery( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query) +{ + radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0); +} + void radv_CmdWriteTimestamp( VkCommandBuffer commandBuffer, VkPipelineStageFlagBits pipelineStage, diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index edeaefbc1a2..f98ca6b4edd 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -243,6 +243,8 @@ radv_shader_compile_to_nir(struct radv_device *device, .runtime_descriptor_array = true, 
.stencil_export = true, .storage_16bit = true, + .geometry_streams = true, + .transform_feedback = true, }, }; entry_point = spirv_to_nir(spirv, module->size / 4, @@ -434,7 +436,12 @@ radv_fill_shader_variant(struct radv_device *device, variant->code_size = radv_get_shader_binary_size(binary); variant->rsrc2 = S_00B12C_USER_SGPR(variant->info.num_user_sgprs) | S_00B12C_USER_SGPR_MSB(variant->info.num_user_sgprs >> 5) | - S_00B12C_SCRATCH_EN(scratch_enabled); + S_00B12C_SCRATCH_EN(scratch_enabled) | + S_00B12C_SO_BASE0_EN(!!info->so.strides[0]) | + S_00B12C_SO_BASE1_EN(!!info->so.strides[1]) | + S_00B12C_SO_BASE2_EN(!!info->so.strides[2]) | + S_00B12C_SO_BASE3_EN(!!info->so.strides[3]) | + S_00B12C_SO_EN(!!info->so.num_outputs); variant->rsrc1 = S_00B848_VGPRS((variant->config.num_vgprs - 1) / 4) | S_00B848_SGPRS((variant->config.num_sgprs - 1) / 8) | diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c index 52daf994147..214bcead68c 100644 --- a/src/amd/vulkan/si_cmd_buffer.c +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -883,6 +883,12 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs, radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); } + /* VGT streamout state sync */ + if (flush_bits & RADV_CMD_FLAG_VGT_STREAMOUT_SYNC) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); + } + /* Make sure ME is idle (it executes most packets) before continuing. * This prevents read-after-write hazards between PFP and ME. */ -- 2.30.2