From: Samuel Pitoiset Date: Thu, 16 May 2019 09:55:02 +0000 (+0200) Subject: radv: implement VK_EXT_sample_locations and disable it X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=da26013eb7a216eea98b71ba6e8341a47834e3ec;p=mesa.git radv: implement VK_EXT_sample_locations and disable it Basically, this extension allows applications to use custom sample locations. It doesn't support variable sample locations during subpass. Note that we don't have to upload the user sample locations because the spec doesn't allow this. The extension is currently disabled because the driver needs to support variable sample locations during layout transitions. The depth decompress needs to know them and that's a bit invasive. Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen --- diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index e524c321832..b40113d543a 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -105,6 +105,7 @@ radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, dest->viewport.count = src->viewport.count; dest->scissor.count = src->scissor.count; dest->discard_rectangle.count = src->discard_rectangle.count; + dest->sample_location.count = src->sample_location.count; if (copy_mask & RADV_DYNAMIC_VIEWPORT) { if (memcmp(&dest->viewport.viewports, &src->viewport.viewports, @@ -192,6 +193,22 @@ radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, } } + if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) { + if (dest->sample_location.per_pixel != src->sample_location.per_pixel || + dest->sample_location.grid_size.width != src->sample_location.grid_size.width || + dest->sample_location.grid_size.height != src->sample_location.grid_size.height || + memcmp(&dest->sample_location.locations, + &src->sample_location.locations, + src->sample_location.count * sizeof(VkSampleLocationEXT))) { + dest->sample_location.per_pixel = src->sample_location.per_pixel; + dest->sample_location.grid_size = src->sample_location.grid_size; + typed_memcpy(dest->sample_location.locations, + src->sample_location.locations, + src->sample_location.count); + dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS; + } + } + cmd_buffer->state.dirty |= dest_mask; } @@ -632,6 +649,190 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer, } } +/** + * Convert the user sample locations to hardware sample locations (the values + * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*). + */ +static void +radv_convert_user_sample_locs(struct radv_sample_locations_state *state, + uint32_t x, uint32_t y, VkOffset2D *sample_locs) +{ + uint32_t x_offset = x % state->grid_size.width; + uint32_t y_offset = y % state->grid_size.height; + uint32_t num_samples = (uint32_t)state->per_pixel; + VkSampleLocationEXT *user_locs; + uint32_t pixel_offset; + + pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples; + + assert(pixel_offset <= MAX_SAMPLE_LOCATIONS); + user_locs = &state->locations[pixel_offset]; + + for (uint32_t i = 0; i < num_samples; i++) { + float shifted_pos_x = user_locs[i].x - 0.5; + float shifted_pos_y = user_locs[i].y - 0.5; + + int32_t scaled_pos_x = floor(shifted_pos_x * 16); + int32_t scaled_pos_y = floor(shifted_pos_y * 16); + + sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7); + sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7); + } +} + +/** + * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample + * locations. 
+ */ +static void +radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs, + uint32_t *sample_locs_pixel) +{ + for (uint32_t i = 0; i < num_samples; i++) { + uint32_t sample_reg_idx = i / 4; + uint32_t sample_loc_idx = i % 4; + int32_t pos_x = sample_locs[i].x; + int32_t pos_y = sample_locs[i].y; + + uint32_t shift_x = 8 * sample_loc_idx; + uint32_t shift_y = shift_x + 4; + + sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x; + sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y; + } +} + +/** + * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware + * sample locations. + */ +static uint64_t +radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, + VkOffset2D *sample_locs, + uint32_t num_samples) +{ + uint32_t centroid_priorities[num_samples]; + uint32_t sample_mask = num_samples - 1; + uint32_t distances[num_samples]; + uint64_t centroid_priority = 0; + + /* Compute the distances from center for each sample. */ + for (int i = 0; i < num_samples; i++) { + distances[i] = (sample_locs[i].x * sample_locs[i].x) + + (sample_locs[i].y * sample_locs[i].y); + } + + /* Compute the centroid priorities by looking at the distances array. */ + for (int i = 0; i < num_samples; i++) { + uint32_t min_idx = 0; + + for (int j = 1; j < num_samples; j++) { + if (distances[j] < distances[min_idx]) + min_idx = j; + } + + centroid_priorities[i] = min_idx; + distances[min_idx] = 0xffffffff; + } + + /* Compute the final centroid priority. */ + for (int i = 0; i < 8; i++) { + centroid_priority |= + centroid_priorities[i & sample_mask] << (i * 4); + } + + return centroid_priority << 32 | centroid_priority; +} + +/** + * Emit the sample locations that are specified with VK_EXT_sample_locations. + */ +static void +radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + struct radv_multisample_state *ms = &pipeline->graphics.ms; + struct radv_sample_locations_state *sample_location = + &cmd_buffer->state.dynamic.sample_location; + uint32_t num_samples = (uint32_t)sample_location->per_pixel; + struct radeon_cmdbuf *cs = cmd_buffer->cs; + uint32_t sample_locs_pixel[4][2] = {}; + VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */ + uint32_t max_sample_dist = 0; + uint64_t centroid_priority; + + if (!cmd_buffer->state.dynamic.sample_location.count) + return; + + /* Convert the user sample locations to hardware sample locations. */ + radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]); + radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]); + radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]); + radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]); + + /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */ + for (uint32_t i = 0; i < 4; i++) { + radv_compute_sample_locs_pixel(num_samples, sample_locs[i], + sample_locs_pixel[i]); + } + + /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */ + centroid_priority = + radv_compute_centroid_priority(cmd_buffer, sample_locs[0], + num_samples); + + /* Compute the maximum sample distance from the specified locations. */ + for (uint32_t i = 0; i < num_samples; i++) { + VkOffset2D offset = sample_locs[0][i]; + max_sample_dist = MAX2(max_sample_dist, + MAX2(abs(offset.x), abs(offset.y))); + } + + /* Emit the specified user sample locations. 
*/ + switch (num_samples) { + case 2: + case 4: + radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]); + radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]); + radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]); + radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]); + break; + case 8: + radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]); + radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]); + radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]); + radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]); + radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, sample_locs_pixel[0][1]); + radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, sample_locs_pixel[1][1]); + radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, sample_locs_pixel[2][1]); + radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, sample_locs_pixel[3][1]); + break; + default: + unreachable("invalid number of samples"); + } + + /* Emit the maximum sample distance and the centroid priority. */ + uint32_t pa_sc_aa_config = ms->pa_sc_aa_config; + + pa_sc_aa_config &= C_028BE0_MAX_SAMPLE_DIST; + pa_sc_aa_config |= S_028BE0_MAX_SAMPLE_DIST(max_sample_dist); + + radeon_set_context_reg_seq(cs, R_028BE0_PA_SC_AA_CONFIG, 1); + radeon_emit(cs, pa_sc_aa_config); + + radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); + radeon_emit(cs, centroid_priority); + radeon_emit(cs, centroid_priority >> 32); + + /* GFX9: Flush DFSM when the AA mode changes. 
*/ + if (cmd_buffer->device->dfsm_allowed) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); + } + + cmd_buffer->state.context_roll_without_scissor_emitted = true; +} + static void radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline, @@ -1775,6 +1976,9 @@ radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer) if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE) radv_emit_discard_rectangle(cmd_buffer); + if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) + radv_emit_sample_locations(cmd_buffer); + cmd_buffer->state.dirty &= ~states; } @@ -3272,6 +3476,25 @@ void radv_CmdSetDiscardRectangleEXT( state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE; } +void radv_CmdSetSampleLocationsEXT( + VkCommandBuffer commandBuffer, + const VkSampleLocationsInfoEXT* pSampleLocationsInfo) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_cmd_state *state = &cmd_buffer->state; + + assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS); + + state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel; + state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize; + state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount; + typed_memcpy(&state->dynamic.sample_location.locations[0], + pSampleLocationsInfo->pSampleLocations, + pSampleLocationsInfo->sampleLocationsCount); + + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS; +} + void radv_CmdExecuteCommands( VkCommandBuffer commandBuffer, uint32_t commandBufferCount, diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index c50908bcfcb..9e50038e22a 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1367,6 +1367,27 @@ void radv_GetPhysicalDeviceProperties2( props->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_COUNT; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLE_LOCATIONS_PROPERTIES_EXT: { + VkPhysicalDeviceSampleLocationsPropertiesEXT *properties = + (VkPhysicalDeviceSampleLocationsPropertiesEXT *)ext; + /* TODO: The ext is currently disabled because the + * driver needs to handle sample locations during + * layout transitions for depth/stencil surfaces and + * HTILE. 
+ */ + properties->sampleLocationSampleCounts = VK_SAMPLE_COUNT_1_BIT; + /* + properties->sampleLocationSampleCounts = VK_SAMPLE_COUNT_2_BIT | + VK_SAMPLE_COUNT_4_BIT | + VK_SAMPLE_COUNT_8_BIT; + */ + properties->maxSampleLocationGridSize = (VkExtent2D){ 2 , 2 }; + properties->sampleLocationCoordinateRange[0] = 0.0f; + properties->sampleLocationCoordinateRange[1] = 0.9375f; + properties->sampleLocationSubPixelBits = 4; + properties->variableSampleLocations = VK_FALSE; + break; + } default: break; } @@ -5368,3 +5389,17 @@ VkResult radv_GetCalibratedTimestampsEXT( return VK_SUCCESS; } + +void radv_GetPhysicalDeviceMultisamplePropertiesEXT( + VkPhysicalDevice physicalDevice, + VkSampleCountFlagBits samples, + VkMultisamplePropertiesEXT* pMultisampleProperties) +{ + if (samples & (VK_SAMPLE_COUNT_2_BIT | + VK_SAMPLE_COUNT_4_BIT | + VK_SAMPLE_COUNT_8_BIT)) { + pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){ 2, 2 }; + } else { + pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){ 0, 0 }; + } +} diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py index 0b5af56a435..d6e9d5c034b 100644 --- a/src/amd/vulkan/radv_extensions.py +++ b/src/amd/vulkan/radv_extensions.py @@ -119,6 +119,7 @@ EXTENSIONS = [ Extension('VK_EXT_memory_priority', 1, True), Extension('VK_EXT_pci_bus_info', 2, True), Extension('VK_EXT_pipeline_creation_feedback', 1, True), + Extension('VK_EXT_sample_locations', 1, False), Extension('VK_EXT_sampler_filter_minmax', 1, 'device->rad_info.chip_class >= GFX7'), Extension('VK_EXT_scalar_block_layout', 1, 'device->rad_info.chip_class >= GFX7'), Extension('VK_EXT_shader_viewport_index_layer', 1, True), diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c index 161997ae196..b69dda39ce1 100644 --- a/src/amd/vulkan/radv_image.c +++ b/src/amd/vulkan/radv_image.c @@ -76,6 +76,13 @@ radv_use_tc_compat_htile_for_image(struct radv_device *device, (pCreateInfo->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT)) return false; + /* TODO: Implement layout transitions with variable sample locations + * before enabling HTILE for depth/stencil images created with this + * flags because the depth decompress pass needs to know them. + */ + if (pCreateInfo->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT) + return false; + if (pCreateInfo->tiling == VK_IMAGE_TILING_LINEAR) return false; diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index a3076d421a5..ddcbe465883 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -1267,6 +1267,8 @@ static unsigned radv_dynamic_state_mask(VkDynamicState state) return RADV_DYNAMIC_STENCIL_REFERENCE; case VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT: return RADV_DYNAMIC_DISCARD_RECTANGLE; + case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT: + return RADV_DYNAMIC_SAMPLE_LOCATIONS; default: unreachable("Unhandled dynamic state"); } @@ -1297,6 +1299,11 @@ static uint32_t radv_pipeline_needed_dynamic_state(const VkGraphicsPipelineCreat if (!vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT)) states &= ~RADV_DYNAMIC_DISCARD_RECTANGLE; + if (!pCreateInfo->pMultisampleState || + !vk_find_struct_const(pCreateInfo->pMultisampleState->pNext, + PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT)) + states &= ~RADV_DYNAMIC_SAMPLE_LOCATIONS; + /* TODO: blend constants & line width. 
*/ return states; @@ -1426,6 +1433,29 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline, } } + if (needed_states & RADV_DYNAMIC_SAMPLE_LOCATIONS) { + const VkPipelineSampleLocationsStateCreateInfoEXT *sample_location_info = + vk_find_struct_const(pCreateInfo->pMultisampleState->pNext, + PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT); + /* If sampleLocationsEnable is VK_FALSE, the default sample + * locations are used and the values specified in + * sampleLocationsInfo are ignored. + */ + if (sample_location_info->sampleLocationsEnable) { + const VkSampleLocationsInfoEXT *pSampleLocationsInfo = + &sample_location_info->sampleLocationsInfo; + + assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS); + + dynamic->sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel; + dynamic->sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize; + dynamic->sample_location.count = pSampleLocationsInfo->sampleLocationsCount; + typed_memcpy(&dynamic->sample_location.locations[0], + pSampleLocationsInfo->pSampleLocations, + pSampleLocationsInfo->sampleLocationsCount); + } + } + pipeline->dynamic_state.mask = states; } diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 08b2621685d..828edb56540 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -91,6 +91,7 @@ typedef uint32_t xcb_window_t; #define MAX_VIEWPORTS 16 #define MAX_SCISSORS 16 #define MAX_DISCARD_RECTANGLES 4 +#define MAX_SAMPLE_LOCATIONS 32 #define MAX_PUSH_CONSTANTS_SIZE 128 #define MAX_PUSH_DESCRIPTORS 32 #define MAX_DYNAMIC_UNIFORM_BUFFERS 16 @@ -852,7 +853,8 @@ enum radv_dynamic_state_bits { RADV_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7, RADV_DYNAMIC_STENCIL_REFERENCE = 1 << 8, RADV_DYNAMIC_DISCARD_RECTANGLE = 1 << 9, - RADV_DYNAMIC_ALL = (1 << 10) - 1, + RADV_DYNAMIC_SAMPLE_LOCATIONS = 1 << 10, + RADV_DYNAMIC_ALL = (1 << 11) - 1, }; enum radv_cmd_dirty_bits { @@ -868,12 +870,13 @@ enum radv_cmd_dirty_bits { RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7, RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 8, RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE = 1 << 9, - RADV_CMD_DIRTY_DYNAMIC_ALL = (1 << 10) - 1, - RADV_CMD_DIRTY_PIPELINE = 1 << 10, - RADV_CMD_DIRTY_INDEX_BUFFER = 1 << 11, - RADV_CMD_DIRTY_FRAMEBUFFER = 1 << 12, - RADV_CMD_DIRTY_VERTEX_BUFFER = 1 << 13, - RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1 << 14, + RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS = 1 << 10, + RADV_CMD_DIRTY_DYNAMIC_ALL = (1 << 11) - 1, + RADV_CMD_DIRTY_PIPELINE = 1 << 11, + RADV_CMD_DIRTY_INDEX_BUFFER = 1 << 12, + RADV_CMD_DIRTY_FRAMEBUFFER = 1 << 13, + RADV_CMD_DIRTY_VERTEX_BUFFER = 1 << 14, + RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1 << 15, }; enum radv_cmd_flush_bits { @@ -950,6 +953,13 @@ struct radv_discard_rectangle_state { VkRect2D rectangles[MAX_DISCARD_RECTANGLES]; }; +struct radv_sample_locations_state { + VkSampleCountFlagBits per_pixel; + VkExtent2D grid_size; + uint32_t count; + VkSampleLocationEXT locations[MAX_SAMPLE_LOCATIONS]; +}; + struct radv_dynamic_state { /** * Bitmask of (1 << VK_DYNAMIC_STATE_*). @@ -992,6 +1002,8 @@ struct radv_dynamic_state { } stencil_reference; struct radv_discard_rectangle_state discard_rectangle; + + struct radv_sample_locations_state sample_location; }; extern const struct radv_dynamic_state default_dynamic_state;
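
For reference, a minimal application-side sketch of how the new dynamic-state path is exercised once the extension is actually enabled. This is not part of the patch: it assumes a device created with VK_EXT_sample_locations enabled, a bound graphics pipeline that lists VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT, and a command buffer in the recording state. The app_* name is a placeholder; everything else is the extension's own API.

#include <vulkan/vulkan.h>

/* Hypothetical helper; only illustrates the call sequence. */
static void
app_set_custom_sample_locations(VkDevice device, VkCommandBuffer cmd_buffer)
{
	/* One custom location per sample for 4x MSAA on a 1x1 grid, so
	 * sampleLocationsCount = 1 * 1 * 4.  Coordinates live in the
	 * [0, 1) pixel space that radv_convert_user_sample_locs() shifts
	 * around the pixel center and quantizes to 1/16th increments. */
	const VkSampleLocationEXT locations[4] = {
		{ 0.125f, 0.125f },
		{ 0.875f, 0.375f },
		{ 0.125f, 0.625f },
		{ 0.625f, 0.875f },
	};

	const VkSampleLocationsInfoEXT info = {
		.sType = VK_STRUCTURE_TYPE_SAMPLE_LOCATIONS_INFO_EXT,
		.sampleLocationsPerPixel = VK_SAMPLE_COUNT_4_BIT,
		.sampleLocationGridSize = { 1, 1 },
		.sampleLocationsCount = 4,
		.pSampleLocations = locations,
	};

	/* Device-extension entry points come from vkGetDeviceProcAddr. */
	PFN_vkCmdSetSampleLocationsEXT set_sample_locations =
		(PFN_vkCmdSetSampleLocationsEXT)
		vkGetDeviceProcAddr(device, "vkCmdSetSampleLocationsEXT");

	/* Lands in radv_CmdSetSampleLocationsEXT above: the locations are
	 * copied into the dynamic state and
	 * RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS is set, so
	 * radv_emit_sample_locations() programs
	 * PA_SC_AA_SAMPLE_LOCS_PIXEL_* / PA_SC_CENTROID_PRIORITY_* at the
	 * next draw. */
	set_sample_locations(cmd_buffer, &info);
}

The same VkSampleLocationsInfoEXT can alternatively be supplied statically, with sampleLocationsEnable set, through a VkPipelineSampleLocationsStateCreateInfoEXT chained into VkPipelineMultisampleStateCreateInfo::pNext; that is the path consumed by radv_pipeline_init_dynamic_state above. Either way, while the extension remains disabled (sampleLocationSampleCounts reports only VK_SAMPLE_COUNT_1_BIT) none of this is reachable by applications.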