X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fvulkan%2FgenX_cmd_buffer.c;h=ebf54fd9f1802748aef49c1c3d3c3d513bd10156;hb=f9d7d27d6dc46696c0c8479a3180c57774991129;hp=045cb9d05cb824befb4fbb373760d2c6bae6c130;hpb=8b61c57049ff75766715ad4f7b1ad2d3657b9b4d;p=mesa.git diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 045cb9d05cb..ebf54fd9f18 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -25,6 +25,7 @@ #include #include "anv_private.h" +#include "vk_format_info.h" #include "common/gen_l3_config.h" #include "genxml/gen_macros.h" @@ -54,8 +55,6 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) { struct anv_device *device = cmd_buffer->device; -/* XXX: Do we need this on more than just BDW? */ -#if (GEN_GEN >= 8) /* Emit a render target cache flush. * * This isn't documented anywhere in the PRM. However, it seems to be @@ -64,9 +63,10 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) * clear depth, reset state base address, and then go render stuff. */ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DCFlushEnable = true; pc.RenderTargetCacheFlushEnable = true; + pc.CommandStreamerStallEnable = true; } -#endif anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) { sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 }; @@ -147,6 +147,440 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) */ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.TextureCacheInvalidationEnable = true; + pc.ConstantCacheInvalidationEnable = true; + pc.StateCacheInvalidationEnable = true; + } +} + +static void +add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer, + struct anv_state state, + struct anv_bo *bo, uint32_t offset) +{ + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + + anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, + state.offset + isl_dev->ss.addr_offset, bo, offset); +} + +static void +add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image_view *iview, + enum isl_aux_usage aux_usage, + struct anv_state state) +{ + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + + anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, + state.offset + isl_dev->ss.addr_offset, + iview->bo, iview->offset); + + if (aux_usage != ISL_AUX_USAGE_NONE) { + uint32_t aux_offset = iview->offset + iview->image->aux_surface.offset; + + /* On gen7 and prior, the bottom 12 bits of the MCS base address are + * used to store other information. This should be ok, however, because + * surface buffer addresses are always 4K page alinged. 
+ */ + assert((aux_offset & 0xfff) == 0); + uint32_t *aux_addr_dw = state.map + isl_dev->ss.aux_addr_offset; + aux_offset += *aux_addr_dw & 0xfff; + + anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, + state.offset + isl_dev->ss.aux_addr_offset, + iview->bo, aux_offset); + } +} + +static bool +color_is_zero_one(VkClearColorValue value, enum isl_format format) +{ + if (isl_format_has_int_channel(format)) { + for (unsigned i = 0; i < 4; i++) { + if (value.int32[i] != 0 && value.int32[i] != 1) + return false; + } + } else { + for (unsigned i = 0; i < 4; i++) { + if (value.float32[i] != 0.0f && value.float32[i] != 1.0f) + return false; + } + } + + return true; +} + +static void +color_attachment_compute_aux_usage(struct anv_device *device, + struct anv_attachment_state *att_state, + struct anv_image_view *iview, + VkRect2D render_area, + union isl_color_value *fast_clear_color) +{ + if (iview->image->aux_surface.isl.size == 0) { + att_state->aux_usage = ISL_AUX_USAGE_NONE; + att_state->input_aux_usage = ISL_AUX_USAGE_NONE; + att_state->fast_clear = false; + return; + } + + assert(iview->image->aux_surface.isl.usage & ISL_SURF_USAGE_CCS_BIT); + + att_state->clear_color_is_zero_one = + color_is_zero_one(att_state->clear_value.color, iview->isl.format); + + if (att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + /* Start off assuming fast clears are possible */ + att_state->fast_clear = true; + + /* Potentially, we could do partial fast-clears but doing so has crazy + * alignment restrictions. It's easier to just restrict to full size + * fast clears for now. + */ + if (render_area.offset.x != 0 || + render_area.offset.y != 0 || + render_area.extent.width != iview->extent.width || + render_area.extent.height != iview->extent.height) + att_state->fast_clear = false; + + if (GEN_GEN <= 7) { + /* On gen7, we can't do multi-LOD or multi-layer fast-clears. We + * technically can, but it comes with crazy restrictions that we + * don't want to deal with now. + */ + if (iview->isl.base_level > 0 || + iview->isl.base_array_layer > 0 || + iview->isl.array_len > 1) + att_state->fast_clear = false; + } + + /* On Broadwell and earlier, we can only handle 0/1 clear colors */ + if (GEN_GEN <= 8 && !att_state->clear_color_is_zero_one) + att_state->fast_clear = false; + + if (att_state->fast_clear) { + memcpy(fast_clear_color->u32, att_state->clear_value.color.uint32, + sizeof(fast_clear_color->u32)); + } + } else { + att_state->fast_clear = false; + } + + /** + * TODO: Consider using a heuristic to determine if temporarily enabling + * CCS_E for this image view would be beneficial. + * + * While fast-clear resolves and partial resolves are fairly cheap in the + * case where you render to most of the pixels, full resolves are not + * because they potentially involve reading and writing the entire + * framebuffer. If we can't texture with CCS_E, we should leave it off and + * limit ourselves to fast clears. + */ + if (iview->image->aux_usage == ISL_AUX_USAGE_CCS_E) { + att_state->aux_usage = ISL_AUX_USAGE_CCS_E; + att_state->input_aux_usage = ISL_AUX_USAGE_CCS_E; + } else if (att_state->fast_clear) { + att_state->aux_usage = ISL_AUX_USAGE_CCS_D; + if (GEN_GEN >= 9 && + !isl_format_supports_ccs_e(&device->info, iview->isl.format)) { + /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode: + * + * "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D + * setting is only allowed if Surface Format supported for Fast + * Clear. 
In addition, if the surface is bound to the sampling + * engine, Surface Format must be supported for Render Target + * Compression for surfaces bound to the sampling engine." + * + * In other words, we can't sample from a fast-cleared image if it + * doesn't also support color compression. + */ + att_state->input_aux_usage = ISL_AUX_USAGE_NONE; + } else if (GEN_GEN == 8) { + /* Broadwell can sample from fast-cleared images */ + att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D; + } else { + /* Ivy Bridge and Haswell cannot */ + att_state->input_aux_usage = ISL_AUX_USAGE_NONE; + } + } else { + att_state->aux_usage = ISL_AUX_USAGE_NONE; + att_state->input_aux_usage = ISL_AUX_USAGE_NONE; + } +} + +static bool +need_input_attachment_state(const struct anv_render_pass_attachment *att) +{ + if (!(att->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) + return false; + + /* We only allocate input attachment states for color surfaces. Compression + * is not yet enabled for depth textures and stencil doesn't allow + * compression so we can just use the texture surface state from the view. + */ + return vk_format_is_color(att->format); +} + +static enum isl_aux_usage +layout_to_hiz_usage(VkImageLayout layout, uint8_t samples) +{ + switch (layout) { + case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: + return ISL_AUX_USAGE_HIZ; + case VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL: + case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: + if (anv_can_sample_with_hiz(GEN_GEN, samples)) + return ISL_AUX_USAGE_HIZ; + /* Fall-through */ + case VK_IMAGE_LAYOUT_GENERAL: + /* This buffer could be used as a source or destination in a transfer + * operation. Transfer operations current don't perform HiZ-enabled reads + * and writes. + */ + default: + return ISL_AUX_USAGE_NONE; + } +} + +/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless + * the initial layout is undefined, the HiZ buffer and depth buffer will + * represent the same data at the end of this operation. + */ +static void +transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageLayout initial_layout, + VkImageLayout final_layout) +{ + assert(image); + + if (image->aux_usage != ISL_AUX_USAGE_HIZ || final_layout == initial_layout) + return; + + const bool hiz_enabled = layout_to_hiz_usage(initial_layout, image->samples) == + ISL_AUX_USAGE_HIZ; + const bool enable_hiz = layout_to_hiz_usage(final_layout, image->samples) == + ISL_AUX_USAGE_HIZ; + + enum blorp_hiz_op hiz_op; + if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED) { + /* We've already initialized the aux HiZ buffer at BindImageMemory time, + * so there's no need to perform a HIZ resolve or clear to avoid GPU hangs. + * This initial layout indicates that the user doesn't care about the data + * that's currently in the buffer, so resolves are not necessary except + * for the special case noted below. + */ + hiz_op = BLORP_HIZ_OP_NONE; + } else if (hiz_enabled && !enable_hiz) { + hiz_op = BLORP_HIZ_OP_DEPTH_RESOLVE; + } else if (!hiz_enabled && enable_hiz) { + hiz_op = BLORP_HIZ_OP_HIZ_RESOLVE; + } else { + assert(hiz_enabled == enable_hiz); + /* If the same buffer will be used, no resolves are necessary except for + * the special case noted below. + */ + hiz_op = BLORP_HIZ_OP_NONE; + } + + if (hiz_op != BLORP_HIZ_OP_NONE) + anv_gen8_hiz_op_resolve(cmd_buffer, image, hiz_op); + + /* Images that have sampling with HiZ enabled cause all shader sampling to + * load data with the HiZ buffer. 
Therefore, in the case of transitioning to + * the general layout - which currently routes all writes to the depth + * buffer - we must ensure that the HiZ buffer remains consistent with the + * depth buffer by performing an additional HIZ resolve if the operation + * required by this transition was not already a HiZ resolve. + */ + if (final_layout == VK_IMAGE_LAYOUT_GENERAL && + anv_can_sample_with_hiz(GEN_GEN, image->samples) && + hiz_op != BLORP_HIZ_OP_HIZ_RESOLVE) { + anv_gen8_hiz_op_resolve(cmd_buffer, image, BLORP_HIZ_OP_HIZ_RESOLVE); + } +} + + +/** + * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass. + */ +static void +genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, + struct anv_render_pass *pass, + const VkRenderPassBeginInfo *begin) +{ + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + struct anv_cmd_state *state = &cmd_buffer->state; + + vk_free(&cmd_buffer->pool->alloc, state->attachments); + + if (pass->attachment_count == 0) { + state->attachments = NULL; + return; + } + + state->attachments = vk_alloc(&cmd_buffer->pool->alloc, + pass->attachment_count * + sizeof(state->attachments[0]), + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (state->attachments == NULL) { + /* FIXME: Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */ + abort(); + } + + bool need_null_state = false; + unsigned num_states = 0; + for (uint32_t i = 0; i < pass->attachment_count; ++i) { + if (vk_format_is_color(pass->attachments[i].format)) { + num_states++; + } else { + /* We need a null state for any depth-stencil-only subpasses. + * Importantly, this includes depth/stencil clears so we create one + * whenever we have depth or stencil + */ + need_null_state = true; + } + + if (need_input_attachment_state(&pass->attachments[i])) + num_states++; + } + num_states += need_null_state; + + const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align); + state->render_pass_states = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, + num_states * ss_stride, isl_dev->ss.align); + + struct anv_state next_state = state->render_pass_states; + next_state.alloc_size = isl_dev->ss.size; + + if (need_null_state) { + state->null_surface_state = next_state; + next_state.offset += ss_stride; + next_state.map += ss_stride; + } + + for (uint32_t i = 0; i < pass->attachment_count; ++i) { + if (vk_format_is_color(pass->attachments[i].format)) { + state->attachments[i].color_rt_state = next_state; + next_state.offset += ss_stride; + next_state.map += ss_stride; + } + + if (need_input_attachment_state(&pass->attachments[i])) { + state->attachments[i].input_att_state = next_state; + next_state.offset += ss_stride; + next_state.map += ss_stride; + } + } + assert(next_state.offset == state->render_pass_states.offset + + state->render_pass_states.alloc_size); + + if (begin) { + ANV_FROM_HANDLE(anv_framebuffer, framebuffer, begin->framebuffer); + assert(pass->attachment_count == framebuffer->attachment_count); + + if (need_null_state) { + struct GENX(RENDER_SURFACE_STATE) null_ss = { + .SurfaceType = SURFTYPE_NULL, + .SurfaceArray = framebuffer->layers > 0, + .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM, +#if GEN_GEN >= 8 + .TileMode = YMAJOR, +#else + .TiledSurface = true, +#endif + .Width = framebuffer->width - 1, + .Height = framebuffer->height - 1, + .Depth = framebuffer->layers - 1, + .RenderTargetViewExtent = framebuffer->layers - 1, + }; + GENX(RENDER_SURFACE_STATE_pack)(NULL, state->null_surface_state.map, + &null_ss); + } + + for (uint32_t i = 
0; i < pass->attachment_count; ++i) { + struct anv_render_pass_attachment *att = &pass->attachments[i]; + VkImageAspectFlags att_aspects = vk_format_aspects(att->format); + VkImageAspectFlags clear_aspects = 0; + + if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + /* color attachment */ + if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; + } + } else { + /* depthstencil attachment */ + if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && + att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + } + if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && + att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + } + } + + state->attachments[i].current_layout = att->initial_layout; + state->attachments[i].pending_clear_aspects = clear_aspects; + if (clear_aspects) + state->attachments[i].clear_value = begin->pClearValues[i]; + + struct anv_image_view *iview = framebuffer->attachments[i]; + assert(iview->vk_format == att->format); + + union isl_color_value clear_color = { .u32 = { 0, } }; + if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + color_attachment_compute_aux_usage(cmd_buffer->device, + &state->attachments[i], + iview, begin->renderArea, + &clear_color); + + struct isl_view view = iview->isl; + view.usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT; + view.swizzle = anv_swizzle_for_render(view.swizzle); + isl_surf_fill_state(isl_dev, + state->attachments[i].color_rt_state.map, + .surf = &iview->image->color_surface.isl, + .view = &view, + .aux_surf = &iview->image->aux_surface.isl, + .aux_usage = state->attachments[i].aux_usage, + .clear_color = clear_color, + .mocs = cmd_buffer->device->default_mocs); + + add_image_view_relocs(cmd_buffer, iview, + state->attachments[i].aux_usage, + state->attachments[i].color_rt_state); + } else { + if (iview->image->aux_usage == ISL_AUX_USAGE_HIZ) { + state->attachments[i].aux_usage = + layout_to_hiz_usage(att->initial_layout, iview->image->samples); + } else { + state->attachments[i].aux_usage = ISL_AUX_USAGE_NONE; + } + state->attachments[i].input_aux_usage = ISL_AUX_USAGE_NONE; + } + + if (need_input_attachment_state(&pass->attachments[i])) { + struct isl_view view = iview->isl; + view.usage |= ISL_SURF_USAGE_TEXTURE_BIT; + isl_surf_fill_state(isl_dev, + state->attachments[i].input_att_state.map, + .surf = &iview->image->color_surface.isl, + .view = &view, + .aux_surf = &iview->image->aux_surface.isl, + .aux_usage = state->attachments[i].input_aux_usage, + .clear_color = clear_color, + .mocs = cmd_buffer->device->default_mocs); + + add_image_view_relocs(cmd_buffer, iview, + state->attachments[i].input_aux_usage, + state->attachments[i].input_att_state); + } + } + + if (!cmd_buffer->device->info.has_llc) + anv_state_flush(state->render_pass_states); } } @@ -182,12 +616,14 @@ genX(BeginCommandBuffer)( if (cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { - cmd_buffer->state.framebuffer = - anv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer); cmd_buffer->state.pass = anv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); cmd_buffer->state.subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; + cmd_buffer->state.framebuffer = NULL; + + genX(cmd_buffer_setup_attachments)(cmd_buffer, cmd_buffer->state.pass, + NULL); cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; } @@ -201,6 +637,13 @@ genX(EndCommandBuffer)( { 
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + /* We want every command buffer to start with the PMA fix in a known state, + * so we disable it at the end of the command buffer. + */ + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false); + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + anv_cmd_buffer_end_batch_buffer(cmd_buffer); return VK_SUCCESS; @@ -216,11 +659,32 @@ genX(CmdExecuteCommands)( assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); + /* The secondary command buffers will assume that the PMA fix is disabled + * when they begin executing. Make sure this is true. + */ + genX(cmd_buffer_enable_pma_fix)(primary, false); + for (uint32_t i = 0; i < commandBufferCount; i++) { ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]); assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + if (secondary->usage_flags & + VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { + /* If we're continuing a render pass from the primary, we need to + * copy the surface states for the current subpass into the storage + * we allocated for them in BeginCommandBuffer. + */ + struct anv_bo *ss_bo = &primary->device->surface_state_block_pool.bo; + struct anv_state src_state = primary->state.render_pass_states; + struct anv_state dst_state = secondary->state.render_pass_states; + assert(src_state.alloc_size == dst_state.alloc_size); + + genX(cmd_buffer_gpu_memcpy)(primary, ss_bo, dst_state.offset, + ss_bo, src_state.offset, + src_state.alloc_size); + } + anv_cmd_buffer_add_secondary(primary, secondary); } @@ -334,7 +798,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]); /* Minimum number of ways that can be allocated to the URB. */ - const unsigned n0_urb = (devinfo->is_baytrail ? 32 : 0); + MAYBE_UNUSED const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0; assert(cfg->n[GEN_L3P_URB] >= n0_urb); uint32_t l3sqcr1, l3cr2, l3cr3; @@ -503,6 +967,13 @@ void genX(CmdPipelineBarrier)( for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) { src_flags |= pImageMemoryBarriers[i].srcAccessMask; dst_flags |= pImageMemoryBarriers[i].dstAccessMask; + ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[i].image); + if (pImageMemoryBarriers[i].subresourceRange.aspectMask & + VK_IMAGE_ASPECT_DEPTH_BIT) { + transition_depth_buffer(cmd_buffer, image, + pImageMemoryBarriers[i].oldLayout, + pImageMemoryBarriers[i].newLayout); + } } enum anv_pipe_bits pipe_bits = 0; @@ -539,7 +1010,7 @@ void genX(CmdPipelineBarrier)( pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; break; case VK_ACCESS_SHADER_READ_BIT: - case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT: + case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT: case VK_ACCESS_TRANSFER_READ_BIT: pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; break; @@ -617,58 +1088,11 @@ cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; } -static void -add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer, - struct anv_state state, struct anv_bo *bo, - uint32_t offset) -{ - /* The address goes in SURFACE_STATE dword 1 for gens < 8 and dwords 8 and - * 9 for gen8+. We only write the first dword for gen8+ here and rely on - * the initial state to set the high bits to 0. */ - - const uint32_t dword = GEN_GEN < 8 ? 
1 : 8; - - anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, - state.offset + dword * 4, bo, offset); -} - -static struct anv_state -alloc_null_surface_state(struct anv_cmd_buffer *cmd_buffer, - struct anv_framebuffer *fb) -{ - struct anv_state state = - anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); - - struct GENX(RENDER_SURFACE_STATE) null_ss = { - .SurfaceType = SURFTYPE_NULL, - .SurfaceArray = fb->layers > 0, - .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM, -#if GEN_GEN >= 8 - .TileMode = YMAJOR, -#else - .TiledSurface = true, -#endif - .Width = fb->width - 1, - .Height = fb->height - 1, - .Depth = fb->layers - 1, - .RenderTargetViewExtent = fb->layers - 1, - }; - - GENX(RENDER_SURFACE_STATE_pack)(NULL, state.map, &null_ss); - - if (!cmd_buffer->device->info.has_llc) - anv_state_clflush(state); - - return state; -} - - static VkResult emit_binding_table(struct anv_cmd_buffer *cmd_buffer, gl_shader_stage stage, struct anv_state *bt_state) { - struct anv_framebuffer *fb = cmd_buffer->state.framebuffer; struct anv_subpass *subpass = cmd_buffer->state.subpass; struct anv_pipeline *pipeline; uint32_t bias, state_offset; @@ -738,25 +1162,16 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s]; struct anv_state surface_state; - struct anv_bo *bo; - uint32_t bo_offset; if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) { /* Color attachment binding */ assert(stage == MESA_SHADER_FRAGMENT); assert(binding->binding == 0); if (binding->index < subpass->color_count) { - const struct anv_image_view *iview = - fb->attachments[subpass->color_attachments[binding->index]]; - - assert(iview->color_rt_surface_state.alloc_size); - surface_state = iview->color_rt_surface_state; - add_surface_state_reloc(cmd_buffer, iview->color_rt_surface_state, - iview->bo, iview->offset); + const unsigned att = subpass->color_attachments[binding->index]; + surface_state = cmd_buffer->state.attachments[att].color_rt_state; } else { - /* Null render target */ - struct anv_framebuffer *fb = cmd_buffer->state.framebuffer; - surface_state = alloc_null_surface_state(cmd_buffer, fb); + surface_state = cmd_buffer->state.null_surface_state; } bt_map[bias + s] = surface_state.offset + state_offset; @@ -775,18 +1190,44 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: surface_state = desc->image_view->sampler_surface_state; assert(surface_state.alloc_size); - bo = desc->image_view->bo; - bo_offset = desc->image_view->offset; + add_image_view_relocs(cmd_buffer, desc->image_view, + desc->image_view->image->aux_usage, + surface_state); + break; + + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + assert(stage == MESA_SHADER_FRAGMENT); + if (desc->image_view->aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT) { + /* For depth and stencil input attachments, we treat it like any + * old texture that a user may have bound. + */ + surface_state = desc->image_view->sampler_surface_state; + assert(surface_state.alloc_size); + add_image_view_relocs(cmd_buffer, desc->image_view, + desc->image_view->image->aux_usage, + surface_state); + } else { + /* For color input attachments, we create the surface state at + * vkBeginRenderPass time so that we can include aux and clear + * color information. 
+ */ + assert(binding->input_attachment_index < subpass->input_count); + const unsigned subpass_att = binding->input_attachment_index; + const unsigned att = subpass->input_attachments[subpass_att]; + surface_state = cmd_buffer->state.attachments[att].input_att_state; + } break; case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { - surface_state = desc->image_view->storage_surface_state; + surface_state = (binding->write_only) + ? desc->image_view->writeonly_storage_surface_state + : desc->image_view->storage_surface_state; assert(surface_state.alloc_size); - bo = desc->image_view->bo; - bo_offset = desc->image_view->offset; + add_image_view_relocs(cmd_buffer, desc->image_view, + desc->image_view->image->aux_usage, + surface_state); struct brw_image_param *image_param = &cmd_buffer->state.push_constants[stage]->images[image++]; @@ -803,15 +1244,19 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: surface_state = desc->buffer_view->surface_state; assert(surface_state.alloc_size); - bo = desc->buffer_view->bo; - bo_offset = desc->buffer_view->offset; + add_surface_state_reloc(cmd_buffer, surface_state, + desc->buffer_view->bo, + desc->buffer_view->offset); break; case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - surface_state = desc->buffer_view->storage_surface_state; + surface_state = (binding->write_only) + ? desc->buffer_view->writeonly_storage_surface_state + : desc->buffer_view->storage_surface_state; assert(surface_state.alloc_size); - bo = desc->buffer_view->bo; - bo_offset = desc->buffer_view->offset; + add_surface_state_reloc(cmd_buffer, surface_state, + desc->buffer_view->bo, + desc->buffer_view->offset); struct brw_image_param *image_param = &cmd_buffer->state.push_constants[stage]->images[image++]; @@ -826,13 +1271,12 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, } bt_map[bias + s] = surface_state.offset + state_offset; - add_surface_state_reloc(cmd_buffer, surface_state, bo, bo_offset); } assert(image == map->image_count); out: if (!cmd_buffer->device->info.has_llc) - anv_state_clflush(*bt_state); + anv_state_flush(*bt_state); return VK_SUCCESS; } @@ -890,7 +1334,7 @@ emit_samplers(struct anv_cmd_buffer *cmd_buffer, } if (!cmd_buffer->device->info.has_llc) - anv_state_clflush(*state); + anv_state_flush(*state); return VK_SUCCESS; } @@ -1167,29 +1611,37 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) } static void -emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, - struct anv_bo *bo, uint32_t offset) +emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, + struct anv_bo *bo, uint32_t offset, + uint32_t size, uint32_t index) { uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(3DSTATE_VERTEX_BUFFERS)); GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1, &(struct GENX(VERTEX_BUFFER_STATE)) { - .VertexBufferIndex = 32, /* Reserved for this */ + .VertexBufferIndex = index, .AddressModifyEnable = true, .BufferPitch = 0, #if (GEN_GEN >= 8) .MemoryObjectControlState = GENX(MOCS), .BufferStartingAddress = { bo, offset }, - .BufferSize = 8 + .BufferSize = size #else .VertexBufferMemoryObjectControlState = GENX(MOCS), .BufferStartingAddress = { bo, offset }, - .EndAddress = { bo, offset + 8 }, + .EndAddress = { bo, offset + size }, #endif }); } +static void +emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, + struct anv_bo *bo, uint32_t offset) +{ + emit_vertex_bo(cmd_buffer, bo, offset, 8, ANV_SVGS_VB_INDEX); +} + static void emit_base_vertex_instance(struct anv_cmd_buffer 
*cmd_buffer, uint32_t base_vertex, uint32_t base_instance) @@ -1201,12 +1653,28 @@ emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, ((uint32_t *)id_state.map)[1] = base_instance; if (!cmd_buffer->device->info.has_llc) - anv_state_clflush(id_state); + anv_state_flush(id_state); emit_base_vertex_instance_bo(cmd_buffer, &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset); } +static void +emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index) +{ + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4); + + ((uint32_t *)state.map)[0] = draw_index; + + if (!cmd_buffer->device->info.has_llc) + anv_state_flush(state); + + emit_vertex_bo(cmd_buffer, + &cmd_buffer->device->dynamic_state_block_pool.bo, + state.offset, 4, ANV_DRAWID_VB_INDEX); +} + void genX(CmdDraw)( VkCommandBuffer commandBuffer, uint32_t vertexCount, @@ -1222,6 +1690,8 @@ void genX(CmdDraw)( if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, 0); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.VertexAccessType = SEQUENTIAL; @@ -1250,6 +1720,8 @@ void genX(CmdDrawIndexed)( if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, 0); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.VertexAccessType = RANDOM; @@ -1288,6 +1760,8 @@ void genX(CmdDrawIndirect)( if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, 0); emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset); emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); @@ -1321,6 +1795,8 @@ void genX(CmdDrawIndexedIndirect)( /* TODO: We need to stomp base vertex to 0 somehow */ if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, 0); emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset); emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); @@ -1338,53 +1814,40 @@ void genX(CmdDrawIndexedIndirect)( static VkResult flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) { - struct anv_device *device = cmd_buffer->device; struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; struct anv_state surfaces = { 0, }, samplers = { 0, }; VkResult result; - result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers); - if (result != VK_SUCCESS) - return result; result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); - if (result != VK_SUCCESS) - return result; - - struct anv_state push_state = anv_cmd_buffer_cs_push_constants(cmd_buffer); + if (result != VK_SUCCESS) { + assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY); + result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); + assert(result == VK_SUCCESS); - const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); - const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; + /* Re-emit state base addresses so we get the new surface state base + * address before we start emitting binding tables etc. 
+ */ + genX(cmd_buffer_emit_state_base_address)(cmd_buffer); - if (push_state.alloc_size) { - anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { - curbe.CURBETotalDataLength = push_state.alloc_size; - curbe.CURBEDataStartAddress = push_state.offset; - } + result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); + assert(result == VK_SUCCESS); } - const uint32_t slm_size = encode_slm_size(GEN_GEN, prog_data->total_shared); + result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers); + assert(result == VK_SUCCESS); + + uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; + struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { + .BindingTablePointer = surfaces.offset, + .SamplerStatePointer = samplers.offset, + }; + GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc); struct anv_state state = - anv_state_pool_emit(&device->dynamic_state_pool, - GENX(INTERFACE_DESCRIPTOR_DATA), 64, - .KernelStartPointer = pipeline->cs_simd, - .BindingTablePointer = surfaces.offset, - .BindingTableEntryCount = 0, - .SamplerStatePointer = samplers.offset, - .SamplerCount = 0, -#if !GEN_IS_HASWELL - .ConstantURBEntryReadOffset = 0, -#endif - .ConstantURBEntryReadLength = - cs_prog_data->push.per_thread.regs, -#if GEN_GEN >= 8 || GEN_IS_HASWELL - .CrossThreadConstantDataReadLength = - cs_prog_data->push.cross_thread.regs, -#endif - .BarrierEnable = cs_prog_data->uses_barrier, - .SharedLocalMemorySize = slm_size, - .NumberofThreadsinGPGPUThreadGroup = - cs_prog_data->threads); + anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw, + pipeline->interface_descriptor_data, + GENX(INTERFACE_DESCRIPTOR_DATA_length), + 64); uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); anv_batch_emit(&cmd_buffer->batch, @@ -1408,8 +1871,20 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) genX(flush_pipeline_select_gpgpu)(cmd_buffer); - if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE) + if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE) { + /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE: + * + * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless + * the only bits that are changed are scoreboard related: Scoreboard + * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For + * these scoreboard related states, a MEDIA_STATE_FLUSH is + * sufficient." 
+ */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); + } if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) || (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)) { @@ -1419,6 +1894,18 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; } + if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) { + struct anv_state push_state = + anv_cmd_buffer_cs_push_constants(cmd_buffer); + + if (push_state.alloc_size) { + anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { + curbe.CURBETotalDataLength = push_state.alloc_size; + curbe.CURBEDataStartAddress = push_state.offset; + } + } + } + cmd_buffer->state.compute_dirty = 0; genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); @@ -1426,18 +1913,17 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) #if GEN_GEN == 7 -static bool +static VkResult verify_cmd_parser(const struct anv_device *device, int required_version, const char *function) { if (device->instance->physicalDevice.cmd_parser_version < required_version) { - vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT, - "cmd parser version %d is required for %s", - required_version, function); - return false; + return vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT, + "cmd parser version %d is required for %s", + required_version, function); } else { - return true; + return VK_SUCCESS; } } @@ -1461,7 +1947,7 @@ void genX(CmdDispatch)( sizes[1] = y; sizes[2] = z; if (!cmd_buffer->device->info.has_llc) - anv_state_clflush(state); + anv_state_flush(state); cmd_buffer->state.num_workgroups_offset = state.offset; cmd_buffer->state.num_workgroups_bo = &cmd_buffer->device->dynamic_state_block_pool.bo; @@ -1508,7 +1994,8 @@ void genX(CmdDispatchIndirect)( /* Linux 4.4 added command parser version 5 which allows the GPGPU * indirect dispatch registers to be written. */ - if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect")) + if (verify_cmd_parser(cmd_buffer->device, 5, + "vkCmdDispatchIndirect") != VK_SUCCESS) return; #endif @@ -1661,6 +2148,80 @@ genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer) } } +void +genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) +{ + if (GEN_GEN >= 8) + return; + + /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER: + * + * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any + * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, + * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first + * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit + * set), followed by a pipelined depth cache flush (PIPE_CONTROL with + * Depth Flush Bit set, followed by another pipelined depth stall + * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise + * guarantee that the pipeline from WM onwards is already flushed (e.g., + * via a preceding MI_FLUSH)." 
+ */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { + pipe.DepthStallEnable = true; + } + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { + pipe.DepthCacheFlushEnable = true; + } + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { + pipe.DepthStallEnable = true; + } +} + +static uint32_t +depth_stencil_surface_type(enum isl_surf_dim dim) +{ + switch (dim) { + case ISL_SURF_DIM_1D: + if (GEN_GEN >= 9) { + /* From the Sky Lake PRM, 3DSTATAE_DEPTH_BUFFER::SurfaceType + * + * Programming Notes: + * The Surface Type of the depth buffer must be the same as the + * Surface Type of the render target(s) (defined in + * SURFACE_STATE), unless either the depth buffer or render + * targets are SURFTYPE_NULL (see exception below for SKL). 1D + * surface type not allowed for depth surface and stencil surface. + * + * Workaround: + * If depth/stencil is enabled with 1D render target, + * depth/stencil surface type needs to be set to 2D surface type + * and height set to 1. Depth will use (legacy) TileY and stencil + * will use TileW. For this case only, the Surface Type of the + * depth buffer can be 2D while the Surface Type of the render + * target(s) are 1D, representing an exception to a programming + * note above. + */ + return SURFTYPE_2D; + } else { + return SURFTYPE_1D; + } + case ISL_SURF_DIM_2D: + return SURFTYPE_2D; + case ISL_SURF_DIM_3D: + if (GEN_GEN >= 9) { + /* The Sky Lake docs list the value for 3D as "Reserved". However, + * they have the exact same layout as 2D arrays on gen9+, so we can + * just use 2D here. + */ + return SURFTYPE_2D; + } else { + return SURFTYPE_3D; + } + default: + unreachable("Invalid surface dimension"); + } +} + static void cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) { @@ -1670,25 +2231,26 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) anv_cmd_buffer_get_depth_stencil_view(cmd_buffer); const struct anv_image *image = iview ? 
iview->image : NULL; const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT); - const bool has_hiz = image != NULL && anv_image_has_hiz(image); + const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment; + const bool has_hiz = image != NULL && + cmd_buffer->state.attachments[ds].aux_usage == ISL_AUX_USAGE_HIZ; const bool has_stencil = image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT); - /* FIXME: Implement the PMA stall W/A */ + cmd_buffer->state.hiz_enabled = has_hiz; + /* FIXME: Width and Height are wrong */ + genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer); + /* Emit 3DSTATE_DEPTH_BUFFER */ if (has_depth) { anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) { - db.SurfaceType = SURFTYPE_2D; + db.SurfaceType = + depth_stencil_surface_type(image->depth_surface.isl.dim); db.DepthWriteEnable = true; db.StencilWriteEnable = has_stencil; - - if (cmd_buffer->state.pass->subpass_count == 1) { - db.HierarchicalDepthBufferEnable = has_hiz; - } else { - anv_finishme("Multiple-subpass HiZ not implemented"); - } + db.HierarchicalDepthBufferEnable = has_hiz; db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev, &image->depth_surface.isl); @@ -1703,14 +2265,17 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) db.Height = image->extent.height - 1; db.Width = image->extent.width - 1; db.LOD = iview->isl.base_level; - db.Depth = image->array_size - 1; /* FIXME: 3-D */ db.MinimumArrayElement = iview->isl.base_array_layer; + assert(image->depth_surface.isl.dim != ISL_SURF_DIM_3D); + db.Depth = + db.RenderTargetViewExtent = + iview->isl.array_len - iview->isl.base_array_layer - 1; + #if GEN_GEN >= 8 db.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2; #endif - db.RenderTargetViewExtent = 1 - 1; } } else { /* Even when no depth buffer is present, the hardware requires that @@ -1732,10 +2297,15 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) * be combined with a stencil buffer so we use D32_FLOAT instead. */ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) { - db.SurfaceType = SURFTYPE_2D; + if (has_stencil) { + db.SurfaceType = + depth_stencil_surface_type(image->stencil_surface.isl.dim); + } else { + db.SurfaceType = SURFTYPE_2D; + } db.SurfaceFormat = D32_FLOAT; - db.Width = fb->width - 1; - db.Height = fb->height - 1; + db.Width = MAX2(fb->width, 1) - 1; + db.Height = MAX2(fb->height, 1) - 1; db.StencilWriteEnable = has_stencil; } } @@ -1743,10 +2313,10 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) if (has_hiz) { anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb) { hdb.HierarchicalDepthBufferObjectControlState = GENX(MOCS); - hdb.SurfacePitch = image->hiz_surface.isl.row_pitch - 1; + hdb.SurfacePitch = image->aux_surface.isl.row_pitch - 1; hdb.SurfaceBaseAddress = (struct anv_address) { .bo = image->bo, - .offset = image->offset + image->hiz_surface.offset, + .offset = image->offset + image->aux_surface.offset, }; #if GEN_GEN >= 8 /* From the SKL PRM Vol2a: @@ -1756,11 +2326,14 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) * - SURFTYPE_1D: distance in pixels between array slices * - SURFTYPE_2D/CUBE: distance in rows between array slices * - SURFTYPE_3D: distance in rows between R - slices + * + * Unfortunately, the docs aren't 100% accurate here. They fail to + * mention that the 1-D rule only applies to linear 1-D images. 
+ * Since depth and HiZ buffers are always tiled, they are treated as + * 2-D images. Prior to Sky Lake, this field is always in rows. */ hdb.SurfaceQPitch = - image->hiz_surface.isl.dim == ISL_SURF_DIM_1D ? - isl_surf_get_array_pitch_el(&image->hiz_surface.isl) >> 2 : - isl_surf_get_array_pitch_el_rows(&image->hiz_surface.isl) >> 2; + isl_surf_get_array_pitch_sa_rows(&image->aux_surface.isl) >> 2; #endif } } else { @@ -1804,10 +2377,7 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp) { if (has_hiz) { cp.DepthClearValueValid = true; - const uint32_t ds = - cmd_buffer->state.subpass->depth_stencil_attachment; - cp.DepthClearValue = - cmd_buffer->state.attachments[ds].clear_value.depthStencil.depth; + cp.DepthClearValue = ANV_HZ_FC_VAL; } } } @@ -1820,9 +2390,22 @@ genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; + const struct anv_image_view *iview = + anv_cmd_buffer_get_depth_stencil_view(cmd_buffer); + + if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) { + const uint32_t ds = subpass->depth_stencil_attachment; + transition_depth_buffer(cmd_buffer, iview->image, + cmd_buffer->state.attachments[ds].current_layout, + cmd_buffer->state.subpass->depth_stencil_layout); + cmd_buffer->state.attachments[ds].current_layout = + cmd_buffer->state.subpass->depth_stencil_layout; + cmd_buffer->state.attachments[ds].aux_usage = + layout_to_hiz_usage(cmd_buffer->state.subpass->depth_stencil_layout, + iview->image->samples); + } + cmd_buffer_emit_depth_stencil(cmd_buffer); - genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_HIZ_RESOLVE); - genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_DEPTH_CLEAR); anv_cmd_buffer_clear_subpass(cmd_buffer); } @@ -1839,7 +2422,7 @@ void genX(CmdBeginRenderPass)( cmd_buffer->state.framebuffer = framebuffer; cmd_buffer->state.pass = pass; cmd_buffer->state.render_area = pRenderPassBegin->renderArea; - anv_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin); + genX(cmd_buffer_setup_attachments)(cmd_buffer, pass, pRenderPassBegin); genX(flush_pipeline_select_3d)(cmd_buffer); @@ -1854,289 +2437,48 @@ void genX(CmdNextSubpass)( assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); - anv_cmd_buffer_resolve_subpass(cmd_buffer); - genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1); -} - -void genX(CmdEndRenderPass)( - VkCommandBuffer commandBuffer) -{ - ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - - genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_DEPTH_RESOLVE); - anv_cmd_buffer_resolve_subpass(cmd_buffer); - -#ifndef NDEBUG - anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer); -#endif -} - -static void -emit_ps_depth_count(struct anv_batch *batch, - struct anv_bo *bo, uint32_t offset) -{ - anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { - pc.DestinationAddressType = DAT_PPGTT; - pc.PostSyncOperation = WritePSDepthCount; - pc.DepthStallEnable = true; - pc.Address = (struct anv_address) { bo, offset }; - } -} - -static void -emit_query_availability(struct anv_batch *batch, - struct anv_bo *bo, uint32_t offset) -{ - anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { - pc.DestinationAddressType = DAT_PPGTT; - pc.PostSyncOperation = WriteImmediateData; - pc.Address = (struct anv_address) { bo, offset }; - pc.ImmediateData = 1; - } -} + const struct anv_image_view *iview = + anv_cmd_buffer_get_depth_stencil_view(cmd_buffer); -void 
genX(CmdBeginQuery)( - VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t query, - VkQueryControlFlags flags) -{ - ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) { + const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment; - /* Workaround: When meta uses the pipeline with the VS disabled, it seems - * that the pipelining of the depth write breaks. What we see is that - * samples from the render pass clear leaks into the first query - * immediately after the clear. Doing a pipecontrol with a post-sync - * operation and DepthStallEnable seems to work around the issue. - */ - if (cmd_buffer->state.need_query_wa) { - cmd_buffer->state.need_query_wa = false; - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.DepthCacheFlushEnable = true; - pc.DepthStallEnable = true; + if (cmd_buffer->state.subpass - cmd_buffer->state.pass->subpasses == + cmd_buffer->state.pass->attachments[ds].last_subpass_idx) { + transition_depth_buffer(cmd_buffer, iview->image, + cmd_buffer->state.attachments[ds].current_layout, + cmd_buffer->state.pass->attachments[ds].final_layout); } } - switch (pool->type) { - case VK_QUERY_TYPE_OCCLUSION: - emit_ps_depth_count(&cmd_buffer->batch, &pool->bo, - query * sizeof(struct anv_query_pool_slot)); - break; - - case VK_QUERY_TYPE_PIPELINE_STATISTICS: - default: - unreachable(""); - } -} - -void genX(CmdEndQuery)( - VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t query) -{ - ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); - - switch (pool->type) { - case VK_QUERY_TYPE_OCCLUSION: - emit_ps_depth_count(&cmd_buffer->batch, &pool->bo, - query * sizeof(struct anv_query_pool_slot) + 8); - - emit_query_availability(&cmd_buffer->batch, &pool->bo, - query * sizeof(struct anv_query_pool_slot) + 16); - break; - - case VK_QUERY_TYPE_PIPELINE_STATISTICS: - default: - unreachable(""); - } + anv_cmd_buffer_resolve_subpass(cmd_buffer); + genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1); } -#define TIMESTAMP 0x2358 - -void genX(CmdWriteTimestamp)( - VkCommandBuffer commandBuffer, - VkPipelineStageFlagBits pipelineStage, - VkQueryPool queryPool, - uint32_t query) +void genX(CmdEndRenderPass)( + VkCommandBuffer commandBuffer) { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); - uint32_t offset = query * sizeof(struct anv_query_pool_slot); - - assert(pool->type == VK_QUERY_TYPE_TIMESTAMP); - - switch (pipelineStage) { - case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT: - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { - srm.RegisterAddress = TIMESTAMP; - srm.MemoryAddress = (struct anv_address) { &pool->bo, offset }; - } - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { - srm.RegisterAddress = TIMESTAMP + 4; - srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 4 }; - } - break; - default: - /* Everything else is bottom-of-pipe */ - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.DestinationAddressType = DAT_PPGTT; - pc.PostSyncOperation = WriteTimestamp; - pc.Address = (struct anv_address) { &pool->bo, offset }; - } - break; - } - - emit_query_availability(&cmd_buffer->batch, &pool->bo, query + 16); -} - -#if GEN_GEN > 7 || GEN_IS_HASWELL - -#define alu_opcode(v) __gen_uint((v), 20, 31) -#define 
alu_operand1(v) __gen_uint((v), 10, 19) -#define alu_operand2(v) __gen_uint((v), 0, 9) -#define alu(opcode, operand1, operand2) \ - alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2) - -#define OPCODE_NOOP 0x000 -#define OPCODE_LOAD 0x080 -#define OPCODE_LOADINV 0x480 -#define OPCODE_LOAD0 0x081 -#define OPCODE_LOAD1 0x481 -#define OPCODE_ADD 0x100 -#define OPCODE_SUB 0x101 -#define OPCODE_AND 0x102 -#define OPCODE_OR 0x103 -#define OPCODE_XOR 0x104 -#define OPCODE_STORE 0x180 -#define OPCODE_STOREINV 0x580 - -#define OPERAND_R0 0x00 -#define OPERAND_R1 0x01 -#define OPERAND_R2 0x02 -#define OPERAND_R3 0x03 -#define OPERAND_R4 0x04 -#define OPERAND_SRCA 0x20 -#define OPERAND_SRCB 0x21 -#define OPERAND_ACCU 0x31 -#define OPERAND_ZF 0x32 -#define OPERAND_CF 0x33 - -#define CS_GPR(n) (0x2600 + (n) * 8) - -static void -emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg, - struct anv_bo *bo, uint32_t offset) -{ - anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = reg, - lrm.MemoryAddress = (struct anv_address) { bo, offset }; - } - anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = reg + 4; - lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 }; - } -} + const struct anv_image_view *iview = + anv_cmd_buffer_get_depth_stencil_view(cmd_buffer); -static void -store_query_result(struct anv_batch *batch, uint32_t reg, - struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags) -{ - anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) { - srm.RegisterAddress = reg; - srm.MemoryAddress = (struct anv_address) { bo, offset }; - } + if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) { + const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment; - if (flags & VK_QUERY_RESULT_64_BIT) { - anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) { - srm.RegisterAddress = reg + 4; - srm.MemoryAddress = (struct anv_address) { bo, offset + 4 }; + if (cmd_buffer->state.subpass - cmd_buffer->state.pass->subpasses == + cmd_buffer->state.pass->attachments[ds].last_subpass_idx) { + transition_depth_buffer(cmd_buffer, iview->image, + cmd_buffer->state.attachments[ds].current_layout, + cmd_buffer->state.pass->attachments[ds].final_layout); } } -} -void genX(CmdCopyQueryPoolResults)( - VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t firstQuery, - uint32_t queryCount, - VkBuffer destBuffer, - VkDeviceSize destOffset, - VkDeviceSize destStride, - VkQueryResultFlags flags) -{ - ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); - ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer); - uint32_t slot_offset, dst_offset; - - if (flags & VK_QUERY_RESULT_WAIT_BIT) { - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.CommandStreamerStallEnable = true; - pc.StallAtPixelScoreboard = true; - } - } - - dst_offset = buffer->offset + destOffset; - for (uint32_t i = 0; i < queryCount; i++) { - - slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot); - switch (pool->type) { - case VK_QUERY_TYPE_OCCLUSION: - emit_load_alu_reg_u64(&cmd_buffer->batch, - CS_GPR(0), &pool->bo, slot_offset); - emit_load_alu_reg_u64(&cmd_buffer->batch, - CS_GPR(1), &pool->bo, slot_offset + 8); - - /* FIXME: We need to clamp the result for 32 bit. 
*/ - - uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH)); - dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1); - dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0); - dw[3] = alu(OPCODE_SUB, 0, 0); - dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU); - break; - - case VK_QUERY_TYPE_TIMESTAMP: - emit_load_alu_reg_u64(&cmd_buffer->batch, - CS_GPR(2), &pool->bo, slot_offset); - break; - - default: - unreachable("unhandled query type"); - } - - store_query_result(&cmd_buffer->batch, - CS_GPR(2), buffer->bo, dst_offset, flags); - - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { - emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0), - &pool->bo, slot_offset + 16); - if (flags & VK_QUERY_RESULT_64_BIT) - store_query_result(&cmd_buffer->batch, - CS_GPR(0), buffer->bo, dst_offset + 8, flags); - else - store_query_result(&cmd_buffer->batch, - CS_GPR(0), buffer->bo, dst_offset + 4, flags); - } + anv_cmd_buffer_resolve_subpass(cmd_buffer); - dst_offset += destStride; - } -} + cmd_buffer->state.hiz_enabled = false; -#else -void genX(CmdCopyQueryPoolResults)( - VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t firstQuery, - uint32_t queryCount, - VkBuffer destBuffer, - VkDeviceSize destOffset, - VkDeviceSize destStride, - VkQueryResultFlags flags) -{ - anv_finishme("Queries not yet supported on Ivy Bridge"); -} +#ifndef NDEBUG + anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer); #endif +}
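
For reference, a minimal host-side sketch (not driver code) of the MI_MATH ALU encoding used by the occlusion-query path in genX(CmdCopyQueryPoolResults) above: it re-derives the four dwords that compute "end-of-query depth count minus begin-of-query count" into GPR R2, which store_query_result() then writes to the destination buffer. The opcode/operand values and bit positions are copied from the alu_opcode()/alu_operand1()/alu_operand2() macros in the diff; the one assumption is that the driver's __gen_uint(v, start, end) helper reduces to shifting v to bit position start, which the plain shifts below stand in for.

/* Standalone sketch: recompute the MI_MATH program dw[1]..dw[4] emitted for
 * an occlusion query result (R2 = R1 - R0).  Encodings are copied from the
 * macros in the diff above; __gen_uint() is assumed to be a left shift.
 */
#include <stdint.h>
#include <stdio.h>

#define ALU_OPCODE(v)   ((uint32_t)(v) << 20)   /* bits 20..31 */
#define ALU_OPERAND1(v) ((uint32_t)(v) << 10)   /* bits 10..19 */
#define ALU_OPERAND2(v) ((uint32_t)(v) <<  0)   /* bits 0..9   */
#define ALU(op, o1, o2) (ALU_OPCODE(op) | ALU_OPERAND1(o1) | ALU_OPERAND2(o2))

#define OPCODE_LOAD  0x080
#define OPCODE_SUB   0x101
#define OPCODE_STORE 0x180

#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31

int main(void)
{
   /* Same four-instruction program as dw[1]..dw[4] in the diff:
    * SRCA <- R1 (end count), SRCB <- R0 (begin count),
    * ACCU <- SRCA - SRCB, R2 <- ACCU.
    */
   const uint32_t prog[4] = {
      ALU(OPCODE_LOAD,  OPERAND_SRCA, OPERAND_R1),
      ALU(OPCODE_LOAD,  OPERAND_SRCB, OPERAND_R0),
      ALU(OPCODE_SUB,   0,            0),
      ALU(OPCODE_STORE, OPERAND_R2,   OPERAND_ACCU),
   };

   for (int i = 0; i < 4; i++)
      printf("MI_MATH dw[%d] = 0x%08x\n", i + 1, prog[i]);

   return 0;
}

Under the stated assumption about __gen_uint(), this prints 0x08008001, 0x08008400, 0x10100000 and 0x18000831, which is what the emitted batch should contain for dw[1] through dw[4].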