X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fvulkan%2FgenX_cmd_buffer.c;h=36c56691521d58109833ee69049e8ca6868612e8;hb=9c9f63d1c749bfea25d6fcd78ff17ea2c49ec809;hp=ddb22c4539001191a429154483a1d0b544bb5fe8;hpb=d9d793696bf54e970491302605a1efd0aa182d1b;p=mesa.git diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index ddb22c45390..36c56691521 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -26,6 +26,7 @@ #include "anv_private.h" #include "vk_format_info.h" +#include "vk_util.h" #include "common/gen_l3_config.h" #include "genxml/gen_macros.h" @@ -50,6 +51,17 @@ emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm) } } +#if GEN_IS_HASWELL || GEN_GEN >= 8 +static void +emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src) +{ + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) { + lrr.SourceRegisterAddress = src; + lrr.DestinationRegisterAddress = dst; + } +} +#endif + void genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) { @@ -79,7 +91,7 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) sba.SurfaceStateBaseAddressModifyEnable = true; sba.DynamicStateBaseAddress = - (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 }; + (struct anv_address) { &device->dynamic_state_pool.block_pool.bo, 0 }; sba.DynamicStateMemoryObjectControlState = GENX(MOCS); sba.DynamicStateBaseAddressModifyEnable = true; @@ -88,7 +100,7 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) sba.IndirectObjectBaseAddressModifyEnable = true; sba.InstructionBaseAddress = - (struct anv_address) { &device->instruction_block_pool.bo, 0 }; + (struct anv_address) { &device->instruction_state_pool.block_pool.bo, 0 }; sba.InstructionMemoryObjectControlState = GENX(MOCS); sba.InstructionBaseAddressModifyEnable = true; @@ -167,17 +179,20 @@ add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer, } static void -add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer, - const struct anv_image_view *iview, - enum isl_aux_usage aux_usage, - struct anv_state state) +add_image_relocs(struct anv_cmd_buffer * const cmd_buffer, + const struct anv_image * const image, + const VkImageAspectFlags aspect_mask, + const enum isl_aux_usage aux_usage, + const struct anv_state state) { const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + const uint32_t surf_offset = image->offset + + anv_image_get_surface_for_aspect_mask(image, aspect_mask)->offset; - add_surface_state_reloc(cmd_buffer, state, iview->bo, iview->offset); + add_surface_state_reloc(cmd_buffer, state, image->bo, surf_offset); if (aux_usage != ISL_AUX_USAGE_NONE) { - uint32_t aux_offset = iview->offset + iview->image->aux_surface.offset; + uint32_t aux_offset = image->offset + image->aux_surface.offset; /* On gen7 and prior, the bottom 12 bits of the MCS base address are * used to store other information. 
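       * (The aux surface is page-aligned, so the low 12 bits of the address
       * itself are zero; gen7 reuses them for fields such as the MCS
       * surface pitch.)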
This should be ok, however, because @@ -191,7 +206,7 @@ add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer, anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, state.offset + isl_dev->ss.aux_addr_offset, - iview->bo, aux_offset); + image->bo, aux_offset); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } @@ -216,13 +231,19 @@ color_is_zero_one(VkClearColorValue value, enum isl_format format) } static void -color_attachment_compute_aux_usage(struct anv_device *device, - struct anv_attachment_state *att_state, - struct anv_image_view *iview, - VkRect2D render_area, +color_attachment_compute_aux_usage(struct anv_device * device, + struct anv_cmd_state * cmd_state, + uint32_t att, VkRect2D render_area, union isl_color_value *fast_clear_color) { - if (iview->image->aux_surface.isl.size == 0) { + struct anv_attachment_state *att_state = &cmd_state->attachments[att]; + struct anv_image_view *iview = cmd_state->framebuffer->attachments[att]; + + if (iview->isl.base_array_layer >= + anv_image_aux_layers(iview->image, iview->isl.base_level)) { + /* There is no aux buffer which corresponds to the level and layer(s) + * being accessed. + */ att_state->aux_usage = ISL_AUX_USAGE_NONE; att_state->input_aux_usage = ISL_AUX_USAGE_NONE; att_state->fast_clear = false; @@ -232,12 +253,47 @@ color_attachment_compute_aux_usage(struct anv_device *device, att_state->input_aux_usage = ISL_AUX_USAGE_MCS; att_state->fast_clear = false; return; + } else if (iview->image->aux_usage == ISL_AUX_USAGE_CCS_E) { + att_state->aux_usage = ISL_AUX_USAGE_CCS_E; + att_state->input_aux_usage = ISL_AUX_USAGE_CCS_E; + } else { + att_state->aux_usage = ISL_AUX_USAGE_CCS_D; + /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode: + * + * "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D + * setting is only allowed if Surface Format supported for Fast + * Clear. In addition, if the surface is bound to the sampling + * engine, Surface Format must be supported for Render Target + * Compression for surfaces bound to the sampling engine." + * + * In other words, we can only sample from a fast-cleared image if it + * also supports color compression. + */ + if (isl_format_supports_ccs_e(&device->info, iview->isl.format)) { + /* TODO: Consider using a heuristic to determine if temporarily enabling + * CCS_E for this image view would be beneficial. + * + * While fast-clear resolves and partial resolves are fairly cheap in the + * case where you render to most of the pixels, full resolves are not + * because they potentially involve reading and writing the entire + * framebuffer. If we can't texture with CCS_E, we should leave it off and + * limit ourselves to fast clears. 
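+          *
+          * For now the format also supports CCS_E, so sampling with CCS_D
+          * keeps the fast-clear data usable; formats that cannot be
+          * sampled with compression take the else branch below and get
+          * ISL_AUX_USAGE_NONE instead.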
+ */ + att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D; + } else { + att_state->input_aux_usage = ISL_AUX_USAGE_NONE; + } } assert(iview->image->aux_surface.isl.usage & ISL_SURF_USAGE_CCS_BIT); att_state->clear_color_is_zero_one = color_is_zero_one(att_state->clear_value.color, iview->isl.format); + att_state->clear_color_is_zero = + att_state->clear_value.color.uint32[0] == 0 && + att_state->clear_value.color.uint32[1] == 0 && + att_state->clear_value.color.uint32[2] == 0 && + att_state->clear_value.color.uint32[3] == 0; if (att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { /* Start off assuming fast clears are possible */ @@ -253,21 +309,36 @@ color_attachment_compute_aux_usage(struct anv_device *device, render_area.extent.height != iview->extent.height) att_state->fast_clear = false; - if (GEN_GEN <= 7) { - /* On gen7, we can't do multi-LOD or multi-layer fast-clears. We - * technically can, but it comes with crazy restrictions that we - * don't want to deal with now. - */ - if (iview->isl.base_level > 0 || - iview->isl.base_array_layer > 0 || - iview->isl.array_len > 1) - att_state->fast_clear = false; - } - /* On Broadwell and earlier, we can only handle 0/1 clear colors */ if (GEN_GEN <= 8 && !att_state->clear_color_is_zero_one) att_state->fast_clear = false; + /* We allow fast clears when all aux layers of the miplevel are targeted. + * See add_fast_clear_state_buffer() for more information. Also, because + * we only either do a fast clear or a normal clear and not both, this + * complies with the gen7 restriction of not fast-clearing multiple + * layers. + */ + if (cmd_state->framebuffer->layers != + anv_image_aux_layers(iview->image, iview->isl.base_level)) { + att_state->fast_clear = false; + if (GEN_GEN == 7) { + anv_perf_warn("Not fast-clearing the first layer in " + "a multi-layer fast clear."); + } + } + + /* We only allow fast clears in the GENERAL layout if the auxiliary + * buffer is always enabled and the fast-clear value is all 0's. See + * add_fast_clear_state_buffer() for more information. + */ + if (cmd_state->pass->attachments[att].first_subpass_layout == + VK_IMAGE_LAYOUT_GENERAL && + (!att_state->clear_color_is_zero || + iview->image->aux_usage == ISL_AUX_USAGE_NONE)) { + att_state->fast_clear = false; + } + if (att_state->fast_clear) { memcpy(fast_clear_color->u32, att_state->clear_value.color.uint32, sizeof(fast_clear_color->u32)); @@ -275,41 +346,6 @@ color_attachment_compute_aux_usage(struct anv_device *device, } else { att_state->fast_clear = false; } - - /** - * TODO: Consider using a heuristic to determine if temporarily enabling - * CCS_E for this image view would be beneficial. - * - * While fast-clear resolves and partial resolves are fairly cheap in the - * case where you render to most of the pixels, full resolves are not - * because they potentially involve reading and writing the entire - * framebuffer. If we can't texture with CCS_E, we should leave it off and - * limit ourselves to fast clears. - */ - if (iview->image->aux_usage == ISL_AUX_USAGE_CCS_E) { - att_state->aux_usage = ISL_AUX_USAGE_CCS_E; - att_state->input_aux_usage = ISL_AUX_USAGE_CCS_E; - } else if (att_state->fast_clear) { - att_state->aux_usage = ISL_AUX_USAGE_CCS_D; - /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode: - * - * "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D - * setting is only allowed if Surface Format supported for Fast - * Clear. 
In addition, if the surface is bound to the sampling - * engine, Surface Format must be supported for Render Target - * Compression for surfaces bound to the sampling engine." - * - * In other words, we can only sample from a fast-cleared image if it - * also supports color compression. - */ - if (isl_format_supports_ccs_e(&device->info, iview->isl.format)) - att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D; - else - att_state->input_aux_usage = ISL_AUX_USAGE_NONE; - } else { - att_state->aux_usage = ISL_AUX_USAGE_NONE; - att_state->input_aux_usage = ISL_AUX_USAGE_NONE; - } } static bool @@ -343,15 +379,8 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, * The undefined layout indicates that the user doesn't care about the data * that's currently in the buffer. Therefore, a data-preserving resolve * operation is not needed. - * - * The pre-initialized layout is equivalent to the undefined layout for - * optimally-tiled images. Anv only exposes support for optimally-tiled - * depth buffers. */ - if (image->aux_usage != ISL_AUX_USAGE_HIZ || - initial_layout == final_layout || - initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || - initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) + if (image->aux_usage != ISL_AUX_USAGE_HIZ || initial_layout == final_layout) return; const bool hiz_enabled = ISL_AUX_USAGE_HIZ == @@ -376,6 +405,176 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, anv_gen8_hiz_op_resolve(cmd_buffer, image, hiz_op); } +static inline uint32_t +get_fast_clear_state_entry_offset(const struct anv_device *device, + const struct anv_image *image, + unsigned level) +{ + assert(device && image); + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); + assert(level < anv_image_aux_levels(image)); + const uint32_t offset = image->offset + image->aux_surface.offset + + image->aux_surface.isl.size + + anv_fast_clear_state_entry_size(device) * level; + assert(offset < image->offset + image->size); + return offset; +} + +static void +init_fast_clear_state_entry(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + unsigned level) +{ + assert(cmd_buffer && image); + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); + assert(level < anv_image_aux_levels(image)); + + /* The fast clear value dword(s) will be copied into a surface state object. + * Ensure that the restrictions of the fields in the dword(s) are followed. + * + * CCS buffers on SKL+ can have any value set for the clear colors. + */ + if (image->samples == 1 && GEN_GEN >= 9) + return; + + /* Other combinations of auxiliary buffers and platforms require specific + * values in the clear value dword(s). + */ + unsigned i = 0; + for (; i < cmd_buffer->device->isl_dev.ss.clear_value_size; i += 4) { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + const uint32_t entry_offset = + get_fast_clear_state_entry_offset(cmd_buffer->device, image, level); + sdi.Address = (struct anv_address) { image->bo, entry_offset + i }; + + if (GEN_GEN >= 9) { + /* MCS buffers on SKL+ can only have 1/0 clear colors. */ + assert(image->aux_usage == ISL_AUX_USAGE_MCS); + sdi.ImmediateData = 0; + } else if (GEN_VERSIONx10 >= 75) { + /* Pre-SKL, the dword containing the clear values also contains + * other fields, so we need to initialize those fields to match the + * values that would be in a color attachment. 
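+             *
+             * Those other fields are the shader channel selects, which on
+             * HSW/BDW share this dword with the one-bit clear color
+             * channels; seeding the dword with the identity swizzle and
+             * zeroed clear bits matches a freshly initialized color
+             * attachment surface state.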
+ */ + assert(i == 0); + sdi.ImmediateData = ISL_CHANNEL_SELECT_RED << 25 | + ISL_CHANNEL_SELECT_GREEN << 22 | + ISL_CHANNEL_SELECT_BLUE << 19 | + ISL_CHANNEL_SELECT_ALPHA << 16; + } else if (GEN_VERSIONx10 == 70) { + /* On IVB, the dword containing the clear values also contains + * other fields that must be zero or can be zero. + */ + assert(i == 0); + sdi.ImmediateData = 0; + } + } + } +} + +/* Copy the fast-clear value dword(s) between a surface state object and an + * image's fast clear state buffer. + */ +static void +genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer, + struct anv_state surface_state, + const struct anv_image *image, + unsigned level, + bool copy_from_surface_state) +{ + assert(cmd_buffer && image); + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); + assert(level < anv_image_aux_levels(image)); + + struct anv_bo *ss_bo = + &cmd_buffer->device->surface_state_pool.block_pool.bo; + uint32_t ss_clear_offset = surface_state.offset + + cmd_buffer->device->isl_dev.ss.clear_value_offset; + uint32_t entry_offset = + get_fast_clear_state_entry_offset(cmd_buffer->device, image, level); + unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size; + + if (copy_from_surface_state) { + genX(cmd_buffer_mi_memcpy)(cmd_buffer, image->bo, entry_offset, + ss_bo, ss_clear_offset, copy_size); + } else { + genX(cmd_buffer_mi_memcpy)(cmd_buffer, ss_bo, ss_clear_offset, + image->bo, entry_offset, copy_size); + + /* Updating a surface state object may require that the state cache be + * invalidated. From the SKL PRM, Shared Functions -> State -> State + * Caching: + * + * Whenever the RENDER_SURFACE_STATE object in memory pointed to by + * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is + * modified [...], the L1 state cache must be invalidated to ensure + * the new surface or sampler state is fetched from system memory. + * + * In testing, SKL doesn't actually seem to need this, but HSW does. + */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT; + } +} + +static void +transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + const uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count, + VkImageLayout initial_layout, + VkImageLayout final_layout) +{ + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); + + if (image->aux_surface.isl.size == 0 || + base_level >= anv_image_aux_levels(image)) + return; + + if (initial_layout != VK_IMAGE_LAYOUT_UNDEFINED && + initial_layout != VK_IMAGE_LAYOUT_PREINITIALIZED) + return; + + /* A transition of a 3D subresource works on all slices at a time. */ + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(image->extent.depth, base_level); + } + + /* We're interested in the subresource range subset that has aux data. */ + level_count = MIN2(level_count, anv_image_aux_levels(image) - base_level); + + /* We're transitioning from an undefined layout. We must ensure that the + * clear values buffer is filled with valid data. + */ + for (unsigned l = 0; l < level_count; l++) + init_fast_clear_state_entry(cmd_buffer, image, base_level + l); + + if (image->aux_usage == ISL_AUX_USAGE_CCS_E || + image->aux_usage == ISL_AUX_USAGE_MCS) { + /* We're transitioning from an undefined layout so it doesn't really + * matter what data ends up in the color buffer. We do, however, need to + * ensure that the auxiliary surface is not in an undefined state. 
This
+       * state is possible for CCS buffers on SKL+ and MCS buffers with
+       * certain sample counts that require certain bits to be reserved
+       * (2x and 8x). One easy way to get to a valid state is to
+       * fast-clear the specified range.
+       *
+       * Even for MCS buffers that have sample counts that don't require
+       * certain bits to be reserved (4x and 16x), we're unsure if the
+       * hardware will be okay with the sample mappings given by the
+       * undefined buffer. We don't have any data to show that this is a
+       * problem, but we want to avoid causing difficult-to-debug problems.
+       */
+      if (image->samples == 4 || image->samples == 16) {
+         anv_perf_warn("Doing a potentially unnecessary fast-clear to define "
+                       "an MCS buffer.");
+      }
+
+      anv_image_fast_clear(cmd_buffer, image, base_level, level_count,
+                           base_layer, layer_count);
+   }
+}
 
 /**
  * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass.
@@ -390,19 +589,18 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
 
    vk_free(&cmd_buffer->pool->alloc, state->attachments);
 
-   if (pass->attachment_count == 0) {
+   if (pass->attachment_count > 0) {
+      state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
+                                    pass->attachment_count *
+                                    sizeof(state->attachments[0]),
+                                    8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+      if (state->attachments == NULL) {
+         /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
+         return anv_batch_set_error(&cmd_buffer->batch,
+                                    VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
+   } else {
       state->attachments = NULL;
-      return VK_SUCCESS;
-   }
-
-   state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
-                                 pass->attachment_count *
-                                 sizeof(state->attachments[0]),
-                                 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-   if (state->attachments == NULL) {
-      /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
-      return anv_batch_set_error(&cmd_buffer->batch,
-                                 VK_ERROR_OUT_OF_HOST_MEMORY);
-   }
    }
 
    /* Reserve one for the NULL state. */
@@ -497,8 +695,7 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
       union isl_color_value clear_color = { .u32 = { 0, } };
       if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
          color_attachment_compute_aux_usage(cmd_buffer->device,
-                                            &state->attachments[i],
-                                            iview, begin->renderArea,
+                                            state, i, begin->renderArea,
                                             &clear_color);
 
          struct isl_view view = iview->isl;
@@ -513,9 +710,9 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
                        .clear_color = clear_color,
                        .mocs = cmd_buffer->device->default_mocs);
 
-         add_image_view_relocs(cmd_buffer, iview,
-                               state->attachments[i].aux_usage,
-                               state->attachments[i].color_rt_state);
+         add_image_relocs(cmd_buffer, iview->image, iview->aspect_mask,
+                          state->attachments[i].aux_usage,
+                          state->attachments[i].color_rt_state);
       } else {
          /* This field will be initialized after the first subpass
          * transition.
@@ -537,9 +734,9 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
                        .clear_color = clear_color,
                        .mocs = cmd_buffer->device->default_mocs);
 
-         add_image_view_relocs(cmd_buffer, iview,
-                               state->attachments[i].input_aux_usage,
-                               state->attachments[i].input_att_state);
+         add_image_relocs(cmd_buffer, iview->image, iview->aspect_mask,
+                          state->attachments[i].input_aux_usage,
+                          state->attachments[i].input_att_state);
       }
    }
 
@@ -665,14 +862,15 @@ genX(CmdExecuteCommands)(
          * copy the surface states for the current subpass into the storage
          * we allocated for them in BeginCommandBuffer.
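          * (so_memcpy is the streamout-based flavor of the driver's GPU
          * memcpy; the MI-command-based cmd_buffer_mi_memcpy used for the
          * clear value dwords elsewhere in this diff is the other flavor.)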
*/ - struct anv_bo *ss_bo = &primary->device->surface_state_block_pool.bo; + struct anv_bo *ss_bo = + &primary->device->surface_state_pool.block_pool.bo; struct anv_state src_state = primary->state.render_pass_states; struct anv_state dst_state = secondary->state.render_pass_states; assert(src_state.alloc_size == dst_state.alloc_size); - genX(cmd_buffer_gpu_memcpy)(primary, ss_bo, dst_state.offset, - ss_bo, src_state.offset, - src_state.alloc_size); + genX(cmd_buffer_so_memcpy)(primary, ss_bo, dst_state.offset, + ss_bo, src_state.offset, + src_state.alloc_size); } anv_cmd_buffer_add_secondary(primary, secondary); @@ -805,7 +1003,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, anv_pack_struct(&l3cr2, GENX(L3CNTLREG2), .SLMEnable = has_slm, .URBLowBandwidth = urb_low_bw, - .URBAllocation = cfg->n[GEN_L3P_URB], + .URBAllocation = cfg->n[GEN_L3P_URB] - n0_urb, #if !GEN_IS_HASWELL .ALLAllocation = cfg->n[GEN_L3P_ALL], #endif @@ -957,11 +1155,21 @@ void genX(CmdPipelineBarrier)( src_flags |= pImageMemoryBarriers[i].srcAccessMask; dst_flags |= pImageMemoryBarriers[i].dstAccessMask; ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[i].image); - if (pImageMemoryBarriers[i].subresourceRange.aspectMask & - VK_IMAGE_ASPECT_DEPTH_BIT) { + const VkImageSubresourceRange *range = + &pImageMemoryBarriers[i].subresourceRange; + + if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { transition_depth_buffer(cmd_buffer, image, pImageMemoryBarriers[i].oldLayout, pImageMemoryBarriers[i].newLayout); + } else if (range->aspectMask == VK_IMAGE_ASPECT_COLOR_BIT) { + transition_color_buffer(cmd_buffer, image, + range->baseMipLevel, + anv_get_levelCount(image, range), + range->baseArrayLayer, + anv_get_layerCount(image, range), + pImageMemoryBarriers[i].oldLayout, + pImageMemoryBarriers[i].newLayout); } } @@ -1154,8 +1362,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, desc->image_view->no_aux_sampler_surface_state : desc->image_view->sampler_surface_state; assert(surface_state.alloc_size); - add_image_view_relocs(cmd_buffer, desc->image_view, - desc->aux_usage, surface_state); + add_image_relocs(cmd_buffer, desc->image_view->image, + desc->image_view->aspect_mask, + desc->aux_usage, surface_state); break; case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: assert(stage == MESA_SHADER_FRAGMENT); @@ -1167,8 +1376,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, desc->image_view->no_aux_sampler_surface_state : desc->image_view->sampler_surface_state; assert(surface_state.alloc_size); - add_image_view_relocs(cmd_buffer, desc->image_view, - desc->aux_usage, surface_state); + add_image_relocs(cmd_buffer, desc->image_view->image, + desc->image_view->aspect_mask, + desc->aux_usage, surface_state); } else { /* For color input attachments, we create the surface state at * vkBeginRenderPass time so that we can include aux and clear @@ -1186,9 +1396,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, ? 
desc->image_view->writeonly_storage_surface_state : desc->image_view->storage_surface_state; assert(surface_state.alloc_size); - add_image_view_relocs(cmd_buffer, desc->image_view, - desc->image_view->image->aux_usage, - surface_state); + add_image_relocs(cmd_buffer, desc->image_view->image, + desc->image_view->aspect_mask, + desc->image_view->image->aux_usage, surface_state); struct brw_image_param *image_param = &cmd_buffer->state.push_constants[stage]->images[image++]; @@ -1444,11 +1654,11 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer) c._3DCommandSubOpcode = push_constant_opcodes[stage], c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) { #if GEN_GEN >= 9 - .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset }, - .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32), + .Buffer[2] = { &cmd_buffer->device->dynamic_state_pool.block_pool.bo, state.offset }, + .ReadLength[2] = DIV_ROUND_UP(state.alloc_size, 32), #else - .PointerToConstantBuffer0 = { .offset = state.offset }, - .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32), + .Buffer[0] = { .offset = state.offset }, + .ReadLength[0] = DIV_ROUND_UP(state.alloc_size, 32), #endif }; } @@ -1494,7 +1704,12 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) .MemoryObjectControlState = GENX(MOCS), #else .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA, - .InstanceDataStepRate = 1, + /* Our implementation of VK_KHR_multiview uses instancing to draw + * the different views. If the client asks for instancing, we + * need to use the Instance Data Step Rate to ensure that we + * repeat the client's per-instance data once for each view. + */ + .InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass), .VertexBufferMemoryObjectControlState = GENX(MOCS), #endif @@ -1645,7 +1860,7 @@ emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, anv_state_flush(cmd_buffer->device, id_state); emit_base_vertex_instance_bo(cmd_buffer, - &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset); + &cmd_buffer->device->dynamic_state_pool.block_pool.bo, id_state.offset); } static void @@ -1659,7 +1874,7 @@ emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index) anv_state_flush(cmd_buffer->device, state); emit_vertex_bo(cmd_buffer, - &cmd_buffer->device->dynamic_state_block_pool.bo, + &cmd_buffer->device->dynamic_state_pool.block_pool.bo, state.offset, 4, ANV_DRAWID_VB_INDEX); } @@ -1684,6 +1899,11 @@ void genX(CmdDraw)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views. We need to multiply instanceCount by the view count. + */ + instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.VertexAccessType = SEQUENTIAL; prim.PrimitiveTopologyType = pipeline->topology; @@ -1717,6 +1937,11 @@ void genX(CmdDrawIndexed)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views. We need to multiply instanceCount by the view count. 
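+    * For example, a draw with instanceCount == 3 in a subpass with two
+    * views is emitted as six hardware instances; the shader then derives
+    * the view index from the instance id.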
+ */ + instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.VertexAccessType = RANDOM; prim.PrimitiveTopologyType = pipeline->topology; @@ -1736,6 +1961,112 @@ void genX(CmdDrawIndexed)( #define GEN7_3DPRIM_START_INSTANCE 0x243C #define GEN7_3DPRIM_BASE_VERTEX 0x2440 +/* MI_MATH only exists on Haswell+ */ +#if GEN_IS_HASWELL || GEN_GEN >= 8 + +static uint32_t +mi_alu(uint32_t opcode, uint32_t op1, uint32_t op2) +{ + struct GENX(MI_MATH_ALU_INSTRUCTION) instr = { + .ALUOpcode = opcode, + .Operand1 = op1, + .Operand2 = op2, + }; + + uint32_t dw; + GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr); + + return dw; +} + +#define CS_GPR(n) (0x2600 + (n) * 8) + +/* Emit dwords to multiply GPR0 by N */ +static void +build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N) +{ + VK_OUTARRAY_MAKE(out, dw, dw_count); + +#define append_alu(opcode, operand1, operand2) \ + vk_outarray_append(&out, alu_dw) *alu_dw = mi_alu(opcode, operand1, operand2) + + assert(N > 0); + unsigned top_bit = 31 - __builtin_clz(N); + for (int i = top_bit - 1; i >= 0; i--) { + /* We get our initial data in GPR0 and we write the final data out to + * GPR0 but we use GPR1 as our scratch register. + */ + unsigned src_reg = i == top_bit - 1 ? MI_ALU_REG0 : MI_ALU_REG1; + unsigned dst_reg = i == 0 ? MI_ALU_REG0 : MI_ALU_REG1; + + /* Shift the current value left by 1 */ + append_alu(MI_ALU_LOAD, MI_ALU_SRCA, src_reg); + append_alu(MI_ALU_LOAD, MI_ALU_SRCB, src_reg); + append_alu(MI_ALU_ADD, 0, 0); + + if (N & (1 << i)) { + /* Store ACCU to R1 and add R0 to R1 */ + append_alu(MI_ALU_STORE, MI_ALU_REG1, MI_ALU_ACCU); + append_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0); + append_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1); + append_alu(MI_ALU_ADD, 0, 0); + } + + append_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU); + } + +#undef append_alu +} + +static void +emit_mul_gpr0(struct anv_batch *batch, uint32_t N) +{ + uint32_t num_dwords; + build_alu_multiply_gpr0(NULL, &num_dwords, N); + + uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords, GENX(MI_MATH)); + build_alu_multiply_gpr0(dw + 1, &num_dwords, N); +} + +#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */ + +static void +load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, + struct anv_buffer *buffer, uint64_t offset, + bool indexed) +{ + struct anv_batch *batch = &cmd_buffer->batch; + struct anv_bo *bo = buffer->bo; + uint32_t bo_offset = buffer->offset + offset; + + emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset); + + unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass); + if (view_count > 1) { +#if GEN_IS_HASWELL || GEN_GEN >= 8 + emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4); + emit_mul_gpr0(batch, view_count); + emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0)); +#else + anv_finishme("Multiview + indirect draw requires MI_MATH\n" + "MI_MATH is not supported on Ivy Bridge"); + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); +#endif + } else { + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); + } + + emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8); + + if (indexed) { + emit_lrm(batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12); + emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16); + } else { + emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12); + emit_lri(batch, GEN7_3DPRIM_BASE_VERTEX, 0); + } +} + void genX(CmdDrawIndirect)( VkCommandBuffer commandBuffer, VkBuffer _buffer, @@ 
-1747,29 +2078,30 @@ void genX(CmdDrawIndirect)( ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); struct anv_pipeline *pipeline = cmd_buffer->state.pipeline; const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); - struct anv_bo *bo = buffer->bo; - uint32_t bo_offset = buffer->offset + offset; if (anv_batch_has_error(&cmd_buffer->batch)) return; genX(cmd_buffer_flush_state)(cmd_buffer); - if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) - emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8); - if (vs_prog_data->uses_drawid) - emit_draw_index(cmd_buffer, 0); + for (uint32_t i = 0; i < drawCount; i++) { + struct anv_bo *bo = buffer->bo; + uint32_t bo_offset = buffer->offset + offset; - emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset); - emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); - emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8); - emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12); - emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0); + if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); - anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { - prim.IndirectParameterEnable = true; - prim.VertexAccessType = SEQUENTIAL; - prim.PrimitiveTopologyType = pipeline->topology; + load_indirect_parameters(cmd_buffer, buffer, offset, false); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = pipeline->topology; + } + + offset += stride; } } @@ -1784,30 +2116,31 @@ void genX(CmdDrawIndexedIndirect)( ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); struct anv_pipeline *pipeline = cmd_buffer->state.pipeline; const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); - struct anv_bo *bo = buffer->bo; - uint32_t bo_offset = buffer->offset + offset; if (anv_batch_has_error(&cmd_buffer->batch)) return; genX(cmd_buffer_flush_state)(cmd_buffer); - /* TODO: We need to stomp base vertex to 0 somehow */ - if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) - emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12); - if (vs_prog_data->uses_drawid) - emit_draw_index(cmd_buffer, 0); + for (uint32_t i = 0; i < drawCount; i++) { + struct anv_bo *bo = buffer->bo; + uint32_t bo_offset = buffer->offset + offset; - emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset); - emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); - emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8); - emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12); - emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16); + /* TODO: We need to stomp base vertex to 0 somehow */ + if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); - anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { - prim.IndirectParameterEnable = true; - prim.VertexAccessType = RANDOM; - prim.PrimitiveTopologyType = pipeline->topology; + load_indirect_parameters(cmd_buffer, buffer, offset, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + 
prim.IndirectParameterEnable = true; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = pipeline->topology; + } + + offset += stride; } } @@ -1962,7 +2295,7 @@ void genX(CmdDispatch)( anv_state_flush(cmd_buffer->device, state); cmd_buffer->state.num_workgroups_offset = state.offset; cmd_buffer->state.num_workgroups_bo = - &cmd_buffer->device->dynamic_state_block_pool.bo; + &cmd_buffer->device->dynamic_state_pool.block_pool.bo; } genX(cmd_buffer_flush_compute_state)(cmd_buffer); @@ -2312,8 +2645,9 @@ cmd_buffer_subpass_transition_layouts(struct anv_cmd_buffer * const cmd_buffer, */ assert(att_ref->attachment < cmd_state->framebuffer->attachment_count); - const struct anv_image * const image = - cmd_state->framebuffer->attachments[att_ref->attachment]->image; + const struct anv_image_view * const iview = + cmd_state->framebuffer->attachments[att_ref->attachment]; + const struct anv_image * const image = iview->image; /* Perform the layout transition. */ if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { @@ -2322,12 +2656,78 @@ cmd_buffer_subpass_transition_layouts(struct anv_cmd_buffer * const cmd_buffer, att_state->aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, image, image->aspects, target_layout); + } else if (image->aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + transition_color_buffer(cmd_buffer, image, + iview->isl.base_level, 1, + iview->isl.base_array_layer, + iview->isl.array_len, + att_state->current_layout, target_layout); } att_state->current_layout = target_layout; } } +/* Update the clear value dword(s) in surface state objects or the fast clear + * state buffer entry for the color attachments used in this subpass. + */ +static void +cmd_buffer_subpass_sync_fast_clear_values(struct anv_cmd_buffer *cmd_buffer) +{ + assert(cmd_buffer && cmd_buffer->state.subpass); + + const struct anv_cmd_state *state = &cmd_buffer->state; + + /* Iterate through every color attachment used in this subpass. */ + for (uint32_t i = 0; i < state->subpass->color_count; ++i) { + + /* The attachment should be one of the attachments described in the + * render pass and used in the subpass. + */ + const uint32_t a = state->subpass->color_attachments[i].attachment; + assert(a < state->pass->attachment_count); + if (a == VK_ATTACHMENT_UNUSED) + continue; + + /* Store some information regarding this attachment. */ + const struct anv_attachment_state *att_state = &state->attachments[a]; + const struct anv_image_view *iview = state->framebuffer->attachments[a]; + const struct anv_render_pass_attachment *rp_att = + &state->pass->attachments[a]; + + if (att_state->aux_usage == ISL_AUX_USAGE_NONE) + continue; + + /* The fast clear state entry must be updated if a fast clear is going to + * happen. The surface state must be updated if the clear value from a + * prior fast clear may be needed. + */ + if (att_state->pending_clear_aspects && att_state->fast_clear) { + /* Update the fast clear state entry. */ + genX(copy_fast_clear_dwords)(cmd_buffer, att_state->color_rt_state, + iview->image, iview->isl.base_level, + true /* copy from ss */); + } else if (rp_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { + /* The attachment may have been fast-cleared in a previous render + * pass and the value is needed now. Update the surface state(s). + * + * TODO: Do this only once per render pass instead of every subpass. 
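+          *
+          * (This copy runs in the opposite direction from the fast-clear
+          * case above: the clear value travels from the image's fast
+          * clear state buffer back into the surface state.)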
+ */ + genX(copy_fast_clear_dwords)(cmd_buffer, att_state->color_rt_state, + iview->image, iview->isl.base_level, + false /* copy to ss */); + + if (need_input_attachment_state(rp_att) && + att_state->input_aux_usage != ISL_AUX_USAGE_NONE) { + genX(copy_fast_clear_dwords)(cmd_buffer, att_state->input_att_state, + iview->image, iview->isl.base_level, + false /* copy to ss */); + } + } + } +} + + static void genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer, struct anv_subpass *subpass) @@ -2336,11 +2736,30 @@ genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views. If the client asks for instancing, we need to use the + * Instance Data Step Rate to ensure that we repeat the client's + * per-instance data once for each view. Since this bit is in + * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers at the top + * of each subpass. + */ + if (GEN_GEN == 7) + cmd_buffer->state.vb_dirty |= ~0; + /* Perform transitions to the subpass layout before any writes have * occurred. */ cmd_buffer_subpass_transition_layouts(cmd_buffer, false); + /* Update clear values *after* performing automatic layout transitions. + * This ensures that transitions from the UNDEFINED layout have had a chance + * to populate the clear value buffer with the correct values for the + * LOAD_OP_LOAD loadOp and that the fast-clears will update the buffer + * without the aforementioned layout transition overwriting the fast-clear + * value. + */ + cmd_buffer_subpass_sync_fast_clear_values(cmd_buffer); + cmd_buffer_emit_depth_stencil(cmd_buffer); anv_cmd_buffer_clear_subpass(cmd_buffer);
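
The shift-and-add scheme that build_alu_multiply_gpr0() encodes as MI_MATH
ALU dwords is the subtlest part of the change above: the instance count only
exists in a GPU register after the MI_LOAD_REGISTER_MEM, so the view-count
multiply has to be performed by the command streamer itself. A CPU-side model
of the same recurrence (an illustrative sketch only; the helper name is
invented and the 64-bit GPR width is reduced to 32 bits) looks like this:

#include <assert.h>
#include <stdint.h>

/* CPU model of build_alu_multiply_gpr0(): walk the bits of N from the
 * second-highest bit down, doubling the running value each step (the
 * LOAD SRCA/SRCB + ADD pair) and adding the original input back in
 * whenever the current bit of N is set.  r0 models CS_GPR(0), which holds
 * both the input and the result; r1 models the scratch register CS_GPR(1).
 */
static uint32_t
mul_gpr0_model(uint32_t gpr0, uint32_t N)
{
   assert(N > 0);
   const int top_bit = 31 - __builtin_clz(N);
   uint32_t r0 = gpr0, r1 = 0;

   for (int i = top_bit - 1; i >= 0; i--) {
      /* The first iteration reads the input from r0; later ones from r1. */
      const uint32_t src = (i == top_bit - 1) ? r0 : r1;
      uint32_t accu = src + src;    /* "shift left by 1" via ADD */

      if (N & (1u << i))
         accu += r0;                /* add the original value back in */

      if (i == 0)
         r0 = accu;                 /* the final result lands in GPR0 */
      else
         r1 = accu;
   }

   return r0;                       /* == gpr0 * N, modulo 2^32 */
}

emit_mul_gpr0() then emits exactly this sequence, with each step lowered to
MI_MATH LOAD/ADD/STORE dwords, and load_indirect_parameters() feeds the
product straight into GEN7_3DPRIM_INSTANCE_COUNT.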