diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index f94b308f995..6a3e525eb3b 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -26,6 +26,7 @@

 #include "anv_private.h"
 #include "vk_format_info.h"
+#include "vk_util.h"

 #include "common/gen_l3_config.h"
 #include "genxml/gen_macros.h"
@@ -50,6 +51,17 @@ emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
    }
 }

+#if GEN_IS_HASWELL || GEN_GEN >= 8
+static void
+emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src)
+{
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
+      lrr.SourceRegisterAddress = src;
+      lrr.DestinationRegisterAddress = dst;
+   }
+}
+#endif
+
 void
 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
 {
@@ -79,7 +91,7 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
       sba.SurfaceStateBaseAddressModifyEnable = true;

       sba.DynamicStateBaseAddress =
-         (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
+         (struct anv_address) { &device->dynamic_state_pool.block_pool.bo, 0 };
       sba.DynamicStateMemoryObjectControlState = GENX(MOCS);
       sba.DynamicStateBaseAddressModifyEnable = true;

@@ -88,7 +100,7 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
       sba.IndirectObjectBaseAddressModifyEnable = true;

       sba.InstructionBaseAddress =
-         (struct anv_address) { &device->instruction_block_pool.bo, 0 };
+         (struct anv_address) { &device->instruction_state_pool.block_pool.bo, 0 };
       sba.InstructionMemoryObjectControlState = GENX(MOCS);
       sba.InstructionBaseAddressModifyEnable = true;

@@ -167,17 +179,20 @@ add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer,
 }

 static void
-add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer,
-                      const struct anv_image_view *iview,
-                      enum isl_aux_usage aux_usage,
-                      struct anv_state state)
+add_image_relocs(struct anv_cmd_buffer * const cmd_buffer,
+                 const struct anv_image * const image,
+                 const VkImageAspectFlags aspect_mask,
+                 const enum isl_aux_usage aux_usage,
+                 const struct anv_state state)
 {
    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
+   const uint32_t surf_offset = image->offset +
+      anv_image_get_surface_for_aspect_mask(image, aspect_mask)->offset;

-   add_surface_state_reloc(cmd_buffer, state, iview->bo, iview->offset);
+   add_surface_state_reloc(cmd_buffer, state, image->bo, surf_offset);

    if (aux_usage != ISL_AUX_USAGE_NONE) {
-      uint32_t aux_offset = iview->offset + iview->image->aux_surface.offset;
+      uint32_t aux_offset = image->offset + image->aux_surface.offset;

       /* On gen7 and prior, the bottom 12 bits of the MCS base address are
        * used to store other information. This should be ok, however, because
@@ -191,7 +206,7 @@ add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer,
          anv_reloc_list_add(&cmd_buffer->surface_relocs,
                             &cmd_buffer->pool->alloc,
                             state.offset + isl_dev->ss.aux_addr_offset,
-                            iview->bo, aux_offset);
+                            image->bo, aux_offset);
       if (result != VK_SUCCESS)
          anv_batch_set_error(&cmd_buffer->batch, result);
    }
 }
@@ -291,27 +306,21 @@ color_attachment_compute_aux_usage(struct anv_device *device,
          att_state->input_aux_usage = ISL_AUX_USAGE_CCS_E;
    } else if (att_state->fast_clear) {
       att_state->aux_usage = ISL_AUX_USAGE_CCS_D;
-      if (GEN_GEN >= 9 &&
-          !isl_format_supports_ccs_e(&device->info, iview->isl.format)) {
-         /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode:
-          *
-          *    "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D
-          *    setting is only allowed if Surface Format supported for Fast
-          *    Clear. In addition, if the surface is bound to the sampling
-          *    engine, Surface Format must be supported for Render Target
-          *    Compression for surfaces bound to the sampling engine."
-          *
-          * In other words, we can't sample from a fast-cleared image if it
-          * doesn't also support color compression.
-          */
-         att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
-      } else if (GEN_GEN >= 8) {
-         /* Broadwell/Skylake can sample from fast-cleared images */
+      /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode:
+       *
+       *    "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D
+       *    setting is only allowed if Surface Format supported for Fast
+       *    Clear. In addition, if the surface is bound to the sampling
+       *    engine, Surface Format must be supported for Render Target
+       *    Compression for surfaces bound to the sampling engine."
+       *
+       * In other words, we can only sample from a fast-cleared image if it
+       * also supports color compression.
+       */
+      if (isl_format_supports_ccs_e(&device->info, iview->isl.format))
          att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D;
-      } else {
-         /* Ivy Bridge and Haswell cannot */
+      else
         att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
-      }
    } else {
       att_state->aux_usage = ISL_AUX_USAGE_NONE;
       att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
@@ -349,15 +358,8 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
     * The undefined layout indicates that the user doesn't care about the data
     * that's currently in the buffer. Therefore, a data-preserving resolve
     * operation is not needed.
-    *
-    * The pre-initialized layout is equivalent to the undefined layout for
-    * optimally-tiled images. Anv only exposes support for optimally-tiled
-    * depth buffers.
     */
-   if (image->aux_usage != ISL_AUX_USAGE_HIZ ||
-       initial_layout == final_layout ||
-       initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
-       initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED)
+   if (image->aux_usage != ISL_AUX_USAGE_HIZ || initial_layout == final_layout)
       return;

    const bool hiz_enabled = ISL_AUX_USAGE_HIZ ==
@@ -382,6 +384,37 @@

    anv_gen8_hiz_op_resolve(cmd_buffer, image, hiz_op);
 }

+static void
+transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
+                        const struct anv_image *image,
+                        const uint32_t base_level, uint32_t level_count,
+                        uint32_t base_layer, uint32_t layer_count,
+                        VkImageLayout initial_layout,
+                        VkImageLayout final_layout)
+{
+   if (image->aux_usage != ISL_AUX_USAGE_CCS_E)
+      return;
+
+   if (initial_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
+       initial_layout != VK_IMAGE_LAYOUT_PREINITIALIZED)
+      return;
+
+   /* A transition of a 3D subresource works on all slices at a time. */
+   if (image->type == VK_IMAGE_TYPE_3D) {
+      base_layer = 0;
+      layer_count = anv_minify(image->extent.depth, base_level);
+   }
+
+#if GEN_GEN >= 9
+   /* We're transitioning from an undefined layout so it doesn't really matter
+    * what data ends up in the color buffer. We do, however, need to ensure
+    * that the CCS has valid data in it. One easy way to do that is to
+    * fast-clear the specified range.
+    */
+   anv_image_ccs_clear(cmd_buffer, image, base_level, level_count,
+                       base_layer, layer_count);
+#endif
+}

 /**
  * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass.
@@ -411,23 +444,15 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
                             VK_ERROR_OUT_OF_HOST_MEMORY);
    }

-   bool need_null_state = false;
-   unsigned num_states = 0;
+   /* Reserve one for the NULL state. */
+   unsigned num_states = 1;
    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
-      if (vk_format_is_color(pass->attachments[i].format)) {
+      if (vk_format_is_color(pass->attachments[i].format))
          num_states++;
-      } else {
-         /* We need a null state for any depth-stencil-only subpasses.
-          * Importantly, this includes depth/stencil clears so we create one
-          * whenever we have depth or stencil
-          */
-         need_null_state = true;
-      }

       if (need_input_attachment_state(&pass->attachments[i]))
          num_states++;
    }
-   num_states += need_null_state;

    const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
    state->render_pass_states =
@@ -437,11 +462,9 @@
    struct anv_state next_state = state->render_pass_states;
    next_state.alloc_size = isl_dev->ss.size;

-   if (need_null_state) {
-      state->null_surface_state = next_state;
-      next_state.offset += ss_stride;
-      next_state.map += ss_stride;
-   }
+   state->null_surface_state = next_state;
+   next_state.offset += ss_stride;
+   next_state.map += ss_stride;

    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
       if (vk_format_is_color(pass->attachments[i].format)) {
@@ -463,24 +486,22 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
    ANV_FROM_HANDLE(anv_framebuffer, framebuffer, begin->framebuffer);
    assert(pass->attachment_count == framebuffer->attachment_count);

-   if (need_null_state) {
-      struct GENX(RENDER_SURFACE_STATE) null_ss = {
-         .SurfaceType = SURFTYPE_NULL,
-         .SurfaceArray = framebuffer->layers > 0,
-         .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
+   struct GENX(RENDER_SURFACE_STATE) null_ss = {
+      .SurfaceType = SURFTYPE_NULL,
+      .SurfaceArray = framebuffer->layers > 0,
+      .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
 #if GEN_GEN >= 8
-         .TileMode = YMAJOR,
+      .TileMode = YMAJOR,
 #else
-         .TiledSurface = true,
+      .TiledSurface = true,
 #endif
-         .Width = framebuffer->width - 1,
-         .Height = framebuffer->height - 1,
-         .Depth = framebuffer->layers - 1,
-         .RenderTargetViewExtent = framebuffer->layers - 1,
-      };
-      GENX(RENDER_SURFACE_STATE_pack)(NULL, state->null_surface_state.map,
-                                      &null_ss);
-   }
+      .Width = framebuffer->width - 1,
+      .Height = framebuffer->height - 1,
+      .Depth = framebuffer->layers - 1,
+      .RenderTargetViewExtent = framebuffer->layers - 1,
+   };
+   GENX(RENDER_SURFACE_STATE_pack)(NULL, state->null_surface_state.map,
+                                   &null_ss);

    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
       struct anv_render_pass_attachment *att = &pass->attachments[i];
@@ -531,9 +552,9 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
                               .clear_color = clear_color,
                               .mocs = cmd_buffer->device->default_mocs);

-         add_image_view_relocs(cmd_buffer, iview,
-                               state->attachments[i].aux_usage,
-                               state->attachments[i].color_rt_state);
+         add_image_relocs(cmd_buffer, iview->image, iview->aspect_mask,
+                          state->attachments[i].aux_usage,
+                          state->attachments[i].color_rt_state);
       } else {
          /* This field will be initialized after the first subpass
           * transition.
           */
@@ -555,9 +576,9 @@ genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
                               .clear_color = clear_color,
                               .mocs = cmd_buffer->device->default_mocs);

-         add_image_view_relocs(cmd_buffer, iview,
-                               state->attachments[i].input_aux_usage,
-                               state->attachments[i].input_att_state);
+         add_image_relocs(cmd_buffer, iview->image, iview->aspect_mask,
+                          state->attachments[i].input_aux_usage,
+                          state->attachments[i].input_att_state);
       }
    }

@@ -597,6 +618,18 @@ genX(BeginCommandBuffer)(
    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);

+   /* We sometimes store vertex data in the dynamic state buffer for blorp
+    * operations and our dynamic state stream may re-use data from previous
+    * command buffers. In order to prevent stale cache data, we flush the VF
+    * cache. We could do this on every blorp call but that's not really
+    * needed as all of the data will get written by the CPU prior to the GPU
+    * executing anything. The chances are fairly high that they will use
+    * blorp at least once per primary command buffer so it shouldn't be
+    * wasted.
+    */
+   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
+      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+
    VkResult result = VK_SUCCESS;

    if (cmd_buffer->usage_flags &
        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
@@ -671,7 +704,8 @@ genX(CmdExecuteCommands)(
           * copy the surface states for the current subpass into the storage
           * we allocated for them in BeginCommandBuffer.
           */
-         struct anv_bo *ss_bo = &primary->device->surface_state_block_pool.bo;
+         struct anv_bo *ss_bo =
+            &primary->device->surface_state_pool.block_pool.bo;
          struct anv_state src_state = primary->state.render_pass_states;
          struct anv_state dst_state = secondary->state.render_pass_states;
          assert(src_state.alloc_size == dst_state.alloc_size);
@@ -811,7 +845,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
    anv_pack_struct(&l3cr2, GENX(L3CNTLREG2),
                    .SLMEnable = has_slm,
                    .URBLowBandwidth = urb_low_bw,
-                   .URBAllocation = cfg->n[GEN_L3P_URB],
+                   .URBAllocation = cfg->n[GEN_L3P_URB] - n0_urb,
 #if !GEN_IS_HASWELL
                    .ALLAllocation = cfg->n[GEN_L3P_ALL],
 #endif
@@ -963,11 +997,21 @@ void genX(CmdPipelineBarrier)(
       src_flags |= pImageMemoryBarriers[i].srcAccessMask;
       dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
       ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[i].image);
-      if (pImageMemoryBarriers[i].subresourceRange.aspectMask &
-          VK_IMAGE_ASPECT_DEPTH_BIT) {
+      const VkImageSubresourceRange *range =
+         &pImageMemoryBarriers[i].subresourceRange;
+
+      if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
          transition_depth_buffer(cmd_buffer, image,
                                  pImageMemoryBarriers[i].oldLayout,
                                  pImageMemoryBarriers[i].newLayout);
+      } else if (range->aspectMask == VK_IMAGE_ASPECT_COLOR_BIT) {
+         transition_color_buffer(cmd_buffer, image,
+                                 range->baseMipLevel,
+                                 anv_get_levelCount(image, range),
+                                 range->baseArrayLayer,
+                                 anv_get_layerCount(image, range),
+                                 pImageMemoryBarriers[i].oldLayout,
+                                 pImageMemoryBarriers[i].newLayout);
       }
    }

@@ -1122,8 +1166,20 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
          assert(stage == MESA_SHADER_FRAGMENT);
          assert(binding->binding == 0);
          if (binding->index < subpass->color_count) {
-            const unsigned att = subpass->color_attachments[binding->index].attachment;
-            surface_state = cmd_buffer->state.attachments[att].color_rt_state;
+            const unsigned att =
+               subpass->color_attachments[binding->index].attachment;
+
+            /* From the Vulkan 1.0.46 spec:
+             *
+             *    "If any color or depth/stencil attachments are
+             *    VK_ATTACHMENT_UNUSED, then no writes occur for those
+             *    attachments."
+             */
+            if (att == VK_ATTACHMENT_UNUSED) {
+               surface_state = cmd_buffer->state.null_surface_state;
+            } else {
+               surface_state = cmd_buffer->state.attachments[att].color_rt_state;
+            }
          } else {
            surface_state = cmd_buffer->state.null_surface_state;
          }
@@ -1148,8 +1204,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
            desc->image_view->no_aux_sampler_surface_state :
            desc->image_view->sampler_surface_state;
         assert(surface_state.alloc_size);
-        add_image_view_relocs(cmd_buffer, desc->image_view,
-                              desc->aux_usage, surface_state);
+        add_image_relocs(cmd_buffer, desc->image_view->image,
+                         desc->image_view->aspect_mask,
+                         desc->aux_usage, surface_state);
         break;
      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
         assert(stage == MESA_SHADER_FRAGMENT);
@@ -1161,8 +1218,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
               desc->image_view->no_aux_sampler_surface_state :
               desc->image_view->sampler_surface_state;
            assert(surface_state.alloc_size);
-           add_image_view_relocs(cmd_buffer, desc->image_view,
-                                 desc->aux_usage, surface_state);
+           add_image_relocs(cmd_buffer, desc->image_view->image,
+                            desc->image_view->aspect_mask,
+                            desc->aux_usage, surface_state);
         } else {
            /* For color input attachments, we create the surface state at
             * vkBeginRenderPass time so that we can include aux and clear
@@ -1180,9 +1238,9 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
            ? desc->image_view->writeonly_storage_surface_state
            : desc->image_view->storage_surface_state;
         assert(surface_state.alloc_size);
-        add_image_view_relocs(cmd_buffer, desc->image_view,
-                              desc->image_view->image->aux_usage,
-                              surface_state);
+        add_image_relocs(cmd_buffer, desc->image_view->image,
+                         desc->image_view->aspect_mask,
+                         desc->image_view->image->aux_usage, surface_state);

         struct brw_image_param *image_param =
            &cmd_buffer->state.push_constants[stage]->images[image++];
@@ -1438,11 +1496,11 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
            c._3DCommandSubOpcode = push_constant_opcodes[stage],
            c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
 #if GEN_GEN >= 9
-               .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
-               .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
+               .Buffer[2] = { &cmd_buffer->device->dynamic_state_pool.block_pool.bo, state.offset },
+               .ReadLength[2] = DIV_ROUND_UP(state.alloc_size, 32),
 #else
-               .PointerToConstantBuffer0 = { .offset = state.offset },
-               .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
+               .Buffer[0] = { .offset = state.offset },
+               .ReadLength[0] = DIV_ROUND_UP(state.alloc_size, 32),
 #endif
            };
         }
@@ -1488,7 +1546,12 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
            .MemoryObjectControlState = GENX(MOCS),
 #else
            .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
-           .InstanceDataStepRate = 1,
+           /* Our implementation of VK_KHR_multiview uses instancing to draw
+            * the different views. If the client asks for instancing, we
+            * need to use the Instance Data Step Rate to ensure that we
+            * repeat the client's per-instance data once for each view.
+            */
+           .InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass),
            .VertexBufferMemoryObjectControlState = GENX(MOCS),
 #endif

@@ -1639,7 +1702,7 @@ emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
    anv_state_flush(cmd_buffer->device, id_state);

    emit_base_vertex_instance_bo(cmd_buffer,
-      &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
+      &cmd_buffer->device->dynamic_state_pool.block_pool.bo, id_state.offset);
 }

 static void
@@ -1653,7 +1716,7 @@ emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
    anv_state_flush(cmd_buffer->device, state);

    emit_vertex_bo(cmd_buffer,
-                  &cmd_buffer->device->dynamic_state_block_pool.bo,
+                  &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
                   state.offset, 4, ANV_DRAWID_VB_INDEX);
 }

@@ -1678,6 +1741,11 @@ void genX(CmdDraw)(
    if (vs_prog_data->uses_drawid)
       emit_draw_index(cmd_buffer, 0);

+   /* Our implementation of VK_KHR_multiview uses instancing to draw the
+    * different views. We need to multiply instanceCount by the view count.
+    */
+   instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
+
    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
       prim.VertexAccessType = SEQUENTIAL;
       prim.PrimitiveTopologyType = pipeline->topology;
@@ -1711,6 +1779,11 @@ void genX(CmdDrawIndexed)(
    if (vs_prog_data->uses_drawid)
       emit_draw_index(cmd_buffer, 0);

+   /* Our implementation of VK_KHR_multiview uses instancing to draw the
+    * different views. We need to multiply instanceCount by the view count.
+    */
+   instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
+
    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
       prim.VertexAccessType = RANDOM;
       prim.PrimitiveTopologyType = pipeline->topology;
@@ -1730,6 +1803,112 @@
 #define GEN7_3DPRIM_START_INSTANCE 0x243C
 #define GEN7_3DPRIM_BASE_VERTEX 0x2440

+/* MI_MATH only exists on Haswell+ */
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+
+static uint32_t
+mi_alu(uint32_t opcode, uint32_t op1, uint32_t op2)
+{
+   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
+      .ALUOpcode = opcode,
+      .Operand1 = op1,
+      .Operand2 = op2,
+   };
+
+   uint32_t dw;
+   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);
+
+   return dw;
+}
+
+#define CS_GPR(n) (0x2600 + (n) * 8)
+
+/* Emit dwords to multiply GPR0 by N */
+static void
+build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
+{
+   VK_OUTARRAY_MAKE(out, dw, dw_count);
+
+#define append_alu(opcode, operand1, operand2) \
+   vk_outarray_append(&out, alu_dw) *alu_dw = mi_alu(opcode, operand1, operand2)
+
+   assert(N > 0);
+   unsigned top_bit = 31 - __builtin_clz(N);
+   for (int i = top_bit - 1; i >= 0; i--) {
+      /* We get our initial data in GPR0 and we write the final data out to
+       * GPR0 but we use GPR1 as our scratch register.
+       */
+      unsigned src_reg = i == top_bit - 1 ? MI_ALU_REG0 : MI_ALU_REG1;
+      unsigned dst_reg = i == 0 ? MI_ALU_REG0 : MI_ALU_REG1;
+
+      /* Shift the current value left by 1 */
+      append_alu(MI_ALU_LOAD, MI_ALU_SRCA, src_reg);
+      append_alu(MI_ALU_LOAD, MI_ALU_SRCB, src_reg);
+      append_alu(MI_ALU_ADD, 0, 0);
+
+      if (N & (1 << i)) {
+         /* Store ACCU to R1 and add R0 to R1 */
+         append_alu(MI_ALU_STORE, MI_ALU_REG1, MI_ALU_ACCU);
+         append_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
+         append_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
+         append_alu(MI_ALU_ADD, 0, 0);
+      }
+
+      append_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
+   }
+
+#undef append_alu
+}
+
+static void
+emit_mul_gpr0(struct anv_batch *batch, uint32_t N)
+{
+   uint32_t num_dwords;
+   build_alu_multiply_gpr0(NULL, &num_dwords, N);
+
+   uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords, GENX(MI_MATH));
+   build_alu_multiply_gpr0(dw + 1, &num_dwords, N);
+}
+
+#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
+
+static void
+load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
+                         struct anv_buffer *buffer, uint64_t offset,
+                         bool indexed)
+{
+   struct anv_batch *batch = &cmd_buffer->batch;
+   struct anv_bo *bo = buffer->bo;
+   uint32_t bo_offset = buffer->offset + offset;
+
+   emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
+
+   unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
+   if (view_count > 1) {
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+      emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4);
+      emit_mul_gpr0(batch, view_count);
+      emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0));
+#else
+      anv_finishme("Multiview + indirect draw requires MI_MATH\n"
+                   "MI_MATH is not supported on Ivy Bridge");
+      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+#endif
+   } else {
+      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+   }
+
+   emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
+
+   if (indexed) {
+      emit_lrm(batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
+      emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
+   } else {
+      emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
+      emit_lri(batch, GEN7_3DPRIM_BASE_VERTEX, 0);
+   }
+}
+
 void genX(CmdDrawIndirect)(
     VkCommandBuffer                             commandBuffer,
     VkBuffer                                    _buffer,
@@ -1741,29 +1920,30 @@ void genX(CmdDrawIndirect)(
    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;

    if (anv_batch_has_error(&cmd_buffer->batch))
       return;

    genX(cmd_buffer_flush_state)(cmd_buffer);

-   if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
-      emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
-   if (vs_prog_data->uses_drawid)
-      emit_draw_index(cmd_buffer, 0);
+   for (uint32_t i = 0; i < drawCount; i++) {
+      struct anv_bo *bo = buffer->bo;
+      uint32_t bo_offset = buffer->offset + offset;

-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
-   emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
+      if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
+         emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
+      if (vs_prog_data->uses_drawid)
+         emit_draw_index(cmd_buffer, i);

-   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
-      prim.IndirectParameterEnable = true;
-      prim.VertexAccessType = SEQUENTIAL;
-      prim.PrimitiveTopologyType = pipeline->topology;
+      load_indirect_parameters(cmd_buffer, buffer, offset, false);
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+         prim.IndirectParameterEnable = true;
+         prim.VertexAccessType = SEQUENTIAL;
+         prim.PrimitiveTopologyType = pipeline->topology;
+      }
+
+      offset += stride;
    }
 }

@@ -1778,30 +1958,31 @@ void genX(CmdDrawIndexedIndirect)(
    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;

    if (anv_batch_has_error(&cmd_buffer->batch))
       return;

    genX(cmd_buffer_flush_state)(cmd_buffer);

-   /* TODO: We need to stomp base vertex to 0 somehow */
-   if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
-      emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
-   if (vs_prog_data->uses_drawid)
-      emit_draw_index(cmd_buffer, 0);
+   for (uint32_t i = 0; i < drawCount; i++) {
+      struct anv_bo *bo = buffer->bo;
+      uint32_t bo_offset = buffer->offset + offset;

-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
+      /* TODO: We need to stomp base vertex to 0 somehow */
+      if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
+         emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
+      if (vs_prog_data->uses_drawid)
+         emit_draw_index(cmd_buffer, i);

-   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
-      prim.IndirectParameterEnable = true;
-      prim.VertexAccessType = RANDOM;
-      prim.PrimitiveTopologyType = pipeline->topology;
+      load_indirect_parameters(cmd_buffer, buffer, offset, true);
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+         prim.IndirectParameterEnable = true;
+         prim.VertexAccessType = RANDOM;
+         prim.PrimitiveTopologyType = pipeline->topology;
+      }
+
+      offset += stride;
    }
 }

@@ -1956,7 +2137,7 @@ void genX(CmdDispatch)(
       anv_state_flush(cmd_buffer->device, state);
       cmd_buffer->state.num_workgroups_offset = state.offset;
       cmd_buffer->state.num_workgroups_bo =
-         &cmd_buffer->device->dynamic_state_block_pool.bo;
+         &cmd_buffer->device->dynamic_state_pool.block_pool.bo;
    }

    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
@@ -2173,208 +2354,68 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
    }
 }

-static uint32_t
-depth_stencil_surface_type(enum isl_surf_dim dim)
-{
-   switch (dim) {
-   case ISL_SURF_DIM_1D:
-      if (GEN_GEN >= 9) {
-         /* From the Sky Lake PRM, 3DSTATAE_DEPTH_BUFFER::SurfaceType
-          *
-          *    Programming Notes:
-          *    The Surface Type of the depth buffer must be the same as the
-          *    Surface Type of the render target(s) (defined in
-          *    SURFACE_STATE), unless either the depth buffer or render
-          *    targets are SURFTYPE_NULL (see exception below for SKL). 1D
-          *    surface type not allowed for depth surface and stencil surface.
-          *
-          *    Workaround:
-          *    If depth/stencil is enabled with 1D render target,
-          *    depth/stencil surface type needs to be set to 2D surface type
-          *    and height set to 1. Depth will use (legacy) TileY and stencil
-          *    will use TileW. For this case only, the Surface Type of the
-          *    depth buffer can be 2D while the Surface Type of the render
-          *    target(s) are 1D, representing an exception to a programming
-          *    note above.
-          */
-         return SURFTYPE_2D;
-      } else {
-         return SURFTYPE_1D;
-      }
-   case ISL_SURF_DIM_2D:
-      return SURFTYPE_2D;
-   case ISL_SURF_DIM_3D:
-      if (GEN_GEN >= 9) {
-         /* The Sky Lake docs list the value for 3D as "Reserved". However,
-          * they have the exact same layout as 2D arrays on gen9+, so we can
-          * just use 2D here.
-          */
-         return SURFTYPE_2D;
-      } else {
-         return SURFTYPE_3D;
-      }
-   default:
-      unreachable("Invalid surface dimension");
-   }
-}
-
 static void
 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_device *device = cmd_buffer->device;
-   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
    const struct anv_image_view *iview =
       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
    const struct anv_image *image = iview ? iview->image : NULL;
-   const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
-   const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment.attachment;
-   const bool has_hiz = image != NULL &&
-      cmd_buffer->state.attachments[ds].aux_usage == ISL_AUX_USAGE_HIZ;
-   const bool has_stencil =
-      image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
-
-   cmd_buffer->state.hiz_enabled = has_hiz;

    /* FIXME: Width and Height are wrong */

    genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);

-   /* Emit 3DSTATE_DEPTH_BUFFER */
-   if (has_depth) {
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
-         db.SurfaceType =
-            depth_stencil_surface_type(image->depth_surface.isl.dim);
-         db.DepthWriteEnable = true;
-         db.StencilWriteEnable = has_stencil;
-         db.HierarchicalDepthBufferEnable = has_hiz;
-
-         db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
-                                                      &image->depth_surface.isl);
-
-         db.SurfaceBaseAddress = (struct anv_address) {
-            .bo = image->bo,
-            .offset = image->offset + image->depth_surface.offset,
-         };
-         db.DepthBufferObjectControlState = GENX(MOCS);
+   uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
+                                        device->isl_dev.ds.size / 4);
+   if (dw == NULL)
+      return;

-         db.SurfacePitch = image->depth_surface.isl.row_pitch - 1;
-         db.Height = image->extent.height - 1;
-         db.Width = image->extent.width - 1;
-         db.LOD = iview->isl.base_level;
-         db.MinimumArrayElement = iview->isl.base_array_layer;
+   struct isl_depth_stencil_hiz_emit_info info = {
+      .mocs = device->default_mocs,
+   };

-         assert(image->depth_surface.isl.dim != ISL_SURF_DIM_3D);
-         db.Depth =
-         db.RenderTargetViewExtent = iview->isl.array_len - 1;
+   if (iview)
+      info.view = &iview->isl;

-#if GEN_GEN >= 8
-         db.SurfaceQPitch =
-            isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2;
-#endif
-      }
-   } else {
-      /* Even when no depth buffer is present, the hardware requires that
-       * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
-       *
-       *    If a null depth buffer is bound, the driver must instead bind depth as:
-       *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
-       *       3DSTATE_DEPTH.Width = 1
-       *       3DSTATE_DEPTH.Height = 1
-       *       3DSTATE_DEPTH.SuraceFormat = D16_UNORM
-       *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
-       *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
-       *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
-       *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
-       *
-       * The PRM is wrong, though. The width and height must be programmed to
-       * actual framebuffer's width and height, even when neither depth buffer
-       * nor stencil buffer is present. Also, D16_UNORM is not allowed to
-       * be combined with a stencil buffer so we use D32_FLOAT instead.
-       */
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
-         if (has_stencil) {
-            db.SurfaceType =
-               depth_stencil_surface_type(image->stencil_surface.isl.dim);
-         } else {
-            db.SurfaceType = SURFTYPE_2D;
-         }
-         db.SurfaceFormat = D32_FLOAT;
-         db.Width = MAX2(fb->width, 1) - 1;
-         db.Height = MAX2(fb->height, 1) - 1;
-         db.StencilWriteEnable = has_stencil;
-      }
-   }
+   if (image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
+      info.depth_surf = &image->depth_surface.isl;

-   if (has_hiz) {
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb) {
-         hdb.HierarchicalDepthBufferObjectControlState = GENX(MOCS);
-         hdb.SurfacePitch = image->aux_surface.isl.row_pitch - 1;
-         hdb.SurfaceBaseAddress = (struct anv_address) {
-            .bo = image->bo,
-            .offset = image->offset + image->aux_surface.offset,
-         };
+      info.depth_address =
+         anv_batch_emit_reloc(&cmd_buffer->batch,
+                              dw + device->isl_dev.ds.depth_offset / 4,
+                              image->bo,
+                              image->offset + image->depth_surface.offset);

-#if GEN_GEN >= 8
-         /* From the SKL PRM Vol2a:
-          *
-          *    The interpretation of this field is dependent on Surface Type
-          *    as follows:
-          *    - SURFTYPE_1D: distance in pixels between array slices
-          *    - SURFTYPE_2D/CUBE: distance in rows between array slices
-          *    - SURFTYPE_3D: distance in rows between R - slices
-          *
-          * Unfortunately, the docs aren't 100% accurate here. They fail to
-          * mention that the 1-D rule only applies to linear 1-D images.
-          * Since depth and HiZ buffers are always tiled, they are treated as
-          * 2-D images. Prior to Sky Lake, this field is always in rows.
-          */
-         hdb.SurfaceQPitch =
-            isl_surf_get_array_pitch_sa_rows(&image->aux_surface.isl) >> 2;
-#endif
-      }
-   } else {
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb);
-   }
+      const uint32_t ds =
+         cmd_buffer->state.subpass->depth_stencil_attachment.attachment;
+      info.hiz_usage = cmd_buffer->state.attachments[ds].aux_usage;
+      if (info.hiz_usage == ISL_AUX_USAGE_HIZ) {
+         info.hiz_surf = &image->aux_surface.isl;

-   /* Emit 3DSTATE_STENCIL_BUFFER */
-   if (has_stencil) {
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
-#if GEN_GEN >= 8 || GEN_IS_HASWELL
-         sb.StencilBufferEnable = true;
-#endif
-         sb.StencilBufferObjectControlState = GENX(MOCS);
+         info.hiz_address =
+            anv_batch_emit_reloc(&cmd_buffer->batch,
+                                 dw + device->isl_dev.ds.hiz_offset / 4,
+                                 image->bo,
+                                 image->offset + image->aux_surface.offset);

-         sb.SurfacePitch = image->stencil_surface.isl.row_pitch - 1;
+         info.depth_clear_value = ANV_HZ_FC_VAL;
+      }
+   }

-#if GEN_GEN >= 8
-         sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2;
-#endif
-         sb.SurfaceBaseAddress = (struct anv_address) {
-            .bo = image->bo,
-            .offset = image->offset + image->stencil_surface.offset,
-         };
+   if (image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
+      info.stencil_surf = &image->stencil_surface.isl;

+      info.stencil_address =
+         anv_batch_emit_reloc(&cmd_buffer->batch,
+                              dw + device->isl_dev.ds.stencil_offset / 4,
+                              image->bo,
+                              image->offset + image->stencil_surface.offset);
    }
-   } else {
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
-   }
+
+   isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
+
+   cmd_buffer->state.hiz_enabled = info.hiz_usage == ISL_AUX_USAGE_HIZ;
 }

-   /* From the IVB PRM Vol2P1, 11.5.5.4 3DSTATE_CLEAR_PARAMS:
-    *
-    *    3DSTATE_CLEAR_PARAMS must always be programmed in the along with
-    *    the other Depth/Stencil state commands(i.e. 3DSTATE_DEPTH_BUFFER,
-    *    3DSTATE_STENCIL_BUFFER, or 3DSTATE_HIER_DEPTH_BUFFER)
-    *
-    * Testing also shows that some variant of this restriction may exist HSW+.
-    * On BDW+, it is not possible to emit 2 of these packets consecutively when
-    * both have DepthClearValueValid set. An analysis of such state programming
-    * on SKL showed that the GPU doesn't register the latter packet's clear
-    * value.
-    */
-   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp) {
-      if (has_hiz) {
-         cp.DepthClearValueValid = true;
-         cp.DepthClearValue = ANV_HZ_FC_VAL;
-      }
-   }
 }

@@ -2446,8 +2487,9 @@ cmd_buffer_subpass_transition_layouts(struct anv_cmd_buffer * const cmd_buffer,
     */
    assert(att_ref->attachment < cmd_state->framebuffer->attachment_count);

-   const struct anv_image * const image =
-      cmd_state->framebuffer->attachments[att_ref->attachment]->image;
+   const struct anv_image_view * const iview =
+      cmd_state->framebuffer->attachments[att_ref->attachment];
+   const struct anv_image * const image = iview->image;

    /* Perform the layout transition. */
    if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
@@ -2456,6 +2498,12 @@ cmd_buffer_subpass_transition_layouts(struct anv_cmd_buffer * const cmd_buffer,
       att_state->aux_usage =
          anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
                                  image->aspects, target_layout);
+   } else if (image->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
+      transition_color_buffer(cmd_buffer, image,
+                              iview->isl.base_level, 1,
+                              iview->isl.base_array_layer,
+                              iview->isl.array_len,
+                              att_state->current_layout, target_layout);
    }

    att_state->current_layout = target_layout;
@@ -2470,6 +2518,16 @@ genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,

    cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;

+   /* Our implementation of VK_KHR_multiview uses instancing to draw the
+    * different views. If the client asks for instancing, we need to use the
+    * Instance Data Step Rate to ensure that we repeat the client's
+    * per-instance data once for each view. Since this bit is in
+    * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers at the top
+    * of each subpass.
+    */
+   if (GEN_GEN == 7)
+      cmd_buffer->state.vb_dirty |= ~0;
+
    /* Perform transitions to the subpass layout before any writes have
     * occurred.
     */
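
The build_alu_multiply_gpr0() helper in this patch unrolls a most-significant-bit-first shift-and-add multiply into MI_MATH LOAD/ADD/STORE instructions. Below is a minimal host-side C model of the emitted ALU program — an illustration, not driver code; model_mul_gpr0 is a made-up name — so the bit-twiddling can be checked against an ordinary multiply:

#include <assert.h>
#include <stdint.h>

/* Host-side model of the ALU program emitted by build_alu_multiply_gpr0():
 * scan N's bits from the highest set bit down, doubling the running value
 * each step (LOAD SRCA/SRCB, ADD) and re-adding the original R0 whenever
 * the current bit is set. R1 is the scratch register; the final STORE
 * targets R0. uint64_t mirrors the 64-bit CS GPRs, including wraparound.
 */
static uint64_t
model_mul_gpr0(uint64_t r0, uint32_t n)
{
   assert(n > 0);
   uint64_t r1 = 0;
   unsigned top_bit = 31 - __builtin_clz(n);
   for (int i = (int)top_bit - 1; i >= 0; i--) {
      /* First iteration reads the multiplicand from R0; later iterations
       * read the running product from R1. */
      uint64_t src = (i == (int)top_bit - 1) ? r0 : r1;

      /* "Shift left by 1" is done by adding the value to itself. */
      uint64_t accu = src + src;

      if (n & (1u << i)) {
         /* STORE R1, ACCU then reload and ADD R0: brings in the set bit. */
         r1 = accu;
         accu = r0 + r1;
      }

      /* STORE to the scratch register, or to R0 on the final iteration. */
      if (i == 0)
         r0 = accu;
      else
         r1 = accu;
   }
   return r0;
}

int main(void)
{
   /* Exhaustive-ish self-check of the model against plain multiplication. */
   for (uint32_t n = 1; n < 2048; n++)
      for (uint64_t v = 0; v < 64; v += 7)
         assert(model_mul_gpr0(v, n) == v * n);
   return 0;
}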
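For reference, the bo_offset + 0/4/8/12/16 loads in load_indirect_parameters() index straight into the indirect command layouts defined by the Vulkan spec. The struct sketches below (renamed so they don't clash with the real VkDrawIndirectCommand and VkDrawIndexedIndirectCommand in vulkan.h) annotate which dword feeds which 3DPRIM register:

#include <stdint.h>

struct draw_indirect_cmd {                /* mirrors VkDrawIndirectCommand */
   uint32_t vertexCount;    /* bo_offset + 0  -> GEN7_3DPRIM_VERTEX_COUNT   */
   uint32_t instanceCount;  /* bo_offset + 4  -> GEN7_3DPRIM_INSTANCE_COUNT */
   uint32_t firstVertex;    /* bo_offset + 8  -> GEN7_3DPRIM_START_VERTEX   */
   uint32_t firstInstance;  /* bo_offset + 12 -> GEN7_3DPRIM_START_INSTANCE */
};
/* No base-vertex field exists in the non-indexed command, which is why the
 * !indexed path LRIs GEN7_3DPRIM_BASE_VERTEX to 0. */

struct draw_indexed_indirect_cmd {  /* mirrors VkDrawIndexedIndirectCommand */
   uint32_t indexCount;     /* bo_offset + 0  -> GEN7_3DPRIM_VERTEX_COUNT   */
   uint32_t instanceCount;  /* bo_offset + 4  -> GEN7_3DPRIM_INSTANCE_COUNT */
   uint32_t firstIndex;     /* bo_offset + 8  -> GEN7_3DPRIM_START_VERTEX   */
   int32_t  vertexOffset;   /* bo_offset + 12 -> GEN7_3DPRIM_BASE_VERTEX    */
   uint32_t firstInstance;  /* bo_offset + 16 -> GEN7_3DPRIM_START_INSTANCE */
};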
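The multiview-by-instancing scheme used in genX(CmdDraw) and the gen7 InstanceDataStepRate setting can be illustrated with hypothetical numbers. The instance-ID decomposition printed here is an assumption about how views are unpacked (that happens in the shader, outside this patch); only the instanceCount multiply and the step rate are taken from the diff:

#include <stdio.h>

int main(void)
{
   /* Hypothetical numbers: a subpass with 2 views and a client call of
    * vkCmdDraw(..., instanceCount = 3, ...). The driver multiplies
    * instanceCount by the view count, and InstanceDataStepRate = 2 makes
    * per-instance vertex data advance only every 2 hardware instances,
    * so each client instance is replayed once per view.
    */
   const unsigned view_count = 2;       /* anv_subpass_view_count() */
   const unsigned client_instances = 3; /* what the application asked for */

   for (unsigned hw = 0; hw < view_count * client_instances; hw++) {
      /* One plausible decomposition of the hardware instance ID; the
       * exact split is an assumption for illustration. */
      printf("hw instance %u -> client instance %u, view %u\n",
             hw, hw / view_count, hw % view_count);
   }
   return 0;
}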