From cf54785239d3c53eb7baf89e914d7ef3e95ce11e Mon Sep 17 00:00:00 2001
From: Caio Marcelo de Oliveira Filho
Date: Tue, 27 Mar 2018 10:10:34 -0700
Subject: [PATCH] anv/gen12: Lower VK_KHR_multiview using Primitive Replication

Identify whether view_index is used only for position calculation, and if so
use Primitive Replication to implement Multiview in Gen12. This feature
allows storing per-view position information in a single execution of the
shader, treating position as an array.

The shader is transformed by adding a for-loop around it that has one
iteration per active view (in the view_mask). Stores to the position now
store into the position array for the current index in the loop, and
load_view_index() returns the view index corresponding to the current index
in the loop.

The feature is controlled by the environment variable
ANV_PRIMITIVE_REPLICATION_MAX_VIEWS, which defaults to 2 if unset. For
pipelines with view counts larger than that, regular instancing is used
instead of Primitive Replication. To disable the feature completely, set the
variable to 0.

v2: Don't assume position is set in the vertex shader; remove only the stores
    for position; don't apply optimizations since other passes will do them;
    clone the shader body without extract/reinsert; don't use last_block
    (potentially stale). (Jason)
    Fix the view_index immediate to contain the view index, not its order.
    Check for the maximum number of views supported.
    Add a guard for gen12.

v3: Clone the entire shader function and change it before reinsert; disable
    the optimization when the shader has memory writes. (Jason)
    Use a single environment variable with _DEBUG in the name.

v4: Change to use the new nir_deref_instr. When removing stores, look for
    mode nir_var_shader_out instead of walking the list of outputs.
    Ensure unused derefs are removed in the non-position part of the shader.
    Remove dead control flow when identifying whether primitive replication
    can be used.

v5: Consider all the active shaders (including fragment) when deciding that
    Primitive Replication can be used.
    Change the environment variable to ANV_PRIMITIVE_REPLICATION.
    Squash the emission of 3DSTATE_PRIMITIVE_REPLICATION into this patch.
    Disable Prim Rep in blorp_exec_3d.

v6: Use a loop around the shader instead of manually unrolling, since the
    regular unroll pass will kick in.
    Document that we don't expect to see copy_deref or load_deref involving
    the position variable.
    Recover the use_primitive_replication value when loading a pipeline from
    the cache.
    Set VARYING_SLOT_LAYER to 0 in the shader. Earlier versions were relying
    on ForceZeroRTAIndexEnable, but that might not be sufficient.
    Disable Prim Rep in cmd_buffer_so_memcpy.

v7: Don't use Primitive Replication if position is not set; fall back to
    instancing. Change the environment variable to
    ANV_PRIMITIVE_REPLICATION_MAX_VIEWS and default it to 2 based on
    experiments.
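As an illustration (a rough sketch, not the literal NIR produced by the pass;
the active_views name is only for exposition), a vertex shader whose body
writes gl_Position is transformed into something like:

    gl_Layer = 0;
    for (int i = 0; i < view_count; i++) {
       int view_index = active_views[i];  /* view_mask may have gaps */
       /* ... original shader body, with loads of gl_ViewIndex replaced by
        *     view_index and the store to gl_Position redirected to
        *     gl_Position[i] ... */
    }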
Reviewed-by: Rafael Antognolli Tested-by: Marge Bot Part-of: --- src/intel/blorp/blorp_genX_exec.h | 5 + src/intel/vulkan/anv_nir.h | 6 +- src/intel/vulkan/anv_nir_lower_multiview.c | 368 ++++++++++++++++++++- src/intel/vulkan/anv_pipeline.c | 58 +++- src/intel/vulkan/anv_private.h | 6 + src/intel/vulkan/genX_cmd_buffer.c | 9 +- src/intel/vulkan/genX_gpu_memcpy.c | 5 + src/intel/vulkan/genX_pipeline.c | 30 ++ 8 files changed, 471 insertions(+), 16 deletions(-) diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h index b3adc6ad41e..6e74683f4e0 100644 --- a/src/intel/blorp/blorp_genX_exec.h +++ b/src/intel/blorp/blorp_genX_exec.h @@ -1350,6 +1350,11 @@ blorp_emit_pipeline(struct blorp_batch *batch, blorp_emit_ps_config(batch, params); blorp_emit_cc_viewport(batch); + +#if GEN_GEN >= 12 + /* Disable Primitive Replication. */ + blorp_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); +#endif } /******** This is the end of the pipeline setup code ********/ diff --git a/src/intel/vulkan/anv_nir.h b/src/intel/vulkan/anv_nir.h index a3c5838cd50..9095f2d58d3 100644 --- a/src/intel/vulkan/anv_nir.h +++ b/src/intel/vulkan/anv_nir.h @@ -31,7 +31,11 @@ extern "C" { #endif -bool anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask); +bool anv_check_for_primitive_replication(nir_shader **shaders, + struct anv_graphics_pipeline *pipeline); + +bool anv_nir_lower_multiview(nir_shader *shader, + struct anv_graphics_pipeline *pipeline); bool anv_nir_lower_ycbcr_textures(nir_shader *shader, const struct anv_pipeline_layout *layout); diff --git a/src/intel/vulkan/anv_nir_lower_multiview.c b/src/intel/vulkan/anv_nir_lower_multiview.c index ffe9e7bb972..d239074c61b 100644 --- a/src/intel/vulkan/anv_nir_lower_multiview.c +++ b/src/intel/vulkan/anv_nir_lower_multiview.c @@ -23,13 +23,18 @@ #include "anv_nir.h" #include "nir/nir_builder.h" +#include "util/debug.h" /** - * This file implements the lowering required for VK_KHR_multiview. We - * implement multiview using instanced rendering. The number of instances in - * each draw call is multiplied by the number of views in the subpass. Then, - * in the shader, we divide gl_InstanceId by the number of views and use - * gl_InstanceId % view_count to compute the actual ViewIndex. + * This file implements the lowering required for VK_KHR_multiview. + * + * When possible, Primitive Replication is used and the shader is modified to + * make gl_Position an array and fill it with values for each view. + * + * Otherwise we implement multiview using instanced rendering. The number of + * instances in each draw call is multiplied by the number of views in the + * subpass. Then, in the shader, we divide gl_InstanceId by the number of + * views and use gl_InstanceId % view_count to compute the actual ViewIndex. */ struct lower_multiview_state { @@ -145,15 +150,180 @@ build_view_index(struct lower_multiview_state *state) return state->view_index; } +/* Primitive Replication allows a shader to write different positions for each + * view in the same execution. If only the position depends on the view, then + * it is possible to use the feature instead of instancing to implement + * multiview. 
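+ *
+ * The pass below wraps the original shader body in a loop with one
+ * iteration per active view, turning the gl_Position output into a
+ * per-view array indexed by the loop counter.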
+ */ +static bool +lower_multiview_with_primitive_replication(nir_shader *shader, + struct anv_graphics_pipeline *pipeline) +{ + if (shader->info.stage == MESA_SHADER_FRAGMENT) + return false; + + assert(shader->info.stage == MESA_SHADER_VERTEX); + + uint32_t view_mask = pipeline->subpass->view_mask; + int view_count = util_bitcount(view_mask); + assert(view_count > 1 && view_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION); + + nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader); + + /* Update position to refer to an array. */ + nir_variable *pos_var = NULL; + nir_foreach_variable(var, &shader->outputs) { + if (var->data.location == VARYING_SLOT_POS) { + assert(var->type == glsl_vec4_type()); + var->type = glsl_array_type(glsl_vec4_type(), view_count, 0); + var->data.per_view = true; + pos_var = var; + break; + } + } + + assert(pos_var); + + nir_cf_list body; + nir_cf_list_extract(&body, &entrypoint->body); + + nir_builder b; + nir_builder_init(&b, entrypoint); + b.cursor = nir_after_cf_list(&entrypoint->body); + + /* Fill Layer ID with zero. Replication will use that as base to apply the + * RTAI offsets. + */ + nir_variable *layer_id_out = + nir_variable_create(shader, nir_var_shader_out, + glsl_int_type(), "layer ID"); + layer_id_out->data.location = VARYING_SLOT_LAYER; + nir_store_var(&b, layer_id_out, nir_imm_zero(&b, 1, 32), 0x1); + + /* Loop Index will go from 0 to view_count. */ + nir_variable *loop_index_var = + nir_local_variable_create(entrypoint, glsl_uint_type(), "loop_index"); + nir_deref_instr *loop_index_deref = nir_build_deref_var(&b, loop_index_var); + nir_store_deref(&b, loop_index_deref, nir_imm_int(&b, 0), 1); + + /* Array of view index values that are active in the loop. Note that the + * loop index only matches the view index if there are no gaps in the + * view_mask. + */ + nir_variable *view_index_var = nir_local_variable_create( + entrypoint, glsl_array_type(glsl_uint_type(), view_count, 0), "view_index"); + nir_deref_instr *view_index_deref = nir_build_deref_var(&b, view_index_var); + { + int array_position = 0; + uint32_t view_index; + for_each_bit(view_index, view_mask) { + nir_store_deref(&b, nir_build_deref_array_imm(&b, view_index_deref, array_position), + nir_imm_int(&b, view_index), 1); + array_position++; + } + } + + /* Create the equivalent of + * + * while (true): + * if (loop_index >= view_count): + * break + * + * view_index = active_indices[loop_index] + * pos_deref = &pos[loop_index] + * + * # Placeholder for the body to be reinserted. + * + * loop_index += 1 + * + * Later both `view_index` and `pos_deref` will be used to rewrite the + * original shader body. + */ + + nir_loop* loop = nir_push_loop(&b); + + nir_ssa_def *loop_index = nir_load_deref(&b, loop_index_deref); + nir_ssa_def *cmp = nir_ige(&b, loop_index, nir_imm_int(&b, view_count)); + nir_if *loop_check = nir_push_if(&b, cmp); + nir_jump(&b, nir_jump_break); + nir_pop_if(&b, loop_check); + + nir_ssa_def *view_index = + nir_load_deref(&b, nir_build_deref_array(&b, view_index_deref, loop_index)); + nir_deref_instr *pos_deref = + nir_build_deref_array(&b, nir_build_deref_var(&b, pos_var), loop_index); + + nir_store_deref(&b, loop_index_deref, nir_iadd_imm(&b, loop_index, 1), 1); + nir_pop_loop(&b, loop); + + /* Reinsert the body. 
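+ * It lands right after pos_deref, inside the loop and before the index
+ * increment, so the original instructions now run once per active view.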
*/ + b.cursor = nir_after_instr(&pos_deref->instr); + nir_cf_reinsert(&body, b.cursor); + + nir_foreach_block(block, entrypoint) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + switch (intrin->intrinsic) { + case nir_intrinsic_load_view_index: { + assert(intrin->dest.is_ssa); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(view_index)); + break; + } + + case nir_intrinsic_store_deref: { + nir_variable *var = nir_intrinsic_get_var(intrin, 0); + if (var == pos_var) { + nir_deref_instr *old_deref = nir_src_as_deref(intrin->src[0]); + + nir_instr_rewrite_src(instr, &intrin->src[0], + nir_src_for_ssa(&pos_deref->dest.ssa)); + + /* Remove old deref since it has the wrong type. */ + nir_deref_instr_remove_if_unused(old_deref); + } + break; + } + + case nir_intrinsic_load_deref: + if (nir_intrinsic_get_var(intrin, 0) == pos_var) { + unreachable("Should have lowered I/O to temporaries " + "so no load_deref on position output is expected."); + } + break; + + case nir_intrinsic_copy_deref: + unreachable("Should have lowered copy_derefs at this point"); + break; + + default: + /* Do nothing. */ + break; + } + } + } + + nir_metadata_preserve(entrypoint, nir_metadata_none); + return true; +} + bool -anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask) +anv_nir_lower_multiview(nir_shader *shader, + struct anv_graphics_pipeline *pipeline) { assert(shader->info.stage != MESA_SHADER_COMPUTE); + uint32_t view_mask = pipeline->subpass->view_mask; /* If multiview isn't enabled, we have nothing to do. */ if (view_mask == 0) return false; + if (pipeline->use_primitive_replication) + return lower_multiview_with_primitive_replication(shader, pipeline); + struct lower_multiview_state state = { .view_mask = view_mask, }; @@ -230,3 +400,189 @@ anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask) return progress; } + +static bool +shader_writes_to_memory(nir_shader *shader) +{ + /* With multiview, we would need to ensure that memory writes happen either + * once or once per view. Since combination of multiview and memory writes + * is not expected, we'll just skip this optimization in this case. 
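+ * Returning true here makes anv_check_for_primitive_replication() reject
+ * the shader, so the regular instancing path is used instead.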
+ */ + + nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader); + + nir_foreach_block(block, entrypoint) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + switch (intrin->intrinsic) { + case nir_intrinsic_deref_atomic_add: + case nir_intrinsic_deref_atomic_imin: + case nir_intrinsic_deref_atomic_umin: + case nir_intrinsic_deref_atomic_imax: + case nir_intrinsic_deref_atomic_umax: + case nir_intrinsic_deref_atomic_and: + case nir_intrinsic_deref_atomic_or: + case nir_intrinsic_deref_atomic_xor: + case nir_intrinsic_deref_atomic_exchange: + case nir_intrinsic_deref_atomic_comp_swap: + case nir_intrinsic_store_ssbo: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_store_shared: + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + return true; + + default: + /* Keep walking. */ + break; + } + } + } + + return false; +} + +static bool +shader_uses_view_index(nir_shader *shader) +{ + nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader); + + nir_foreach_block(block, entrypoint) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic == nir_intrinsic_load_view_index) + return true; + } + } + + return false; +} + +static bool +shader_only_position_uses_view_index(nir_shader *shader) +{ + nir_shader *shader_no_position = nir_shader_clone(NULL, shader); + nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader_no_position); + + /* Remove the store position from a cloned shader. */ + nir_foreach_block(block, entrypoint) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *store = nir_instr_as_intrinsic(instr); + if (store->intrinsic != nir_intrinsic_store_deref) + continue; + + nir_variable *var = nir_intrinsic_get_var(store, 0); + if (var->data.location != VARYING_SLOT_POS) + continue; + + nir_instr_remove(&store->instr); + } + } + + /* Clean up shader so unused load_view_index intrinsics are removed. 
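+ * If view_index only fed the position stores removed above, these passes
+ * turn it into dead code and the check below finds no remaining uses.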
*/ + bool progress; + do { + progress = false; + progress |= nir_opt_dead_cf(shader_no_position); + + /* Peephole select will drop if-blocks that have then and else empty, + * which will remove the usage of an SSA in the condition. + */ + progress |= nir_opt_peephole_select(shader_no_position, 0, false, false); + + progress |= nir_opt_dce(shader_no_position); + } while (progress); + + bool uses_view_index = shader_uses_view_index(shader_no_position); + + ralloc_free(shader_no_position); + return !uses_view_index; +} + +bool +anv_check_for_primitive_replication(nir_shader **shaders, + struct anv_graphics_pipeline *pipeline) +{ + assert(pipeline->base.device->info.gen >= 12); + + static int primitive_replication_max_views = -1; + if (primitive_replication_max_views < 0) { + /* TODO: Figure out why we are not getting same benefits for larger than + * 2 views. For now use Primitive Replication just for the 2-view case + * by default. + */ + const unsigned default_max_views = 2; + + primitive_replication_max_views = + MIN2(MAX_VIEWS_FOR_PRIMITIVE_REPLICATION, + env_var_as_unsigned("ANV_PRIMITIVE_REPLICATION_MAX_VIEWS", + default_max_views)); + } + + /* TODO: We should be able to support replication at 'geometry' stages + * later than Vertex. In that case only the last stage can refer to + * gl_ViewIndex. + */ + if (pipeline->active_stages != (VK_SHADER_STAGE_VERTEX_BIT | + VK_SHADER_STAGE_FRAGMENT_BIT)) { + return false; + } + + uint32_t view_mask = pipeline->subpass->view_mask; + int view_count = util_bitcount(view_mask); + if (view_count == 1 || view_count > primitive_replication_max_views) + return false; + + bool vs_writes_position = false; + nir_foreach_variable(var, &shaders[MESA_SHADER_VERTEX]->outputs) { + if (var->data.location == VARYING_SLOT_POS) { + vs_writes_position = true; + break; + } + } + + /* Don't bother handling this edge case with Primitive Replication. */ + if (!vs_writes_position) + return false; + + return !shader_uses_view_index(shaders[MESA_SHADER_FRAGMENT]) && + !shader_writes_to_memory(shaders[MESA_SHADER_VERTEX]) && + shader_only_position_uses_view_index(shaders[MESA_SHADER_VERTEX]); +} diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index c1d0e393a96..9ccf638ed40 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -687,7 +687,7 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline, if (pipeline->type == ANV_PIPELINE_GRAPHICS) { NIR_PASS_V(nir, anv_nir_lower_multiview, - anv_pipeline_to_graphics(pipeline)->subpass->view_mask); + anv_pipeline_to_graphics(pipeline)); } nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); @@ -735,16 +735,23 @@ anv_pipeline_link_vs(const struct brw_compiler *compiler, static void anv_pipeline_compile_vs(const struct brw_compiler *compiler, void *mem_ctx, - struct anv_device *device, + struct anv_graphics_pipeline *pipeline, struct anv_pipeline_stage *vs_stage) { + /* When using Primitive Replication for multiview, each view gets its own + * position slot. + */ + uint32_t pos_slots = pipeline->use_primitive_replication ? 
+ anv_subpass_view_count(pipeline->subpass) : 1; + brw_compute_vue_map(compiler->devinfo, &vs_stage->prog_data.vs.base.vue_map, vs_stage->nir->info.outputs_written, - vs_stage->nir->info.separate_shader, 1); + vs_stage->nir->info.separate_shader, + pos_slots); vs_stage->num_stats = 1; - vs_stage->code = brw_compile_vs(compiler, device, mem_ctx, + vs_stage->code = brw_compile_vs(compiler, pipeline->base.device, mem_ctx, &vs_stage->key.vs, &vs_stage->prog_data.vs, vs_stage->nir, -1, @@ -1167,6 +1174,27 @@ anv_pipeline_add_executables(struct anv_pipeline *pipeline, } } +static void +anv_pipeline_init_from_cached_graphics(struct anv_graphics_pipeline *pipeline) +{ + /* TODO: Cache this pipeline-wide information. */ + + /* Primitive replication depends on information from all the shaders. + * Recover this bit from the fact that we have more than one position slot + * in the vertex shader when using it. + */ + assert(pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT); + int pos_slots = 0; + const struct brw_vue_prog_data *vue_prog_data = + (const void *) pipeline->shaders[MESA_SHADER_VERTEX]->prog_data; + const struct brw_vue_map *vue_map = &vue_prog_data->vue_map; + for (int i = 0; i < vue_map->num_slots; i++) { + if (vue_map->slot_to_varying[i] == VARYING_SLOT_POS) + pos_slots++; + } + pipeline->use_primitive_replication = pos_slots > 1; +} + static VkResult anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline, struct anv_pipeline_cache *cache, @@ -1295,6 +1323,7 @@ anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline, anv_pipeline_add_executables(&pipeline->base, &stages[s], pipeline->shaders[s]); } + anv_pipeline_init_from_cached_graphics(pipeline); goto done; } else if (found > 0) { /* We found some but not all of our shaders. This shouldn't happen @@ -1383,6 +1412,23 @@ anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline, next_stage = &stages[s]; } + if (pipeline->base.device->info.gen >= 12 && + pipeline->subpass->view_mask != 0) { + /* For some pipelines HW Primitive Replication can be used instead of + * instancing to implement Multiview. This depend on how viewIndex is + * used in all the active shaders, so this check can't be done per + * individual shaders. + */ + nir_shader *shaders[MESA_SHADER_STAGES] = {}; + for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) + shaders[s] = stages[s].nir; + + pipeline->use_primitive_replication = + anv_check_for_primitive_replication(shaders, pipeline); + } else { + pipeline->use_primitive_replication = false; + } + struct anv_pipeline_stage *prev_stage = NULL; for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) { if (!stages[s].entrypoint) @@ -1402,7 +1448,7 @@ anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline, switch (s) { case MESA_SHADER_VERTEX: - anv_pipeline_compile_vs(compiler, stage_ctx, pipeline->base.device, + anv_pipeline_compile_vs(compiler, stage_ctx, pipeline, &stages[s]); break; case MESA_SHADER_TESS_CTRL: @@ -2015,7 +2061,7 @@ anv_pipeline_init(struct anv_graphics_pipeline *pipeline, * the instance divisor by the number of views ensure that we repeat the * client's per-instance data once for each view. 
*/ - if (pipeline->subpass->view_mask) { + if (pipeline->subpass->view_mask && !pipeline->use_primitive_replication) { const uint32_t view_count = anv_subpass_view_count(pipeline->subpass); for (uint32_t vb = 0; vb < MAX_VBS; vb++) { if (pipeline->vb[vb].instanced) diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index d7134101c20..51f1ae823fc 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -174,6 +174,7 @@ struct gen_perf_config; #define MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS 32 #define ANV_UBO_BOUNDS_CHECK_ALIGNMENT 32 #define ANV_SSBO_BOUNDS_CHECK_ALIGNMENT 4 +#define MAX_VIEWS_FOR_PRIMITIVE_REPLICATION 16 /* From the Skylake PRM Vol. 7 "Binding Table Surface State Model": * @@ -3207,6 +3208,11 @@ struct anv_graphics_pipeline { bool kill_pixel; bool depth_bounds_test_enable; + /* When primitive replication is used, subpass->view_mask will describe what + * views to replicate. + */ + bool use_primitive_replication; + struct anv_state blend_state; uint32_t vb_used; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 2c5a448aff3..7af1da0f5e4 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -3516,7 +3516,8 @@ void genX(CmdDraw)( /* Our implementation of VK_KHR_multiview uses instancing to draw the * different views. We need to multiply instanceCount by the view count. */ - instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); + if (!pipeline->use_primitive_replication) + instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; @@ -3566,7 +3567,8 @@ void genX(CmdDrawIndexed)( /* Our implementation of VK_KHR_multiview uses instancing to draw the * different views. We need to multiply instanceCount by the view count. */ - instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); + if (!pipeline->use_primitive_replication) + instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; @@ -3627,7 +3629,8 @@ void genX(CmdDrawIndirectByteCountEXT)( /* Our implementation of VK_KHR_multiview uses instancing to draw the * different views. We need to multiply instanceCount by the view count. */ - instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); + if (!pipeline->use_primitive_replication) + instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass); struct gen_mi_builder b; gen_mi_builder_init(&b, &cmd_buffer->batch); diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c index 50d0894b93c..504c70f3ef3 100644 --- a/src/intel/vulkan/genX_gpu_memcpy.c +++ b/src/intel/vulkan/genX_gpu_memcpy.c @@ -220,6 +220,11 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, vf.StatisticsEnable = false; } +#if GEN_GEN >= 12 + /* Disable Primitive Replication. 
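+ * so_memcpy programs its own minimal pipeline; emitting the packet with
+ * its default (zeroed) fields is enough to switch replication off here.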
*/ + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); +#endif + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.VertexAccessType = SEQUENTIAL; prim.PrimitiveTopologyType = _3DPRIM_POINTLIST; diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index 70d77b4cd64..2c1d7545b72 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -2106,6 +2106,32 @@ compute_kill_pixel(struct anv_graphics_pipeline *pipeline, (ms_info && ms_info->alphaToCoverageEnable); } +#if GEN_GEN == 12 +static void +emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline) +{ + if (!pipeline->use_primitive_replication) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); + return; + } + + uint32_t view_mask = pipeline->subpass->view_mask; + int view_count = util_bitcount(view_mask); + assert(view_count > 1 && view_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION); + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) { + pr.ReplicaMask = (1 << view_count) - 1; + pr.ReplicationCount = view_count - 1; + + int i = 0, view_index; + for_each_bit(view_index, view_mask) { + pr.RTAIOffset[i] = view_index; + i++; + } + } +} +#endif + static VkResult genX(graphics_pipeline_create)( VkDevice _device, @@ -2181,6 +2207,10 @@ genX(graphics_pipeline_create)( pCreateInfo->pRasterizationState); emit_3dstate_streamout(pipeline, pCreateInfo->pRasterizationState); +#if GEN_GEN == 12 + emit_3dstate_primitive_replication(pipeline); +#endif + #if 0 /* From gen7_vs_state.c */ -- 2.30.2
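For reference, the following standalone sketch (not part of the patch) mirrors
the for_each_bit loop in emit_3dstate_primitive_replication() above, using
GCC/Clang builtins in place of Mesa's util helpers, and shows how the
3DSTATE_PRIMITIVE_REPLICATION fields fall out of a sparse view mask:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       /* Views 0 and 2 active: the loop index does not match the view index,
        * which is why RTAIOffset carries the actual view numbers.
        */
       uint32_t view_mask = 0x5;
       int view_count = __builtin_popcount(view_mask);         /* 2   */

       uint32_t replica_mask = (1u << view_count) - 1;         /* 0x3 */
       int replication_count = view_count - 1;                 /* 1   */

       int rtai_offset[16];   /* MAX_VIEWS_FOR_PRIMITIVE_REPLICATION */
       int i = 0;
       for (uint32_t mask = view_mask; mask; mask &= mask - 1)
          rtai_offset[i++] = __builtin_ctz(mask);              /* {0, 2} */

       printf("ReplicaMask=0x%x ReplicationCount=%d RTAIOffset={%d,%d}\n",
              replica_mask, replication_count, rtai_offset[0], rtai_offset[1]);
       return 0;
    }

With a contiguous mask such as 0x3 the offsets are simply {0, 1}, and the loop
index and view index coincide.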