shader_variant_flags |= 1 << 1;
if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false, false) == 32)
shader_variant_flags |= 1 << 2;
- if (sel->info.stage == MESA_SHADER_FRAGMENT && sel->info.uses_derivatives && sel->info.base.fs.uses_discard &&
+ if (sel->info.stage == MESA_SHADER_FRAGMENT &&
+ /* Derivatives imply helper invocations so check for needs_helper_invocations. */
+ sel->info.base.fs.needs_helper_invocations &&
+ sel->info.base.fs.uses_discard &&
sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
shader_variant_flags |= 1 << 3;
unsigned gs_num_invocations = sel->info.base.gs.invocations;
struct si_pm4_state *pm4;
uint64_t va;
- unsigned max_stream = sel->max_gs_stream;
+ unsigned max_stream = util_last_bit(sel->info.base.gs.active_stream_mask);
unsigned offset;
pm4 = si_get_shader_pm4_state(shader);
offset = num_components[0] * sel->info.base.gs.vertices_out;
shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset;
- if (max_stream >= 1)
+ if (max_stream >= 2)
offset += num_components[1] * sel->info.base.gs.vertices_out;
shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset;
- if (max_stream >= 2)
+ if (max_stream >= 3)
offset += num_components[2] * sel->info.base.gs.vertices_out;
shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset;
- if (max_stream >= 3)
+ if (max_stream >= 4)
offset += num_components[3] * sel->info.base.gs.vertices_out;
shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset;
shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->info.base.gs.vertices_out;
shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0];
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0;
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 2) ? num_components[2] : 0;
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 3) ? num_components[3] : 0;
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 2) ? num_components[1] : 0;
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 3) ? num_components[2] : 0;
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 4) ? num_components[3] : 0;
shader->ctx_reg.gs.vgt_gs_instance_cnt =
S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);
return PIPE_PRIM_TRIANGLES; /* worst case for all callers */
}
-static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, bool ngg)
+static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel,
+ const struct si_shader *shader, bool ngg)
{
- bool misc_vec_ena = sel->info.writes_psize || (sel->info.writes_edgeflag && !ngg) ||
+ bool writes_psize = sel->info.writes_psize;
+
+ if (shader)
+ writes_psize &= !shader->key.opt.kill_pointsize;
+
+ bool misc_vec_ena = writes_psize || (sel->info.writes_edgeflag && !ngg) ||
sel->info.writes_layer || sel->info.writes_viewport_index;
- return S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) |
+ return S_02881C_USE_VTX_POINT_SIZE(writes_psize) |
S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) |
S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) |
S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) |
S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_stage == MESA_SHADER_VERTEX) |
/* Reuse for NGG. */
S_028838_VERTEX_REUSE_DEPTH(sscreen->info.chip_class >= GFX10_3 ? 30 : 0);
- shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true);
+ shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, true);
/* Oversubscribe PC. This improves performance when there are too many varyings. */
float oversub_pc_factor = 0.25;
: V_02870C_SPI_SHADER_NONE);
shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1);
- shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, false);
+ shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, false);
oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
key->opt.kill_outputs = ~linked & outputs_written;
key->opt.ngg_culling = sctx->ngg_culling;
+
+ if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
+ key->mono.u.vs_export_prim_id = 1;
+
+ /* We need PKT3_CONTEXT_REG_RMW, which we currently only use on GFX10+. */
+ if (sctx->chip_class >= GFX10 &&
+ vs->info.writes_psize &&
+ sctx->current_rast_prim != PIPE_PRIM_POINTS &&
+ !sctx->queued.named.rasterizer->polygon_mode_is_points)
+ key->opt.kill_pointsize = 1;
}
/* Compute the key for the hw shader variant */
} else {
key->as_ngg = stages_key.u.ngg;
si_shader_selector_key_hw_vs(sctx, sel, key);
-
- if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
- key->mono.u.vs_export_prim_id = 1;
}
break;
case MESA_SHADER_TESS_CTRL:
key->as_es = 1;
else {
si_shader_selector_key_hw_vs(sctx, sel, key);
-
- if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
- key->mono.u.vs_export_prim_id = 1;
}
break;
case MESA_SHADER_GEOMETRY:
sel->gsvs_vertex_size = sel->info.num_outputs * 16;
sel->max_gsvs_emit_size = sel->gsvs_vertex_size * sel->info.base.gs.vertices_out;
-
- sel->max_gs_stream = 0;
- for (i = 0; i < sel->so.num_outputs; i++)
- sel->max_gs_stream = MAX2(sel->max_gs_stream, sel->so.output[i].stream);
-
sel->gs_input_verts_per_prim =
u_vertices_per_prim(sel->info.base.gs.input_primitive);
/* PA_CL_VS_OUT_CNTL */
if (sctx->chip_class <= GFX9)
- sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, false);
+ sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, NULL, false);
sel->clipdist_mask = sel->info.writes_clipvertex ? SIX_BITS :
u_bit_consecutive(0, sel->info.base.clip_distance_array_size);