From 012773be26aafb71ab232a5838d8e5e7dcc3dc55 Mon Sep 17 00:00:00 2001 From: Brian Ho Date: Wed, 1 Apr 2020 11:50:55 -0700 Subject: [PATCH] turnip: Configure VPC for geometry shaders This commit updates tu6_emit_vpc to selectively emit GS-specifc configuration. Most of this is repurposed from fd6_program.c. This also refactors `link_geometry_stages` to ir3_nir_lower_tess.c so it can be shared between fd and tu. Part-of: --- src/freedreno/ir3/ir3_nir.h | 4 + src/freedreno/ir3/ir3_nir_lower_tess.c | 36 ++++ src/freedreno/vulkan/tu_pipeline.c | 163 +++++++++++++++--- .../drivers/freedreno/ir3/ir3_gallium.c | 38 +--- 4 files changed, 181 insertions(+), 60 deletions(-) diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index a42c8822b4a..cebc9e64dd3 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -60,4 +60,8 @@ bool ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader); nir_ssa_def * ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset, int32_t shift); +uint32_t ir3_link_geometry_stages(const struct ir3_shader_variant *producer, + const struct ir3_shader_variant *consumer, + uint32_t *locs); + #endif /* IR3_NIR_H_ */ diff --git a/src/freedreno/ir3/ir3_nir_lower_tess.c b/src/freedreno/ir3/ir3_nir_lower_tess.c index b60b0b0b3d7..2f2abc10bbb 100644 --- a/src/freedreno/ir3/ir3_nir_lower_tess.c +++ b/src/freedreno/ir3/ir3_nir_lower_tess.c @@ -956,3 +956,39 @@ ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s) nir_print_shader(shader, stderr); } } + +uint32_t +ir3_link_geometry_stages(const struct ir3_shader_variant *producer, + const struct ir3_shader_variant *consumer, + uint32_t *locs) +{ + uint32_t num_loc = 0, factor; + + switch (consumer->type) { + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_GEOMETRY: + /* These stages load with ldlw, which expects byte offsets. */ + factor = 4; + break; + case MESA_SHADER_TESS_EVAL: + /* The tess eval shader uses ldg, which takes dword offsets. */ + factor = 1; + break; + default: + unreachable("bad shader stage"); + } + + nir_foreach_variable(in_var, &consumer->shader->nir->inputs) { + nir_foreach_variable(out_var, &producer->shader->nir->outputs) { + if (in_var->data.location == out_var->data.location) { + locs[in_var->data.driver_location] = + producer->shader->output_loc[out_var->data.driver_location] * factor; + + debug_assert(num_loc <= in_var->data.driver_location + 1); + num_loc = in_var->data.driver_location + 1; + } + } + } + + return num_loc; +} diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index b004b7e5bc2..d82950be244 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -27,6 +27,7 @@ #include "tu_private.h" +#include "ir3/ir3_nir.h" #include "main/menums.h" #include "nir/nir.h" #include "nir/nir_builder.h" @@ -686,18 +687,51 @@ tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base, tu_cs_emit_array(cs, dwords, size); } +static void +tu6_emit_link_map(struct tu_cs *cs, + const struct ir3_shader_variant *producer, + const struct ir3_shader_variant *consumer) { + const struct ir3_const_state *const_state = &consumer->shader->const_state; + uint32_t base = const_state->offsets.primitive_map; + uint32_t patch_locs[MAX_VARYING] = { }, num_loc; + num_loc = ir3_link_geometry_stages(producer, consumer, patch_locs); + int size = DIV_ROUND_UP(num_loc, 4); + + size = (MIN2(size + base, consumer->constlen) - base) * 4; + + tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, SB6_GS_SHADER, 0, size, + patch_locs); +} + +static uint16_t +gl_primitive_to_tess(uint16_t primitive) { + switch (primitive) { + case GL_POINTS: + return TESS_POINTS; + case GL_LINE_STRIP: + return TESS_LINES; + case GL_TRIANGLE_STRIP: + return TESS_CW_TRIS; + default: + unreachable(""); + } +} + static void tu6_emit_vpc(struct tu_cs *cs, const struct ir3_shader_variant *vs, + const struct ir3_shader_variant *gs, const struct ir3_shader_variant *fs, bool binning_pass, struct tu_streamout_state *tf) { + bool has_gs = gs->type != MESA_SHADER_NONE; + const struct ir3_shader_variant *last_shader = has_gs ? gs : vs; struct ir3_shader_linkage linkage = { 0 }; - ir3_link_shaders(&linkage, vs, fs); + ir3_link_shaders(&linkage, last_shader, fs); - if (vs->shader->stream_output.num_outputs) - tu6_link_streamout(&linkage, vs); + if (last_shader->shader->stream_output.num_outputs) + tu6_link_streamout(&linkage, last_shader); BITSET_DECLARE(vpc_var_enables, 128) = { 0 }; for (uint32_t i = 0; i < linkage.cnt; i++) { @@ -714,10 +748,17 @@ tu6_emit_vpc(struct tu_cs *cs, /* a6xx finds position/pointsize at the end */ const uint32_t position_regid = - ir3_find_output_regid(vs, VARYING_SLOT_POS); + ir3_find_output_regid(last_shader, VARYING_SLOT_POS); const uint32_t pointsize_regid = - ir3_find_output_regid(vs, VARYING_SLOT_PSIZ); - uint32_t pointsize_loc = 0xff, position_loc = 0xff; + ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ); + const uint32_t layer_regid = has_gs ? + ir3_find_output_regid(gs, VARYING_SLOT_LAYER) : regid(63, 0); + + uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff; + if (layer_regid != regid(63, 0)) { + layer_loc = linkage.max_loc; + ir3_link_add(&linkage, layer_regid, 0x1, linkage.max_loc); + } if (position_regid != regid(63, 0)) { position_loc = linkage.max_loc; ir3_link_add(&linkage, position_regid, 0xf, linkage.max_loc); @@ -727,30 +768,34 @@ tu6_emit_vpc(struct tu_cs *cs, ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc); } - if (vs->shader->stream_output.num_outputs) - tu6_setup_streamout(vs, &linkage, tf); + if (last_shader->shader->stream_output.num_outputs) + tu6_setup_streamout(last_shader, &linkage, tf); - /* map vs outputs to VPC */ + /* map outputs of the last shader to VPC */ assert(linkage.cnt <= 32); - const uint32_t sp_vs_out_count = (linkage.cnt + 1) / 2; - const uint32_t sp_vs_vpc_dst_count = (linkage.cnt + 3) / 4; - uint32_t sp_vs_out[16]; - uint32_t sp_vs_vpc_dst[8]; - sp_vs_out[sp_vs_out_count - 1] = 0; - sp_vs_vpc_dst[sp_vs_vpc_dst_count - 1] = 0; + const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2); + const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4); + uint32_t sp_out[16]; + uint32_t sp_vpc_dst[8]; for (uint32_t i = 0; i < linkage.cnt; i++) { - ((uint16_t *) sp_vs_out)[i] = + ((uint16_t *) sp_out)[i] = A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) | A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask); - ((uint8_t *) sp_vs_vpc_dst)[i] = + ((uint8_t *) sp_vpc_dst)[i] = A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc); } - tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_OUT_REG(0), sp_vs_out_count); - tu_cs_emit_array(cs, sp_vs_out, sp_vs_out_count); + if (has_gs) + tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_OUT_REG(0), sp_out_count); + else + tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_OUT_REG(0), sp_out_count); + tu_cs_emit_array(cs, sp_out, sp_out_count); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_VPC_DST_REG(0), sp_vs_vpc_dst_count); - tu_cs_emit_array(cs, sp_vs_vpc_dst, sp_vs_vpc_dst_count); + if (has_gs) + tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_VPC_DST_REG(0), sp_vpc_dst_count); + else + tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_VPC_DST_REG(0), sp_vpc_dst_count); + tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count); tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1); tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) | @@ -762,12 +807,84 @@ tu6_emit_vpc(struct tu_cs *cs, A6XX_VPC_PACK_PSIZELOC(pointsize_loc) | A6XX_VPC_PACK_STRIDE_IN_VPC(linkage.max_loc)); + if (has_gs) { + tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CTRL_REG0, 1); + tu_cs_emit(cs, A6XX_SP_GS_CTRL_REG0_THREADSIZE(TWO_QUADS) | + A6XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT(gs->info.max_reg + 1) | + A6XX_SP_GS_CTRL_REG0_BRANCHSTACK(gs->branchstack) | + COND(gs->need_pixlod, A6XX_SP_GS_CTRL_REG0_PIXLODENABLE)); + + tu6_emit_link_map(cs, vs, gs); + + uint32_t primitive_regid = + ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID); + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_PACK_GS, 1); + tu_cs_emit(cs, A6XX_VPC_PACK_GS_POSITIONLOC(position_loc) | + A6XX_VPC_PACK_GS_PSIZELOC(pointsize_loc) | + A6XX_VPC_PACK_GS_STRIDE_IN_VPC(linkage.max_loc)); + + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_UNKNOWN_9105, 1); + tu_cs_emit(cs, A6XX_VPC_UNKNOWN_9105_LAYERLOC(layer_loc) | 0xff00); + + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_809C, 1); + tu_cs_emit(cs, CONDREG(layer_regid, + A6XX_GRAS_UNKNOWN_809C_GS_WRITES_LAYER)); + + uint32_t flags_regid = ir3_find_output_regid(gs, + VARYING_SLOT_GS_VERTEX_FLAGS_IR3); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_PRIMITIVE_CNTL_GS, 1); + tu_cs_emit(cs, A6XX_SP_PRIMITIVE_CNTL_GS_GSOUT(linkage.cnt) | + A6XX_SP_PRIMITIVE_CNTL_GS_FLAGS_REGID(flags_regid)); + + tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_2, 1); + tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_2_STRIDE_IN_VPC(linkage.max_loc) | + CONDREG(pointsize_regid, A6XX_PC_PRIMITIVE_CNTL_2_PSIZE) | + CONDREG(layer_regid, A6XX_PC_PRIMITIVE_CNTL_2_LAYER) | + CONDREG(primitive_regid, A6XX_PC_PRIMITIVE_CNTL_2_PRIMITIVE_ID)); + + uint32_t vertices_out = gs->shader->nir->info.gs.vertices_out - 1; + uint16_t output = + gl_primitive_to_tess(gs->shader->nir->info.gs.output_primitive); + uint32_t invocations = gs->shader->nir->info.gs.invocations - 1; + tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1); + tu_cs_emit(cs, + A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) | + A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | + A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations)); + + tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_3, 1); + tu_cs_emit(cs, 0); + + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_8003, 1); + tu_cs_emit(cs, 0); + + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_UNKNOWN_9100, 1); + tu_cs_emit(cs, 0xff); + + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_UNKNOWN_9102, 1); + tu_cs_emit(cs, 0xffff00); + + /* Size of per-primitive alloction in ldlw memory in vec4s. */ + uint32_t vec4_size = + gs->shader->nir->info.gs.vertices_in * + DIV_ROUND_UP(vs->shader->output_size, 4); + tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); + tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); + + tu_cs_emit_pkt4(cs, REG_A6XX_PC_UNKNOWN_9B07, 1); + tu_cs_emit(cs, 0); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_UNKNOWN_A871, 1); + tu_cs_emit(cs, vs->shader->output_size); + } + tu_cs_emit_pkt4(cs, REG_A6XX_SP_PRIMITIVE_CNTL, 1); tu_cs_emit(cs, A6XX_SP_PRIMITIVE_CNTL_VSOUT(linkage.cnt)); tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_1, 1); tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_1_STRIDE_IN_VPC(linkage.max_loc) | - (vs->writes_psize ? A6XX_PC_PRIMITIVE_CNTL_1_PSIZE : 0)); + (last_shader->writes_psize ? A6XX_PC_PRIMITIVE_CNTL_1_PSIZE : 0)); } static int @@ -1227,7 +1344,7 @@ tu6_emit_program(struct tu_cs *cs, tu6_emit_fs_config(cs, builder->shaders[MESA_SHADER_FRAGMENT], fs); tu6_emit_vs_system_values(cs, vs); - tu6_emit_vpc(cs, vs, fs, binning_pass, tf); + tu6_emit_vpc(cs, vs, gs, fs, binning_pass, tf); tu6_emit_vpc_varying_modes(cs, fs, binning_pass); tu6_emit_fs_inputs(cs, fs); tu6_emit_fs_outputs(cs, fs, builder->color_attachment_count); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c index 31dc82b1b6e..56972a81b9d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c @@ -412,42 +412,6 @@ ir3_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v } } -static uint32_t -link_geometry_stages(const struct ir3_shader_variant *producer, - const struct ir3_shader_variant *consumer, - uint32_t *locs) -{ - uint32_t num_loc = 0, factor; - - switch (consumer->type) { - case MESA_SHADER_TESS_CTRL: - case MESA_SHADER_GEOMETRY: - /* These stages load with ldlw, which expects byte offsets. */ - factor = 4; - break; - case MESA_SHADER_TESS_EVAL: - /* The tess eval shader uses ldg, which takes dword offsets. */ - factor = 1; - break; - default: - unreachable("bad shader stage"); - } - - nir_foreach_variable(in_var, &consumer->shader->nir->inputs) { - nir_foreach_variable(out_var, &producer->shader->nir->outputs) { - if (in_var->data.location == out_var->data.location) { - locs[in_var->data.driver_location] = - producer->shader->output_loc[out_var->data.driver_location] * factor; - - debug_assert(num_loc <= in_var->data.driver_location + 1); - num_loc = in_var->data.driver_location + 1; - } - } - } - - return num_loc; -} - void ir3_emit_link_map(struct fd_screen *screen, const struct ir3_shader_variant *producer, @@ -457,7 +421,7 @@ ir3_emit_link_map(struct fd_screen *screen, uint32_t base = const_state->offsets.primitive_map; uint32_t patch_locs[MAX_VARYING] = { }, num_loc; - num_loc = link_geometry_stages(producer, v, patch_locs); + num_loc = ir3_link_geometry_stages(producer, v, patch_locs); int size = DIV_ROUND_UP(num_loc, 4); -- 2.30.2