turnip: Configure VPC for geometry shaders
authorBrian Ho <brian@brkho.com>
Wed, 1 Apr 2020 18:50:55 +0000 (11:50 -0700)
committerMarge Bot <eric+marge@anholt.net>
Tue, 7 Apr 2020 14:13:20 +0000 (14:13 +0000)
This commit updates tu6_emit_vpc to selectively emit GS-specifc
configuration. Most of this is repurposed from fd6_program.c.

This also refactors `link_geometry_stages` to ir3_nir_lower_tess.c
so it can be shared between fd and tu.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4436>

src/freedreno/ir3/ir3_nir.h
src/freedreno/ir3/ir3_nir_lower_tess.c
src/freedreno/vulkan/tu_pipeline.c
src/gallium/drivers/freedreno/ir3/ir3_gallium.c

index a42c8822b4a96531cdec279647a7f0fde96bb1f2..cebc9e64dd3a46e0527b8575dec9d62ca600521a 100644 (file)
@@ -60,4 +60,8 @@ bool ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader);
 nir_ssa_def *
 ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset, int32_t shift);
 
+uint32_t ir3_link_geometry_stages(const struct ir3_shader_variant *producer,
+               const struct ir3_shader_variant *consumer,
+               uint32_t *locs);
+
 #endif /* IR3_NIR_H_ */
index b60b0b0b3d718a961b3a9ae9b9efe1b1aa8d813f..2f2abc10bbbfd9f1084d1faf8d8bd7dc77746719 100644 (file)
@@ -956,3 +956,39 @@ ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s)
                nir_print_shader(shader, stderr);
        }
 }
+
+uint32_t
+ir3_link_geometry_stages(const struct ir3_shader_variant *producer,
+               const struct ir3_shader_variant *consumer,
+               uint32_t *locs)
+{
+       uint32_t num_loc = 0, factor;
+
+       switch (consumer->type) {
+       case MESA_SHADER_TESS_CTRL:
+       case MESA_SHADER_GEOMETRY:
+               /* These stages load with ldlw, which expects byte offsets. */
+               factor = 4;
+               break;
+       case MESA_SHADER_TESS_EVAL:
+               /* The tess eval shader uses ldg, which takes dword offsets. */
+               factor = 1;
+               break;
+       default:
+               unreachable("bad shader stage");
+       }
+
+       nir_foreach_variable(in_var, &consumer->shader->nir->inputs) {
+               nir_foreach_variable(out_var, &producer->shader->nir->outputs) {
+                       if (in_var->data.location == out_var->data.location) {
+                               locs[in_var->data.driver_location] =
+                                       producer->shader->output_loc[out_var->data.driver_location] * factor;
+
+                               debug_assert(num_loc <= in_var->data.driver_location + 1);
+                               num_loc = in_var->data.driver_location + 1;
+                       }
+               }
+       }
+
+       return num_loc;
+}
index b004b7e5bc274beea465c1eeced403fe610ca922..d82950be2443eef9c6cd1d6c987a7dceda61541a 100644 (file)
@@ -27,6 +27,7 @@
 
 #include "tu_private.h"
 
+#include "ir3/ir3_nir.h"
 #include "main/menums.h"
 #include "nir/nir.h"
 #include "nir/nir_builder.h"
@@ -686,18 +687,51 @@ tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
    tu_cs_emit_array(cs, dwords, size);
 }
 
+static void
+tu6_emit_link_map(struct tu_cs *cs,
+                  const struct ir3_shader_variant *producer,
+                  const struct ir3_shader_variant *consumer) {
+   const struct ir3_const_state *const_state = &consumer->shader->const_state;
+   uint32_t base = const_state->offsets.primitive_map;
+   uint32_t patch_locs[MAX_VARYING] = { }, num_loc;
+   num_loc = ir3_link_geometry_stages(producer, consumer, patch_locs);
+   int size = DIV_ROUND_UP(num_loc, 4);
+
+   size = (MIN2(size + base, consumer->constlen) - base) * 4;
+
+   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, SB6_GS_SHADER, 0, size,
+                  patch_locs);
+}
+
+static uint16_t
+gl_primitive_to_tess(uint16_t primitive) {
+   switch (primitive) {
+   case GL_POINTS:
+      return TESS_POINTS;
+   case GL_LINE_STRIP:
+      return TESS_LINES;
+   case GL_TRIANGLE_STRIP:
+      return TESS_CW_TRIS;
+   default:
+      unreachable("");
+   }
+}
+
 static void
 tu6_emit_vpc(struct tu_cs *cs,
              const struct ir3_shader_variant *vs,
+             const struct ir3_shader_variant *gs,
              const struct ir3_shader_variant *fs,
              bool binning_pass,
              struct tu_streamout_state *tf)
 {
+   bool has_gs = gs->type != MESA_SHADER_NONE;
+   const struct ir3_shader_variant *last_shader = has_gs ? gs : vs;
    struct ir3_shader_linkage linkage = { 0 };
-   ir3_link_shaders(&linkage, vs, fs);
+   ir3_link_shaders(&linkage, last_shader, fs);
 
-   if (vs->shader->stream_output.num_outputs)
-      tu6_link_streamout(&linkage, vs);
+   if (last_shader->shader->stream_output.num_outputs)
+      tu6_link_streamout(&linkage, last_shader);
 
    BITSET_DECLARE(vpc_var_enables, 128) = { 0 };
    for (uint32_t i = 0; i < linkage.cnt; i++) {
@@ -714,10 +748,17 @@ tu6_emit_vpc(struct tu_cs *cs,
 
    /* a6xx finds position/pointsize at the end */
    const uint32_t position_regid =
-      ir3_find_output_regid(vs, VARYING_SLOT_POS);
+      ir3_find_output_regid(last_shader, VARYING_SLOT_POS);
    const uint32_t pointsize_regid =
-      ir3_find_output_regid(vs, VARYING_SLOT_PSIZ);
-   uint32_t pointsize_loc = 0xff, position_loc = 0xff;
+      ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
+   const uint32_t layer_regid = has_gs ?
+      ir3_find_output_regid(gs, VARYING_SLOT_LAYER) : regid(63, 0);
+
+   uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff;
+   if (layer_regid != regid(63, 0)) {
+      layer_loc = linkage.max_loc;
+      ir3_link_add(&linkage, layer_regid, 0x1, linkage.max_loc);
+   }
    if (position_regid != regid(63, 0)) {
       position_loc = linkage.max_loc;
       ir3_link_add(&linkage, position_regid, 0xf, linkage.max_loc);
@@ -727,30 +768,34 @@ tu6_emit_vpc(struct tu_cs *cs,
       ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc);
    }
 
-   if (vs->shader->stream_output.num_outputs)
-      tu6_setup_streamout(vs, &linkage, tf);
+   if (last_shader->shader->stream_output.num_outputs)
+      tu6_setup_streamout(last_shader, &linkage, tf);
 
-   /* map vs outputs to VPC */
+   /* map outputs of the last shader to VPC */
    assert(linkage.cnt <= 32);
-   const uint32_t sp_vs_out_count = (linkage.cnt + 1) / 2;
-   const uint32_t sp_vs_vpc_dst_count = (linkage.cnt + 3) / 4;
-   uint32_t sp_vs_out[16];
-   uint32_t sp_vs_vpc_dst[8];
-   sp_vs_out[sp_vs_out_count - 1] = 0;
-   sp_vs_vpc_dst[sp_vs_vpc_dst_count - 1] = 0;
+   const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
+   const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
+   uint32_t sp_out[16];
+   uint32_t sp_vpc_dst[8];
    for (uint32_t i = 0; i < linkage.cnt; i++) {
-      ((uint16_t *) sp_vs_out)[i] =
+      ((uint16_t *) sp_out)[i] =
          A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
          A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
-      ((uint8_t *) sp_vs_vpc_dst)[i] =
+      ((uint8_t *) sp_vpc_dst)[i] =
          A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
    }
 
-   tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_OUT_REG(0), sp_vs_out_count);
-   tu_cs_emit_array(cs, sp_vs_out, sp_vs_out_count);
+   if (has_gs)
+      tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_OUT_REG(0), sp_out_count);
+   else
+      tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_OUT_REG(0), sp_out_count);
+   tu_cs_emit_array(cs, sp_out, sp_out_count);
 
-   tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_VPC_DST_REG(0), sp_vs_vpc_dst_count);
-   tu_cs_emit_array(cs, sp_vs_vpc_dst, sp_vs_vpc_dst_count);
+   if (has_gs)
+      tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_VPC_DST_REG(0), sp_vpc_dst_count);
+   else
+      tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_VPC_DST_REG(0), sp_vpc_dst_count);
+   tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
    tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) |
@@ -762,12 +807,84 @@ tu6_emit_vpc(struct tu_cs *cs,
                      A6XX_VPC_PACK_PSIZELOC(pointsize_loc) |
                      A6XX_VPC_PACK_STRIDE_IN_VPC(linkage.max_loc));
 
+   if (has_gs) {
+      tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CTRL_REG0, 1);
+      tu_cs_emit(cs, A6XX_SP_GS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
+            A6XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT(gs->info.max_reg + 1) |
+            A6XX_SP_GS_CTRL_REG0_BRANCHSTACK(gs->branchstack) |
+            COND(gs->need_pixlod, A6XX_SP_GS_CTRL_REG0_PIXLODENABLE));
+
+      tu6_emit_link_map(cs, vs, gs);
+
+      uint32_t primitive_regid =
+            ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID);
+      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_PACK_GS, 1);
+      tu_cs_emit(cs, A6XX_VPC_PACK_GS_POSITIONLOC(position_loc) |
+             A6XX_VPC_PACK_GS_PSIZELOC(pointsize_loc) |
+             A6XX_VPC_PACK_GS_STRIDE_IN_VPC(linkage.max_loc));
+
+      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_UNKNOWN_9105, 1);
+      tu_cs_emit(cs, A6XX_VPC_UNKNOWN_9105_LAYERLOC(layer_loc) | 0xff00);
+
+      tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_809C, 1);
+      tu_cs_emit(cs, CONDREG(layer_regid,
+            A6XX_GRAS_UNKNOWN_809C_GS_WRITES_LAYER));
+
+      uint32_t flags_regid = ir3_find_output_regid(gs,
+            VARYING_SLOT_GS_VERTEX_FLAGS_IR3);
+
+      tu_cs_emit_pkt4(cs, REG_A6XX_SP_PRIMITIVE_CNTL_GS, 1);
+      tu_cs_emit(cs, A6XX_SP_PRIMITIVE_CNTL_GS_GSOUT(linkage.cnt) |
+            A6XX_SP_PRIMITIVE_CNTL_GS_FLAGS_REGID(flags_regid));
+
+      tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_2, 1);
+      tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_2_STRIDE_IN_VPC(linkage.max_loc) |
+            CONDREG(pointsize_regid, A6XX_PC_PRIMITIVE_CNTL_2_PSIZE) |
+            CONDREG(layer_regid, A6XX_PC_PRIMITIVE_CNTL_2_LAYER) |
+            CONDREG(primitive_regid, A6XX_PC_PRIMITIVE_CNTL_2_PRIMITIVE_ID));
+
+      uint32_t vertices_out = gs->shader->nir->info.gs.vertices_out - 1;
+      uint16_t output =
+            gl_primitive_to_tess(gs->shader->nir->info.gs.output_primitive);
+      uint32_t invocations = gs->shader->nir->info.gs.invocations - 1;
+      tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
+      tu_cs_emit(cs,
+            A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
+            A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
+            A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));
+
+      tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_3, 1);
+      tu_cs_emit(cs, 0);
+
+      tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_8003, 1);
+      tu_cs_emit(cs, 0);
+
+      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_UNKNOWN_9100, 1);
+      tu_cs_emit(cs, 0xff);
+
+      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_UNKNOWN_9102, 1);
+      tu_cs_emit(cs, 0xffff00);
+
+      /* Size of per-primitive alloction in ldlw memory in vec4s. */
+      uint32_t vec4_size =
+         gs->shader->nir->info.gs.vertices_in *
+         DIV_ROUND_UP(vs->shader->output_size, 4);
+      tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
+      tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
+
+      tu_cs_emit_pkt4(cs, REG_A6XX_PC_UNKNOWN_9B07, 1);
+      tu_cs_emit(cs, 0);
+
+      tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_UNKNOWN_A871, 1);
+      tu_cs_emit(cs, vs->shader->output_size);
+   }
+
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_PRIMITIVE_CNTL, 1);
    tu_cs_emit(cs, A6XX_SP_PRIMITIVE_CNTL_VSOUT(linkage.cnt));
 
    tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_1, 1);
    tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_1_STRIDE_IN_VPC(linkage.max_loc) |
-                     (vs->writes_psize ? A6XX_PC_PRIMITIVE_CNTL_1_PSIZE : 0));
+         (last_shader->writes_psize ? A6XX_PC_PRIMITIVE_CNTL_1_PSIZE : 0));
 }
 
 static int
@@ -1227,7 +1344,7 @@ tu6_emit_program(struct tu_cs *cs,
    tu6_emit_fs_config(cs, builder->shaders[MESA_SHADER_FRAGMENT], fs);
 
    tu6_emit_vs_system_values(cs, vs);
-   tu6_emit_vpc(cs, vs, fs, binning_pass, tf);
+   tu6_emit_vpc(cs, vs, gs, fs, binning_pass, tf);
    tu6_emit_vpc_varying_modes(cs, fs, binning_pass);
    tu6_emit_fs_inputs(cs, fs);
    tu6_emit_fs_outputs(cs, fs, builder->color_attachment_count);
index 31dc82b1b6e445cc62ef00dde470afa8263ea459..56972a81b9d55eab1d0e27b966d70148908bf27e 100644 (file)
@@ -412,42 +412,6 @@ ir3_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v
        }
 }
 
-static uint32_t
-link_geometry_stages(const struct ir3_shader_variant *producer,
-               const struct ir3_shader_variant *consumer,
-               uint32_t *locs)
-{
-       uint32_t num_loc = 0, factor;
-
-       switch (consumer->type) {
-       case MESA_SHADER_TESS_CTRL:
-       case MESA_SHADER_GEOMETRY:
-               /* These stages load with ldlw, which expects byte offsets. */
-               factor = 4;
-               break;
-       case MESA_SHADER_TESS_EVAL:
-               /* The tess eval shader uses ldg, which takes dword offsets. */
-               factor = 1;
-               break;
-       default:
-               unreachable("bad shader stage");
-       }
-
-       nir_foreach_variable(in_var, &consumer->shader->nir->inputs) {
-               nir_foreach_variable(out_var, &producer->shader->nir->outputs) {
-                       if (in_var->data.location == out_var->data.location) {
-                               locs[in_var->data.driver_location] =
-                                       producer->shader->output_loc[out_var->data.driver_location] * factor;
-
-                               debug_assert(num_loc <= in_var->data.driver_location + 1);
-                               num_loc = in_var->data.driver_location + 1;
-                       }
-               }
-       }
-
-       return num_loc;
-}
-
 void
 ir3_emit_link_map(struct fd_screen *screen,
                const struct ir3_shader_variant *producer,
@@ -457,7 +421,7 @@ ir3_emit_link_map(struct fd_screen *screen,
        uint32_t base = const_state->offsets.primitive_map;
        uint32_t patch_locs[MAX_VARYING] = { }, num_loc;
 
-       num_loc = link_geometry_stages(producer, v, patch_locs);
+       num_loc = ir3_link_geometry_stages(producer, v, patch_locs);
 
        int size = DIV_ROUND_UP(num_loc, 4);