From: Hyunjun Ko Date: Tue, 25 Feb 2020 01:07:25 +0000 (+0900) Subject: turnip: Setup stream-output when linking program X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=374406a7c420d266f920461f904864a94dc1b8c8;p=mesa.git turnip: Setup stream-output when linking program Mostly taken from fd6_program.c. v2. Note that it forces to use full VS instead of binning pass VS if there's stream output as the binning pass VS will have outputs on other than position/psize stripped out, which is the same as freedreno. v3. fix indentation. v4. Use register index instead of location when setup streamout. Signed-off-by: Hyunjun Ko Reviewed-by: Jonathan Marek Part-of: --- diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index eb920ec9172..c739ad98e47 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -562,17 +562,122 @@ tu6_emit_vs_system_values(struct tu_cs *cs, tu_cs_emit(cs, 0x00000000); /* VFD_CONTROL_6 */ } +/* Add any missing varyings needed for stream-out. Otherwise varyings not + * used by fragment shader will be stripped out. + */ +static void +tu6_link_streamout(struct ir3_shader_linkage *l, + const struct ir3_shader_variant *v) +{ + const struct ir3_stream_output_info *info = &v->shader->stream_output; + + /* + * First, any stream-out varyings not already in linkage map (ie. also + * consumed by frag shader) need to be added: + */ + for (unsigned i = 0; i < info->num_outputs; i++) { + const struct ir3_stream_output *out = &info->output[i]; + unsigned compmask = + (1 << (out->num_components + out->start_component)) - 1; + unsigned k = out->register_index; + unsigned idx, nextloc = 0; + + /* psize/pos need to be the last entries in linkage map, and will + * get added link_stream_out, so skip over them: + */ + if (v->outputs[k].slot == VARYING_SLOT_PSIZ || + v->outputs[k].slot == VARYING_SLOT_POS) + continue; + + for (idx = 0; idx < l->cnt; idx++) { + if (l->var[idx].regid == v->outputs[k].regid) + break; + nextloc = MAX2(nextloc, l->var[idx].loc + 4); + } + + /* add if not already in linkage map: */ + if (idx == l->cnt) + ir3_link_add(l, v->outputs[k].regid, compmask, nextloc); + + /* expand component-mask if needed, ie streaming out all components + * but frag shader doesn't consume all components: + */ + if (compmask & ~l->var[idx].compmask) { + l->var[idx].compmask |= compmask; + l->max_loc = MAX2(l->max_loc, l->var[idx].loc + + util_last_bit(l->var[idx].compmask)); + } + } +} + +static void +tu6_setup_streamout(const struct ir3_shader_variant *v, + struct ir3_shader_linkage *l, struct tu_streamout_state *tf) +{ + const struct ir3_stream_output_info *info = &v->shader->stream_output; + + memset(tf, 0, sizeof(*tf)); + + tf->prog_count = align(l->max_loc, 2) / 2; + + debug_assert(tf->prog_count < ARRAY_SIZE(tf->prog)); + + /* set stride info to the streamout state */ + for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) + tf->stride[i] = info->stride[i]; + + for (unsigned i = 0; i < info->num_outputs; i++) { + const struct ir3_stream_output *out = &info->output[i]; + unsigned k = out->register_index; + unsigned idx; + + tf->ncomp[out->output_buffer] += out->num_components; + + /* linkage map sorted by order frag shader wants things, so + * a bit less ideal here.. + */ + for (idx = 0; idx < l->cnt; idx++) + if (l->var[idx].regid == v->outputs[k].regid) + break; + + debug_assert(idx < l->cnt); + + for (unsigned j = 0; j < out->num_components; j++) { + unsigned c = j + out->start_component; + unsigned loc = l->var[idx].loc + c; + unsigned off = j + out->dst_offset; /* in dwords */ + + if (loc & 1) { + tf->prog[loc/2] |= A6XX_VPC_SO_PROG_B_EN | + A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) | + A6XX_VPC_SO_PROG_B_OFF(off * 4); + } else { + tf->prog[loc/2] |= A6XX_VPC_SO_PROG_A_EN | + A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) | + A6XX_VPC_SO_PROG_A_OFF(off * 4); + } + } + } + + tf->vpc_so_buf_cntl = A6XX_VPC_SO_BUF_CNTL_ENABLE | + COND(tf->ncomp[0] > 0, A6XX_VPC_SO_BUF_CNTL_BUF0) | + COND(tf->ncomp[1] > 0, A6XX_VPC_SO_BUF_CNTL_BUF1) | + COND(tf->ncomp[2] > 0, A6XX_VPC_SO_BUF_CNTL_BUF2) | + COND(tf->ncomp[3] > 0, A6XX_VPC_SO_BUF_CNTL_BUF3); +} + static void tu6_emit_vpc(struct tu_cs *cs, const struct ir3_shader_variant *vs, const struct ir3_shader_variant *fs, - bool binning_pass) + bool binning_pass, + struct tu_streamout_state *tf) { struct ir3_shader_linkage linkage = { 0 }; ir3_link_shaders(&linkage, vs, fs); - if (vs->shader->stream_output.num_outputs && !binning_pass) - tu_finishme("stream output"); + if (vs->shader->stream_output.num_outputs) + tu6_link_streamout(&linkage, vs); BITSET_DECLARE(vpc_var_enables, 128) = { 0 }; for (uint32_t i = 0; i < linkage.cnt; i++) { @@ -602,6 +707,9 @@ tu6_emit_vpc(struct tu_cs *cs, ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc); } + if (vs->shader->stream_output.num_outputs) + tu6_setup_streamout(vs, &linkage, tf); + /* map vs outputs to VPC */ assert(linkage.cnt <= 32); const uint32_t sp_vs_out_count = (linkage.cnt + 1) / 2; @@ -1034,7 +1142,8 @@ static void tu6_emit_program(struct tu_cs *cs, const struct tu_pipeline_builder *builder, const struct tu_bo *binary_bo, - bool binning_pass) + bool binning_pass, + struct tu_streamout_state *tf) { static const struct ir3_shader_variant dummy_variant = { .type = MESA_SHADER_NONE @@ -1060,7 +1169,12 @@ tu6_emit_program(struct tu_cs *cs, : &dummy_variant; if (binning_pass) { - vs = &builder->shaders[MESA_SHADER_VERTEX]->variants[1]; + /* if we have streamout, use full VS in binning pass, as the + * binning pass VS will have outputs on other than position/psize + * stripped out: + */ + if (vs->shader->stream_output.num_outputs == 0) + vs = &builder->shaders[MESA_SHADER_VERTEX]->variants[1]; fs = &dummy_variant; } @@ -1071,7 +1185,7 @@ tu6_emit_program(struct tu_cs *cs, tu6_emit_fs_config(cs, builder->shaders[MESA_SHADER_FRAGMENT], fs); tu6_emit_vs_system_values(cs, vs); - tu6_emit_vpc(cs, vs, fs, binning_pass); + tu6_emit_vpc(cs, vs, fs, binning_pass, tf); tu6_emit_vpc_varying_modes(cs, fs, binning_pass); tu6_emit_fs_inputs(cs, fs); tu6_emit_fs_outputs(cs, fs, builder->color_attachment_count); @@ -1575,9 +1689,16 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder) if (builder->shaders[MESA_SHADER_VERTEX]->has_binning_pass) { const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX]; + const struct ir3_shader_variant *variant; + + if (vs->ir3_shader.stream_output.num_outputs) + variant = &vs->variants[0]; + else + variant = &vs->variants[1]; + builder->binning_vs_offset = builder->shader_total_size; builder->shader_total_size += - sizeof(uint32_t) * vs->variants[1].info.sizedwords; + sizeof(uint32_t) * variant->info.sizedwords; } return VK_SUCCESS; @@ -1609,8 +1730,19 @@ tu_pipeline_builder_upload_shaders(struct tu_pipeline_builder *builder, if (builder->shaders[MESA_SHADER_VERTEX]->has_binning_pass) { const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX]; - memcpy(bo->map + builder->binning_vs_offset, vs->binning_binary, - sizeof(uint32_t) * vs->variants[1].info.sizedwords); + const struct ir3_shader_variant *variant; + void *bin; + + if (vs->ir3_shader.stream_output.num_outputs) { + variant = &vs->variants[0]; + bin = vs->binary; + } else { + variant = &vs->variants[1]; + bin = vs->binning_binary; + } + + memcpy(bo->map + builder->binning_vs_offset, bin, + sizeof(uint32_t) * variant->info.sizedwords); } return VK_SUCCESS; @@ -1653,11 +1785,11 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder, { struct tu_cs prog_cs; tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs); - tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, false); + tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, false, &pipeline->streamout); pipeline->program.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs); tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs); - tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, true); + tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, true, &pipeline->streamout); pipeline->program.binning_state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs);