From a6291b1b1177f5728e2e1998225f0b8676c6e710 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Wed, 12 Aug 2020 21:59:33 -0400 Subject: [PATCH] freedreno/ir3: rework setup_{input,output} to make struct varyings work Rework setup_{input,output} to be called during emit_intrinsic, in a way which allows struct/array/matrix type varyings to work. This allows turnip to pass dEQP-VK.glsl.linkage.varying.struct.* Signed-off-by: Jonathan Marek Part-of: --- .gitlab-ci/deqp-freedreno-a630-fails.txt | 1 - src/freedreno/ir3/ir3_compiler_nir.c | 263 +++++++----------- src/freedreno/ir3/ir3_parser.y | 1 - src/freedreno/ir3/ir3_shader.h | 3 +- src/freedreno/vulkan/tu_pipeline.c | 3 +- .../drivers/freedreno/a3xx/fd3_program.c | 2 +- .../drivers/freedreno/a4xx/fd4_program.c | 2 +- .../drivers/freedreno/a5xx/fd5_program.c | 2 +- .../drivers/freedreno/a6xx/fd6_program.c | 2 +- 9 files changed, 110 insertions(+), 169 deletions(-) diff --git a/.gitlab-ci/deqp-freedreno-a630-fails.txt b/.gitlab-ci/deqp-freedreno-a630-fails.txt index 5808696ec51..8d06d4eaca3 100644 --- a/.gitlab-ci/deqp-freedreno-a630-fails.txt +++ b/.gitlab-ci/deqp-freedreno-a630-fails.txt @@ -3,7 +3,6 @@ dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_neg_z_and_p dEQP-VK.binding_model.descriptorset_random.sets4.constant.ubolimitlow.sbolimithigh.imglimithigh.noiub.uab.frag.ialimitlow.0 dEQP-VK.draw.output_location.array.b8g8r8a8-unorm-mediump-output-vec3 -dEQP-VK.glsl.linkage.varying.struct.mat3x2 dEQP-VK.graphicsfuzz.mat-array-deep-control-flow dEQP-VK.spirv_assembly.instruction.compute.float_controls.fp32.input_args.negate_denorm_preserve dEQP-VK.spirv_assembly.instruction.compute.float_controls.fp32.input_args.rounding_rtz_out_prod diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index ddd9bf4a4f5..1438d31e74c 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -65,14 +65,16 @@ create_input(struct ir3_context *ctx, unsigned compmask) } static struct ir3_instruction * -create_frag_input(struct ir3_context *ctx, bool use_ldlv, unsigned n) +create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord, unsigned n) { struct ir3_block *block = ctx->block; struct ir3_instruction *instr; /* packed inloc is fixed up later: */ struct ir3_instruction *inloc = create_immed(block, n); - if (use_ldlv) { + if (coord) { + instr = ir3_BARY_F(block, inloc, 0, coord, 0); + } else if (ctx->compiler->flat_bypass) { instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0); instr->cat6.type = TYPE_U32; instr->cat6.iim_val = 1; @@ -1342,7 +1344,6 @@ static void add_sysval_input_compmask(struct ir3_context *ctx, so->inputs[n].sysval = true; so->inputs[n].slot = slot; so->inputs[n].compmask = compmask; - so->inputs[n].interpolate = INTERP_MODE_FLAT; so->total_in++; } @@ -1471,6 +1472,9 @@ get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr) return ctx->frag_coord; } +static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr); +static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr); + static void emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) { @@ -1479,7 +1483,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) struct ir3_instruction * const *src; struct ir3_block *b = ctx->block; unsigned dest_components = nir_intrinsic_dest_components(intr); - int idx, comp; + int idx; if (info->has_dest) { dst = ir3_get_dst(ctx, &intr->dest, dest_components); @@ -1658,43 +1662,8 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) emit_intrinsic_barycentric(ctx, intr, dst); break; case nir_intrinsic_load_interpolated_input: - idx = nir_intrinsic_base(intr); - comp = nir_intrinsic_component(intr); - src = ir3_get_src(ctx, &intr->src[0]); - if (nir_src_is_const(intr->src[1])) { - struct ir3_instruction *coord = ir3_create_collect(ctx, src, 2); - idx += nir_src_as_uint(intr->src[1]); - for (int i = 0; i < dest_components; i++) { - unsigned inloc = idx * 4 + i + comp; - if (ctx->so->inputs[idx].bary && - !ctx->so->inputs[idx].use_ldlv) { - dst[i] = ir3_BARY_F(b, create_immed(b, inloc), 0, coord, 0); - } else { - /* for non-varyings use the pre-setup input, since - * that is easier than mapping things back to a - * nir_variable to figure out what it is. - */ - dst[i] = ctx->inputs[inloc]; - compile_assert(ctx, dst[i]); - } - } - } else { - ir3_context_error(ctx, "unhandled"); - } - break; case nir_intrinsic_load_input: - idx = nir_intrinsic_base(intr); - comp = nir_intrinsic_component(intr); - if (nir_src_is_const(intr->src[0])) { - idx += nir_src_as_uint(intr->src[0]); - for (int i = 0; i < dest_components; i++) { - unsigned n = idx * 4 + i + comp; - dst[i] = ctx->inputs[n]; - compile_assert(ctx, ctx->inputs[n]); - } - } else { - ir3_context_error(ctx, "unhandled"); - } + setup_input(ctx, intr); break; /* All SSBO intrinsics should have been lowered by 'lower_io_offsets' * pass and replaced by an ir3-specifc version that adds the @@ -1803,16 +1772,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) b = NULL; break; case nir_intrinsic_store_output: - idx = nir_intrinsic_base(intr); - comp = nir_intrinsic_component(intr); - compile_assert(ctx, nir_src_is_const(intr->src[1])); - idx += nir_src_as_uint(intr->src[1]); - - src = ir3_get_src(ctx, &intr->src[0]); - for (int i = 0; i < nir_intrinsic_src_components(intr, 0); i++) { - unsigned n = idx * 4 + i + comp; - ctx->outputs[n] = src[i]; - } + setup_output(ctx, intr); break; case nir_intrinsic_load_base_vertex: case nir_intrinsic_load_first_vertex: @@ -2949,92 +2909,53 @@ emit_function(struct ir3_context *ctx, nir_function_impl *impl) } static void -setup_input(struct ir3_context *ctx, nir_variable *in) +setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr) { struct ir3_shader_variant *so = ctx->so; - unsigned ncomp = glsl_get_components(in->type); - unsigned n = in->data.driver_location; - unsigned frac = in->data.location_frac; - unsigned slot = in->data.location; - unsigned compmask; + struct ir3_instruction *coord = NULL; - /* Inputs are loaded using ldlw or ldg for these stages. */ - if (ctx->so->type == MESA_SHADER_TESS_CTRL || - ctx->so->type == MESA_SHADER_TESS_EVAL || - ctx->so->type == MESA_SHADER_GEOMETRY) - return; + if (intr->intrinsic == nir_intrinsic_load_interpolated_input) + coord = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), 2); - /* skip unread inputs, we could end up with (for example), unsplit - * matrix/etc inputs in the case they are not read, so just silently - * skip these. - */ - if (ncomp > 4) - return; + compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0])); + + unsigned frac = nir_intrinsic_component(intr); + unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]); + unsigned ncomp = nir_intrinsic_dest_components(intr); + unsigned n = nir_intrinsic_base(intr) + offset; + unsigned slot = nir_intrinsic_io_semantics(intr).location + offset; + unsigned compmask; + + /* Inputs are loaded using ldlw or ldg for other stages. */ + compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT || + ctx->so->type == MESA_SHADER_VERTEX); if (ctx->so->type == MESA_SHADER_FRAGMENT) compmask = BITFIELD_MASK(ncomp) << frac; else compmask = BITFIELD_MASK(ncomp + frac); - /* remove any already set set components */ - compmask &= ~so->inputs[n].compmask; - if (!compmask) - return; + /* for a4xx+ rasterflat */ + if (so->inputs[n].rasterflat && ctx->so->key.rasterflat) + coord = NULL; + + so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask); so->inputs[n].slot = slot; so->inputs[n].compmask |= compmask; so->inputs_count = MAX2(so->inputs_count, n + 1); - so->inputs[n].interpolate = in->data.interpolation; + so->inputs[n].flat = !coord; if (ctx->so->type == MESA_SHADER_FRAGMENT) { + compile_assert(ctx, slot != VARYING_SLOT_POS); - /* if any varyings have 'sample' qualifer, that triggers us - * to run in per-sample mode: - */ - so->per_samp |= in->data.sample; + so->inputs[n].bary = true; for (int i = 0; i < ncomp; i++) { - struct ir3_instruction *instr = NULL; unsigned idx = (n * 4) + i + frac; - - if (!(compmask & (1 << (i + frac)))) - continue; - - if (slot == VARYING_SLOT_POS) { - ir3_context_error(ctx, "fragcoord should be a sysval!\n"); - } else { - /* detect the special case for front/back colors where - * we need to do flat vs smooth shading depending on - * rast state: - */ - if (in->data.interpolation == INTERP_MODE_NONE) { - switch (slot) { - case VARYING_SLOT_COL0: - case VARYING_SLOT_COL1: - case VARYING_SLOT_BFC0: - case VARYING_SLOT_BFC1: - so->inputs[n].rasterflat = true; - break; - default: - break; - } - } - - if (ctx->compiler->flat_bypass) { - if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) || - (so->inputs[n].rasterflat && ctx->so->key.rasterflat)) - so->inputs[n].use_ldlv = true; - } - - so->inputs[n].bary = true; - - instr = create_frag_input(ctx, so->inputs[n].use_ldlv, idx); - } - - compile_assert(ctx, idx < ctx->ninputs && !ctx->inputs[idx]); - ctx->inputs[idx] = instr; + ctx->last_dst[i] = create_frag_input(ctx, coord, idx); } - } else if (ctx->so->type == MESA_SHADER_VERTEX) { + } else { struct ir3_instruction *input = NULL; foreach_input (in, ctx->ir) { @@ -3067,10 +2988,11 @@ setup_input(struct ir3_context *ctx, nir_variable *in) ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1); } - } - if (so->inputs[n].bary || (ctx->so->type == MESA_SHADER_VERTEX)) { - so->total_in += util_bitcount(compmask); + for (int i = 0; i < ncomp; i++) { + unsigned idx = (n * 4) + i + frac; + ctx->last_dst[i] = ctx->inputs[idx]; + } } } @@ -3173,14 +3095,18 @@ pack_inlocs(struct ir3_context *ctx) } static void -setup_output(struct ir3_context *ctx, nir_variable *out) +setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr) { struct ir3_shader_variant *so = ctx->so; - unsigned slots = glsl_count_vec4_slots(out->type, false, false); - unsigned ncomp = glsl_get_components(glsl_without_array(out->type)); - unsigned n = out->data.driver_location; - unsigned frac = out->data.location_frac; - unsigned slot = out->data.location; + nir_io_semantics io = nir_intrinsic_io_semantics(intr); + + compile_assert(ctx, nir_src_is_const(intr->src[1])); + + unsigned offset = nir_src_as_uint(intr->src[1]); + unsigned n = nir_intrinsic_base(intr) + offset; + unsigned frac = nir_intrinsic_component(intr); + unsigned ncomp = nir_intrinsic_src_components(intr, 0); + unsigned slot = io.location + offset; if (ctx->so->type == MESA_SHADER_FRAGMENT) { switch (slot) { @@ -3197,7 +3123,7 @@ setup_output(struct ir3_context *ctx, nir_variable *out) so->writes_stencilref = true; break; default: - slot += out->data.index; /* For dual-src blend */ + slot += io.dual_source_blend_index; /* For dual-src blend */ if (slot >= FRAG_RESULT_DATA0) break; ir3_context_error(ctx, "unknown FS output name: %s\n", @@ -3236,41 +3162,41 @@ setup_output(struct ir3_context *ctx, nir_variable *out) _mesa_shader_stage_to_string(ctx->so->type), gl_varying_slot_name(slot)); } - } else if (ctx->so->type == MESA_SHADER_TESS_CTRL) { - /* output lowered to buffer writes. */ - return; } else { ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type); } - so->outputs_count = out->data.driver_location + slots; + so->outputs_count = MAX2(so->outputs_count, n + 1); compile_assert(ctx, so->outputs_count < ARRAY_SIZE(so->outputs)); - for (int i = 0; i < slots; i++) { - int slot_base = n + i; - so->outputs[slot_base].slot = slot + i; + so->outputs[n].slot = slot; - for (int i = 0; i < ncomp; i++) { - unsigned idx = (slot_base * 4) + i + frac; - compile_assert(ctx, idx < ctx->noutputs); + for (int i = 0; i < ncomp; i++) { + unsigned idx = (n * 4) + i + frac; + compile_assert(ctx, idx < ctx->noutputs); + ctx->outputs[idx] = create_immed(ctx->block, fui(0.0)); + } + + /* if varying packing doesn't happen, we could end up in a situation + * with "holes" in the output, and since the per-generation code that + * sets up varying linkage registers doesn't expect to have more than + * one varying per vec4 slot, pad the holes. + * + * Note that this should probably generate a performance warning of + * some sort. + */ + for (int i = 0; i < frac; i++) { + unsigned idx = (n * 4) + i; + if (!ctx->outputs[idx]) { ctx->outputs[idx] = create_immed(ctx->block, fui(0.0)); } + } - /* if varying packing doesn't happen, we could end up in a situation - * with "holes" in the output, and since the per-generation code that - * sets up varying linkage registers doesn't expect to have more than - * one varying per vec4 slot, pad the holes. - * - * Note that this should probably generate a performance warning of - * some sort. - */ - for (int i = 0; i < frac; i++) { - unsigned idx = (slot_base * 4) + i; - if (!ctx->outputs[idx]) { - ctx->outputs[idx] = create_immed(ctx->block, fui(0.0)); - } - } + struct ir3_instruction * const *src = ir3_get_src(ctx, &intr->src[0]); + for (int i = 0; i < ncomp; i++) { + unsigned idx = (n * 4) + i + frac; + ctx->outputs[idx] = src[i]; } } @@ -3279,6 +3205,35 @@ emit_instructions(struct ir3_context *ctx) { nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s); + /* some varying setup which can't be done in setup_input(): */ + if (ctx->so->type == MESA_SHADER_FRAGMENT) { + nir_foreach_shader_in_variable (var, ctx->s) { + /* if any varyings have 'sample' qualifer, that triggers us + * to run in per-sample mode: + */ + if (var->data.sample) + ctx->so->per_samp = true; + + /* set rasterflat flag for front/back color */ + if (var->data.interpolation == INTERP_MODE_NONE) { + switch (var->data.location) { + case VARYING_SLOT_COL0: + case VARYING_SLOT_COL1: + case VARYING_SLOT_BFC0: + case VARYING_SLOT_BFC1: + ctx->so->inputs[var->data.driver_location].rasterflat = true; + break; + default: + break; + } + } + } + } + + /* TODO: for GS/HS/DS, load_input isn't used. but ctx->s->num_inputs is non-zero + * likely the same for num_outputs in cases where store_output isn't used + */ + ctx->so->inputs_count = ctx->s->num_inputs; ctx->ninputs = ctx->s->num_inputs * 4; ctx->noutputs = ctx->s->num_outputs * 4; ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs); @@ -3303,11 +3258,6 @@ emit_instructions(struct ir3_context *ctx) ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3); } - /* Setup inputs: */ - nir_foreach_shader_in_variable (var, ctx->s) { - setup_input(ctx, var); - } - /* Defer add_sysval_input() stuff until after setup_inputs(), * because sysvals need to be appended after varyings: */ @@ -3351,11 +3301,6 @@ emit_instructions(struct ir3_context *ctx) break; } - /* Setup outputs: */ - nir_foreach_shader_out_variable (var, ctx->s) { - setup_output(ctx, var); - } - /* Find # of samplers. Just assume that we'll be reading from images.. if * it is write-only we don't have to count it, but after lowering derefs * is too late to compact indices for that. diff --git a/src/freedreno/ir3/ir3_parser.y b/src/freedreno/ir3/ir3_parser.y index e82035d59d1..945a3e0d1ce 100644 --- a/src/freedreno/ir3/ir3_parser.y +++ b/src/freedreno/ir3/ir3_parser.y @@ -178,7 +178,6 @@ static void add_sysval(unsigned reg, unsigned compmask, gl_system_value sysval) variant->inputs[n].sysval = true; variant->inputs[n].slot = sysval; variant->inputs[n].compmask = compmask; - variant->inputs[n].interpolate = INTERP_MODE_FLAT; variant->total_in++; } diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 53be9a6833d..db94e4f52c8 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -588,9 +588,8 @@ struct ir3_shader_variant { /* fragment shader specific: */ bool bary : 1; /* fetched varying (vs one loaded into reg) */ bool rasterflat : 1; /* special handling for emit->rasterflat */ - bool use_ldlv : 1; /* internal to ir3_compiler_nir */ bool half : 1; - enum glsl_interp_mode interpolate; + bool flat : 1; } inputs[32 + 2]; /* +POSITION +FACE */ /* sum of input components (scalar). For frag shaders, it only counts diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index 58e2db66bcd..658f1cceb02 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -1069,8 +1069,7 @@ tu6_vpc_varying_mode(const struct ir3_shader_variant *fs, *interp_mode |= INTERP_ONE << 6; shift += 2; } - } else if ((fs->inputs[index].interpolate == INTERP_MODE_FLAT) || - fs->inputs[index].rasterflat) { + } else if (fs->inputs[index].flat) { for (int i = 0; i < 4; i++) { if (compmask & (1 << i)) { *interp_mode |= INTERP_FLAT << shift; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index 891a52aa6de..c555f8ce44a 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -361,7 +361,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, uint32_t inloc = fp->inputs[j].inloc; - if ((fp->inputs[j].interpolate == INTERP_MODE_FLAT) || + if (fp->inputs[j].flat || (fp->inputs[j].rasterflat && emit->rasterflat)) { uint32_t loc = inloc; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index b5b13dc288e..79af5961485 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -465,7 +465,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, uint32_t inloc = s[FS].v->inputs[j].inloc; - if ((s[FS].v->inputs[j].interpolate == INTERP_MODE_FLAT) || + if (s[FS].v->inputs[j].flat || (s[FS].v->inputs[j].rasterflat && emit->rasterflat)) { uint32_t loc = inloc; diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.c b/src/gallium/drivers/freedreno/a5xx/fd5_program.c index 07aecff5a84..9ff6f3c6979 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_program.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.c @@ -611,7 +611,7 @@ fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, uint32_t inloc = s[FS].v->inputs[j].inloc; - if ((s[FS].v->inputs[j].interpolate == INTERP_MODE_FLAT) || + if (s[FS].v->inputs[j].flat || (s[FS].v->inputs[j].rasterflat && emit->rasterflat)) { uint32_t loc = inloc; diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c index bf8aadced4c..5b6c2ca17d2 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c @@ -940,7 +940,7 @@ emit_interp_state(struct fd_ringbuffer *ring, struct ir3_shader_variant *fs, uint32_t inloc = fs->inputs[j].inloc; - if ((fs->inputs[j].interpolate == INTERP_MODE_FLAT) || + if (fs->inputs[j].flat || (fs->inputs[j].rasterflat && rasterflat)) { uint32_t loc = inloc; -- 2.30.2