From b98955e128ca3470d89c925f58567f2369f8072a Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 15 Mar 2019 23:25:55 +0000 Subject: [PATCH] panfrost: Rewrite varying assembly There are two stages to varying assembly in the command stream: creating the varying buffers in the command stream, and creating the varying meta descriptors (also in the command stream) linked to the aforementioned buffers. The previous code for this was ad hoc and brittle, making some invalid assumptions causing unmaintainable workarounds to pile up across the driver (both compiler and command stream side). This patch completely rewrites the varying assembly code. There's a trivial performance penalty (we now memcpy the varying meta to the command stream on draw, rather than on compile). That said, the improvement in flexibility and clarity is well-worth it. The motivator for these changes was support for gl_PointCoord (and eventually point sprites for legacy GL), which was impossible to implement with the old varying assembly code. With the new refactor, it's super easy; support for gl_PointCoord is included with this patch. All in all, I'm quite happy with how this turned out. Signed-off-by: Alyssa Rosenzweig --- .../drivers/panfrost/include/panfrost-job.h | 7 + .../panfrost/midgard/midgard_compile.c | 52 +------ .../panfrost/midgard/midgard_compile.h | 2 + src/gallium/drivers/panfrost/pan_assemble.c | 144 +++++------------- src/gallium/drivers/panfrost/pan_context.c | 127 ++++++++++----- src/gallium/drivers/panfrost/pan_context.h | 26 +--- 6 files changed, 141 insertions(+), 217 deletions(-) diff --git a/src/gallium/drivers/panfrost/include/panfrost-job.h b/src/gallium/drivers/panfrost/include/panfrost-job.h index 85ef02d04e0..f0a4de73085 100644 --- a/src/gallium/drivers/panfrost/include/panfrost-job.h +++ b/src/gallium/drivers/panfrost/include/panfrost-job.h @@ -754,6 +754,13 @@ enum mali_attr_mode { MALI_ATTR_NPOT_DIVIDE = 4, }; +/* This magic "pseudo-address" is used as `elements` to implement + * gl_PointCoord. When read from a fragment shader, it generates a point + * coordinate per the OpenGL ES 2.0 specification. Flipped coordinate spaces + * require an affine transformation in the shader. */ + +#define MALI_VARYING_POINT_COORD (0x60) + union mali_attr { /* This is used for actual attributes. */ struct { diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.c b/src/gallium/drivers/panfrost/midgard/midgard_compile.c index 7f3b6997ca0..87c504061aa 100644 --- a/src/gallium/drivers/panfrost/midgard/midgard_compile.c +++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.c @@ -431,9 +431,6 @@ typedef struct compiler_context { struct hash_table_u64 *uniform_nir_to_mdg; int uniform_count; - struct hash_table_u64 *varying_nir_to_mdg; - int varying_count; - /* Just the count of the max register used. Higher count => higher * register pressure */ int work_registers; @@ -1398,17 +1395,6 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) * guarantee correctness when considering some * (common) edge cases XXX: FIXME */ - /* Look up how it was actually laid out */ - - void *entry = _mesa_hash_table_u64_search(ctx->varying_nir_to_mdg, offset + 1); - - if (!entry) { - DBG("WARNING: skipping varying\n"); - break; - } - - offset = (uintptr_t) (entry) - 1; - /* If this varying corresponds to a constant (why?!), * emit that now since it won't get picked up by * hoisting (since there is no corresponding move @@ -3455,42 +3441,16 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl } } - if (ctx->stage == MESA_SHADER_VERTEX) { - ctx->varying_nir_to_mdg = _mesa_hash_table_u64_create(NULL); - - /* First, collect the special varyings */ - nir_foreach_variable(var, &nir->outputs) { - if (var->data.location == VARYING_SLOT_POS) { - /* Set position first, always. It takes up two - * spots, the latter one is de facto unused (at - * least from the shader's perspective), we - * just need to skip over the spot*/ - - _mesa_hash_table_u64_insert(ctx->varying_nir_to_mdg, var->data.driver_location + 1, (void *) ((uintptr_t) (0 + 1))); - ctx->varying_count = MAX2(ctx->varying_count, 2); - } else if (var->data.location == VARYING_SLOT_PSIZ) { - /* Set point size second (third, see above) */ - _mesa_hash_table_u64_insert(ctx->varying_nir_to_mdg, var->data.driver_location + 1, (void *) ((uintptr_t) (2 + 1))); - ctx->varying_count = MAX2(ctx->varying_count, 3); - - program->writes_point_size = true; - } - } + /* Record the varying mapping for the command stream's bookkeeping */ - /* Now, collect normal varyings */ - - nir_foreach_variable(var, &nir->outputs) { - if (var->data.location == VARYING_SLOT_POS || var->data.location == VARYING_SLOT_PSIZ) continue; + struct exec_list *varyings = + ctx->stage == MESA_SHADER_VERTEX ? &nir->outputs : &nir->inputs; - for (int col = 0; col < glsl_get_matrix_columns(var->type); ++col) { - int id = ctx->varying_count++; - _mesa_hash_table_u64_insert(ctx->varying_nir_to_mdg, var->data.driver_location + col + 1, (void *) ((uintptr_t) (id + 1))); - } - } + nir_foreach_variable(var, varyings) { + unsigned loc = var->data.driver_location; + program->varyings[loc] = var->data.location; } - - /* Lower vars -- not I/O -- before epilogue */ NIR_PASS_V(nir, nir_lower_var_copies); diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.h b/src/gallium/drivers/panfrost/midgard/midgard_compile.h index 6225d041c07..a6520091a21 100644 --- a/src/gallium/drivers/panfrost/midgard/midgard_compile.h +++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.h @@ -35,6 +35,8 @@ typedef struct { int attribute_count; int varying_count; + unsigned varyings[32]; + /* Boolean properties of the program */ bool can_discard; bool writes_point_size; diff --git a/src/gallium/drivers/panfrost/pan_assemble.c b/src/gallium/drivers/panfrost/pan_assemble.c index d1ecfd4ac03..c9abe9e6687 100644 --- a/src/gallium/drivers/panfrost/pan_assemble.c +++ b/src/gallium/drivers/panfrost/pan_assemble.c @@ -88,125 +88,55 @@ panfrost_shader_compile(struct panfrost_context *ctx, struct mali_shader_meta *m state->can_discard = program.can_discard; state->writes_point_size = program.writes_point_size; + state->reads_point_coord = false; /* Separate as primary uniform count is truncated */ state->uniform_count = program.uniform_count; - /* gl_Position eats up an extra spot */ - if (type == JOB_TYPE_VERTEX) - meta->varying_count += 1; - - /* Note: gl_FragCoord does -not- eat an extra spot; it will be included - * in our count if we need it */ - meta->midgard1.unknown2 = 8; /* XXX */ - /* Varyings are known only through the shader. We choose to upload this - * information with the vertex shader, though the choice is perhaps - * arbitrary */ - - if (type == JOB_TYPE_VERTEX) { - struct panfrost_varyings *varyings = &state->varyings; - - /* Measured in vec4 words. Don't include gl_Position */ - int varying_count = program.varying_count; - - /* Setup two buffers, one for position, the other for normal - * varyings, as seen in traces. TODO: Are there other - * configurations we might use? */ + unsigned default_vec1_swizzle = panfrost_get_default_swizzle(1); + unsigned default_vec2_swizzle = panfrost_get_default_swizzle(2); + unsigned default_vec4_swizzle = panfrost_get_default_swizzle(4); - varyings->varying_buffer_count = 2; + /* Iterate the varyings and emit the corresponding descriptor */ + unsigned general_purpose_count = 0; - /* mediump vec4s sequentially */ - varyings->varyings_stride[0] = (2 * sizeof(float)) * varying_count; + for (unsigned i = 0; i < program.varying_count; ++i) { + unsigned location = program.varyings[i]; - /* highp gl_Position */ - varyings->varyings_stride[1] = 4 * sizeof(float); - - /* mediump gl_PointSize */ - if (program.writes_point_size) { - ++varyings->varying_buffer_count; - varyings->varyings_stride[2] = 2; /* sizeof(fp16) */ - } - - /* Setup gl_Position, its weirdo analogue, and gl_PointSize (optionally) */ - unsigned default_vec1_swizzle = panfrost_get_default_swizzle(1); - unsigned default_vec4_swizzle = panfrost_get_default_swizzle(4); - - struct mali_attr_meta vertex_special_varyings[] = { - { - .index = 1, - .format = MALI_VARYING_POS, - - .swizzle = default_vec4_swizzle, - .unknown1 = 0x2, - }, - { - .index = 1, - .format = MALI_RGBA16F, - - /* TODO: Wat? yyyy swizzle? */ - .swizzle = 0x249, - .unknown1 = 0x0, - }, - { - .index = 2, - .format = MALI_R16F, - .swizzle = default_vec1_swizzle, - .unknown1 = 0x2 - } + /* Default to a vec4 varying */ + struct mali_attr_meta v = { + .format = MALI_RGBA16F, + .swizzle = default_vec4_swizzle, + .unknown1 = 0x2, }; - /* How many special vertex varyings are actually required? */ - int vertex_special_count = 2 + (program.writes_point_size ? 1 : 0); - - /* Setup actual varyings. XXX: Don't assume vec4 */ - - struct mali_attr_meta mali_varyings[PIPE_MAX_ATTRIBS]; - - for (int i = 0; i < varying_count; ++i) { - struct mali_attr_meta vec4_varying_meta = { - .index = 0, - .format = MALI_RGBA16F, - .swizzle = default_vec4_swizzle, - .unknown1 = 0x2, - - /* Set offset to keep everything back-to-back in - * the same buffer */ - .src_offset = 8 * i, - }; - - mali_varyings[i] = vec4_varying_meta; + /* Check for special cases, otherwise assume general varying */ + + if (location == VARYING_SLOT_POS) { + v.index = 1; + v.format = MALI_VARYING_POS; + } else if (location == VARYING_SLOT_PSIZ) { + v.index = 2; + v.format = MALI_R16F; + v.swizzle = default_vec1_swizzle; + + state->writes_point_size = true; + } else if (location == VARYING_SLOT_PNTC) { + v.index = 3; + v.format = MALI_RG16F; + v.swizzle = default_vec2_swizzle; + + state->reads_point_coord = true; + } else { + v.index = 0; + v.src_offset = 8 * (general_purpose_count++); } - /* We don't count the weirdo gl_Position in our varying count */ - varyings->varying_count = varying_count - 1; - - /* In this context, position_meta represents the implicit - * gl_FragCoord varying. So, upload all the varyings */ - - unsigned varyings_size = sizeof(struct mali_attr_meta) * varyings->varying_count; - unsigned vertex_special_size = sizeof(struct mali_attr_meta) * vertex_special_count; - unsigned vertex_size = vertex_special_size + varyings_size; - unsigned fragment_size = varyings_size + sizeof(struct mali_attr_meta); - - struct panfrost_transfer transfer = panfrost_allocate_chunk(ctx, vertex_size + fragment_size, HEAP_DESCRIPTOR); - - /* Copy varyings in the follow order: - * - Position 1, 2 - * - Varyings 1, 2, ..., n - * - Varyings 1, 2, ..., n (duplicate) - * - Position 1 - */ - - memcpy(transfer.cpu, vertex_special_varyings, vertex_special_size); - memcpy(transfer.cpu + vertex_special_size, mali_varyings, varyings_size); - memcpy(transfer.cpu + vertex_size, mali_varyings, varyings_size); - memcpy(transfer.cpu + vertex_size + varyings_size, &vertex_special_varyings[0], sizeof(struct mali_attr_meta)); - - /* Point to the descriptor */ - varyings->varyings_buffer_cpu = transfer.cpu; - varyings->varyings_descriptor = transfer.gpu; - varyings->varyings_descriptor_fragment = transfer.gpu + vertex_size; + state->varyings[i] = v; } + + /* Set the stride for the general purpose fp16 vec4 varyings */ + state->general_varying_stride = (2 * 4) * general_purpose_count; } diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index b6cf5302cae..10eb6795bae 100644 --- a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -647,6 +647,90 @@ panfrost_set_value_job(struct panfrost_context *ctx) ctx->set_value_job = transfer.gpu; } +static mali_ptr +panfrost_emit_varyings( + struct panfrost_context *ctx, + union mali_attr *slot, + unsigned stride, + unsigned count) +{ + mali_ptr varying_address = ctx->varying_mem.gpu + ctx->varying_height; + + /* Fill out the descriptor */ + slot->elements = varying_address | MALI_ATTR_LINEAR; + slot->stride = stride; + slot->size = stride * count; + + ctx->varying_height += ALIGN(slot->size, 64); + assert(ctx->varying_height < ctx->varying_mem.size); + + return varying_address; +} + +static void +panfrost_emit_point_coord(union mali_attr *slot) +{ + slot->elements = MALI_VARYING_POINT_COORD | MALI_ATTR_LINEAR; + slot->stride = slot->size = 0; +} + +static void +panfrost_emit_varying_descriptor( + struct panfrost_context *ctx, + unsigned invocation_count) +{ + /* Load the shaders */ + + struct panfrost_shader_state *vs = &ctx->vs->variants[ctx->vs->active_variant]; + struct panfrost_shader_state *fs = &ctx->fs->variants[ctx->fs->active_variant]; + + /* Allocate the varying descriptor */ + + size_t vs_size = sizeof(struct mali_attr_meta) * vs->tripipe->varying_count; + size_t fs_size = sizeof(struct mali_attr_meta) * fs->tripipe->varying_count; + + struct panfrost_transfer trans = panfrost_allocate_transient(ctx, + vs_size + fs_size); + + memcpy(trans.cpu, vs->varyings, vs_size); + memcpy(trans.cpu + vs_size, fs->varyings, fs_size); + + ctx->payload_vertex.postfix.varying_meta = trans.gpu; + ctx->payload_tiler.postfix.varying_meta = trans.gpu + vs_size; + + /* Buffer indices must be in this order per our convention */ + union mali_attr varyings[PIPE_MAX_ATTRIBS]; + unsigned idx = 0; + + /* General varyings -- use the VS's, since those are more likely to be + * accurate on desktop */ + + panfrost_emit_varyings(ctx, &varyings[idx++], + vs->general_varying_stride, invocation_count); + + /* fp32 vec4 gl_Position */ + ctx->payload_tiler.postfix.position_varying = + panfrost_emit_varyings(ctx, &varyings[idx++], + sizeof(float) * 4, invocation_count); + + + if (vs->writes_point_size || fs->reads_point_coord) { + /* fp16 vec1 gl_PointSize */ + ctx->payload_tiler.primitive_size.pointer = + panfrost_emit_varyings(ctx, &varyings[idx++], + 2, invocation_count); + } + + if (fs->reads_point_coord) { + /* Special descriptor */ + panfrost_emit_point_coord(&varyings[idx++]); + } + + mali_ptr varyings_p = panfrost_upload_transient(ctx, &varyings, idx * sizeof(union mali_attr)); + ctx->payload_vertex.postfix.varyings = varyings_p; + ctx->payload_tiler.postfix.varyings = varyings_p; +} + /* Emits attributes and varying descriptors, which should be called every draw, * excepting some obscure circumstances */ @@ -655,7 +739,6 @@ panfrost_emit_vertex_data(struct panfrost_context *ctx) { /* TODO: Only update the dirtied buffers */ union mali_attr attrs[PIPE_MAX_ATTRIBS]; - union mali_attr varyings[PIPE_MAX_ATTRIBS]; unsigned invocation_count = MALI_NEGATIVE(ctx->payload_tiler.prefix.invocation_count); @@ -698,39 +781,9 @@ panfrost_emit_vertex_data(struct panfrost_context *ctx) } } - struct panfrost_varyings *vars = &ctx->vs->variants[ctx->vs->active_variant].varyings; - - for (int i = 0; i < vars->varying_buffer_count; ++i) { - mali_ptr varying_address = ctx->varying_mem.gpu + ctx->varying_height; - - varyings[i].elements = varying_address | 1; - varyings[i].stride = vars->varyings_stride[i]; - varyings[i].size = vars->varyings_stride[i] * invocation_count; - - /* If this varying has to be linked somewhere, do it now. See - * pan_assemble.c for the indices. TODO: Use a more generic - * linking interface */ - - if (i == 1) { - /* gl_Position */ - ctx->payload_tiler.postfix.position_varying = varying_address; - } else if (i == 2) { - /* gl_PointSize */ - ctx->payload_tiler.primitive_size.pointer = varying_address; - } - - /* Varyings appear to need 64-byte alignment */ - ctx->varying_height += ALIGN(varyings[i].size, 64); - - /* Ensure that we fit */ - assert(ctx->varying_height < ctx->varying_mem.size); - } - ctx->payload_vertex.postfix.attributes = panfrost_upload_transient(ctx, attrs, ctx->vertex_buffer_count * sizeof(union mali_attr)); - mali_ptr varyings_p = panfrost_upload_transient(ctx, &varyings, vars->varying_buffer_count * sizeof(union mali_attr)); - ctx->payload_vertex.postfix.varyings = varyings_p; - ctx->payload_tiler.postfix.varyings = varyings_p; + panfrost_emit_varying_descriptor(ctx, invocation_count); } /* Go through dirty flags and actualise them in the cmdstream. */ @@ -773,6 +826,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data) struct panfrost_shader_state *vs = &ctx->vs->variants[ctx->vs->active_variant]; /* Late shader descriptor assignments */ + vs->tripipe->texture_count = ctx->sampler_view_count[PIPE_SHADER_VERTEX]; vs->tripipe->sampler_count = ctx->sampler_count[PIPE_SHADER_VERTEX]; @@ -780,15 +834,6 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data) vs->tripipe->midgard1.unknown1 = 0x2201; ctx->payload_vertex.postfix._shader_upper = vs->tripipe_gpu >> 4; - - /* Varying descriptor is tied to the vertex shader. Also the - * fragment shader, I suppose, but it's generated with the - * vertex shader so */ - - struct panfrost_varyings *varyings = &ctx->vs->variants[ctx->vs->active_variant].varyings; - - ctx->payload_vertex.postfix.varying_meta = varyings->varyings_descriptor; - ctx->payload_tiler.postfix.varying_meta = varyings->varyings_descriptor_fragment; } if (ctx->dirty & (PAN_DIRTY_RASTERIZER | PAN_DIRTY_VS)) { diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h index 128b9b27c55..7947169f83b 100644 --- a/src/gallium/drivers/panfrost/pan_context.h +++ b/src/gallium/drivers/panfrost/pan_context.h @@ -241,27 +241,6 @@ struct panfrost_blend_state { int blend_work_count; }; -/* Internal varyings descriptor */ -struct panfrost_varyings { - /* Varyings information: stride of each chunk of memory used for - * varyings (similar structure with attributes). Count is just the - * number of vec4's. Buffer count is the number of varying chunks (<= - * count). Height is used to calculate gl_Position's position ("it's - * not a pun, Alyssa!"). Vertex-only varyings == descriptor for - * gl_Position and something else apparently occupying the same space. - * Varyings == main varyings descriptors following typical mali_attr - * conventions. */ - - unsigned varyings_stride[MAX_VARYINGS]; - unsigned varying_count; - unsigned varying_buffer_count; - - /* Map of the actual varyings buffer */ - uint8_t *varyings_buffer_cpu; - mali_ptr varyings_descriptor; - mali_ptr varyings_descriptor_fragment; -}; - /* Variants bundle together to form the backing CSO, bundling multiple * shaders with varying emulated features baked in (alpha test * parameters, etc) */ @@ -280,9 +259,10 @@ struct panfrost_shader_state { int uniform_count; bool can_discard; bool writes_point_size; + bool reads_point_coord; - /* Valid for vertex shaders only due to when this is calculated */ - struct panfrost_varyings varyings; + unsigned general_varying_stride; + struct mali_attr_meta varyings[PIPE_MAX_ATTRIBS]; /* Information on this particular shader variant */ struct pipe_alpha_state alpha_state; -- 2.30.2