From 8d74749f812e64968d37266061293e204fea252c Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Thu, 27 Jun 2019 14:13:10 -0700 Subject: [PATCH] panfrost: Implement instanced rendering We implement GLES3.0 instanced rendering with full support for instanced arrays (via instance divisors). To do so, we use the new invocation helpers to invoke a triplet of (1, vertex_count, instance_count), rather than simply (1, vertex_count, 1). We rewrite the attribute handling code into a new pan_instancing.c file which handles both the simple LINEAR case for non-instanced as well as each of the new instancing cases: MODULO (for per-vertex attributes), POT and NPOT divisors. As a side effect, we rework how vertex buffers are handled, duplicating them to be 1:1 with vertex descriptors to simplify instancing code paths dramatically. This might be a performance regression, but this remains to be seen; if so, we can always deduplicate later with some added logic in pan_instancing.c Signed-off-by: Alyssa Rosenzweig --- .../drivers/panfrost/include/panfrost-job.h | 16 +- src/gallium/drivers/panfrost/meson.build | 1 + .../panfrost/midgard/midgard_compile.c | 4 +- src/gallium/drivers/panfrost/pan_context.c | 123 ++++--- src/gallium/drivers/panfrost/pan_context.h | 26 ++ src/gallium/drivers/panfrost/pan_instancing.c | 341 ++++++++++++++++++ src/gallium/drivers/panfrost/pan_invocation.c | 1 + src/gallium/drivers/panfrost/pan_screen.c | 4 + .../drivers/panfrost/pandecode/decode.c | 112 +++++- 9 files changed, 568 insertions(+), 60 deletions(-) create mode 100644 src/gallium/drivers/panfrost/pan_instancing.c diff --git a/src/gallium/drivers/panfrost/include/panfrost-job.h b/src/gallium/drivers/panfrost/include/panfrost-job.h index 444e5ad9e69..5c93f319b65 100644 --- a/src/gallium/drivers/panfrost/include/panfrost-job.h +++ b/src/gallium/drivers/panfrost/include/panfrost-job.h @@ -834,8 +834,9 @@ struct mali_attr_meta { /* Always observed to be zero at the moment */ unsigned unknown3 : 
2; - /* When packing multiple attributes in a buffer, offset addresses by this value */ - uint32_t src_offset; + /* When packing multiple attributes in a buffer, offset addresses by + * this value. Obscurely, this is signed. */ + int32_t src_offset; } __attribute__((packed)); enum mali_fbd_type { @@ -1061,7 +1062,16 @@ struct midgard_payload_vertex_tiler { u32 zero3; #endif - u32 gl_enables; // 0x5 + u16 gl_enables; // 0x5 + + /* Both zero for non-instanced draws. For instanced draws, a + * decomposition of padded_num_vertices. See the comments about the + * corresponding fields in mali_attr for context. */ + + unsigned instance_shift : 5; + unsigned instance_odd : 3; + + u8 zero4; /* Offset for first vertex in buffer */ u32 draw_start; diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build index 80cfee794db..b69b41bfd90 100644 --- a/src/gallium/drivers/panfrost/meson.build +++ b/src/gallium/drivers/panfrost/meson.build @@ -58,6 +58,7 @@ files_panfrost = files( 'pan_pretty_print.c', 'pan_fragment.c', 'pan_invocation.c', + 'pan_instancing.c', 'pan_scoreboard.c', 'pan_sfbd.c', 'pan_mfbd.c', diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.c b/src/gallium/drivers/panfrost/midgard/midgard_compile.c index 4a399293af0..5559aa44454 100644 --- a/src/gallium/drivers/panfrost/midgard/midgard_compile.c +++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.c @@ -1255,7 +1255,9 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; /* Get the base type of the intrinsic */ - nir_alu_type t = nir_intrinsic_type(instr); + /* TODO: Infer type? Does it matter? */ + nir_alu_type t = + is_ubo ? 
nir_type_uint : nir_intrinsic_type(instr); t = nir_alu_type_get_base_type(t); if (!is_ubo) { diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index 871b168040c..88e70c97881 100644 --- a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -552,7 +552,7 @@ panfrost_emit_point_coord(union mali_attr *slot) static void panfrost_emit_varying_descriptor( struct panfrost_context *ctx, - unsigned invocation_count) + unsigned vertex_count) { /* Load the shaders */ @@ -638,19 +638,19 @@ panfrost_emit_varying_descriptor( unsigned idx = 0; panfrost_emit_varyings(ctx, &varyings[idx++], num_gen_varyings * 16, - invocation_count); + vertex_count); /* fp32 vec4 gl_Position */ ctx->payload_tiler.postfix.position_varying = panfrost_emit_varyings(ctx, &varyings[idx++], - sizeof(float) * 4, invocation_count); + sizeof(float) * 4, vertex_count); if (vs->writes_point_size || fs->reads_point_coord) { /* fp16 vec1 gl_PointSize */ ctx->payload_tiler.primitive_size.pointer = panfrost_emit_varyings(ctx, &varyings[idx++], - 2, invocation_count); + 2, vertex_count); } if (fs->reads_point_coord) { @@ -663,7 +663,7 @@ panfrost_emit_varying_descriptor( ctx->payload_tiler.postfix.varyings = varyings_p; } -static mali_ptr +mali_ptr panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i) { struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[i]; @@ -672,48 +672,6 @@ panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i) return rsrc->bo->gpu + buf->buffer_offset; } -/* Emits attributes and varying descriptors, which should be called every draw, - * excepting some obscure circumstances */ - -static void -panfrost_emit_vertex_data(struct panfrost_context *ctx, struct panfrost_job *job) -{ - /* Staged mali_attr, and index into them. 
i =/= k, depending on the - * vertex buffer mask */ - union mali_attr attrs[PIPE_MAX_ATTRIBS]; - unsigned k = 0; - - unsigned invocation_count = MALI_NEGATIVE(ctx->payload_tiler.prefix.invocation_count); - - for (int i = 0; i < ARRAY_SIZE(ctx->vertex_buffers); ++i) { - if (!(ctx->vb_mask & (1 << i))) continue; - - struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[i]; - struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource); - - if (!rsrc) continue; - - /* Align to 64 bytes by masking off the lower bits. This - * will be adjusted back when we fixup the src_offset in - * mali_attr_meta */ - - mali_ptr addr = panfrost_vertex_buffer_address(ctx, i) & ~63; - - /* Offset vertex count by draw_start to make sure we upload enough */ - attrs[k].stride = buf->stride; - attrs[k].size = rsrc->base.width0; - - panfrost_job_add_bo(job, rsrc->bo); - attrs[k].elements = addr | MALI_ATTR_LINEAR; - - ++k; - } - - ctx->payload_vertex.postfix.attributes = panfrost_upload_transient(ctx, attrs, k * sizeof(union mali_attr)); - - panfrost_emit_varying_descriptor(ctx, invocation_count); -} - static bool panfrost_writes_point_size(struct panfrost_context *ctx) { @@ -759,12 +717,24 @@ panfrost_stage_attributes(struct panfrost_context *ctx) * QED. 
*/ + unsigned start = ctx->payload_vertex.draw_start; + for (unsigned i = 0; i < so->num_elements; ++i) { unsigned vbi = so->pipe[i].vertex_buffer_index; + struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; mali_ptr addr = panfrost_vertex_buffer_address(ctx, vbi); /* Adjust by the masked off bits of the offset */ target[i].src_offset += (addr & 63); + + /* Also, somewhat obscurely per-instance data needs to be + * offset in response to a delayed start in an indexed draw */ + + if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start) { + target[i].src_offset -= buf->stride * start; + } + + } ctx->payload_vertex.postfix.attribute_meta = transfer.gpu; @@ -1028,7 +998,11 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data) struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); if (with_vertex_data) { - panfrost_emit_vertex_data(ctx, job); + panfrost_emit_vertex_data(job); + + /* Varyings emitted for -all- geometry */ + unsigned total_count = ctx->padded_count * ctx->instance_count; + panfrost_emit_varying_descriptor(ctx, total_count); } bool msaa = ctx->rasterizer->base.multisample; @@ -1580,9 +1554,11 @@ panfrost_get_index_buffer_mapped(struct panfrost_context *ctx, const struct pipe struct panfrost_resource *rsrc = (struct panfrost_resource *) (info->index.resource); off_t offset = info->start * info->index_size; + struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); if (!info->has_user_indices) { /* Only resources can be directly mapped */ + panfrost_job_add_bo(batch, rsrc->bo); return rsrc->bo->gpu + offset; } else { /* Otherwise, we need to upload to transient memory */ @@ -1657,6 +1633,7 @@ panfrost_draw_vbo( ctx->payload_tiler.prefix.draw_mode = g2m_draw_mode(mode); ctx->vertex_count = info->count; + ctx->instance_count = info->instance_count; /* For non-indexed draws, they're the same */ unsigned vertex_count = ctx->vertex_count; @@ -1673,9 +1650,20 @@ panfrost_draw_vbo( /* For higher amounts of vertices 
(greater than what fits in a 16-bit * short), the other value is needed, otherwise there will be bizarre - * rendering artefacts. It's not clear what these values mean yet. */ + * rendering artefacts. It's not clear what these values mean yet. This + * change is also needed for instancing and sometimes points (perhaps + * related to dynamically setting gl_PointSize) */ + + bool is_points = mode == PIPE_PRIM_POINTS; + bool many_verts = ctx->vertex_count > 0xFFFF; + bool instanced = ctx->instance_count > 1; - draw_flags |= (mode == PIPE_PRIM_POINTS || ctx->vertex_count > 65535) ? 0x3000 : 0x18000; + draw_flags |= (is_points || many_verts || instanced) ? 0x3000 : 0x18000; + + /* This doesn't make much sense */ + if (mode == PIPE_PRIM_LINE_STRIP) { + draw_flags |= 0x800; + } if (info->index_size) { /* Calculate the min/max index used so we can figure out how @@ -1721,11 +1709,42 @@ panfrost_draw_vbo( panfrost_pack_work_groups_fused( &ctx->payload_vertex.prefix, &ctx->payload_tiler.prefix, - 1, vertex_count, 1, + 1, vertex_count, info->instance_count, 1, 1, 1); ctx->payload_tiler.prefix.unknown_draw = draw_flags; + /* Encode the padded vertex count */ + + if (info->instance_count > 1) { + /* Triangles have non-even vertex counts so they change how + * padding works internally */ + + bool is_triangle = + mode == PIPE_PRIM_TRIANGLES || + mode == PIPE_PRIM_TRIANGLE_STRIP || + mode == PIPE_PRIM_TRIANGLE_FAN; + + struct pan_shift_odd so = + panfrost_padded_vertex_count(vertex_count, !is_triangle); + + ctx->payload_vertex.instance_shift = so.shift; + ctx->payload_tiler.instance_shift = so.shift; + + ctx->payload_vertex.instance_odd = so.odd; + ctx->payload_tiler.instance_odd = so.odd; + + ctx->padded_count = pan_expand_shift_odd(so); + } else { + ctx->padded_count = ctx->vertex_count; + + /* Reset instancing state */ + ctx->payload_vertex.instance_shift = 0; + ctx->payload_vertex.instance_odd = 0; + ctx->payload_tiler.instance_shift = 0; + ctx->payload_tiler.instance_odd = 0; 
+ } + /* Fire off the draw itself */ panfrost_queue_draw(ctx); } @@ -1807,7 +1826,7 @@ panfrost_create_vertex_elements_state( panfrost_allocate_chunk(pan_context(pctx), 0, HEAP_DESCRIPTOR); for (int i = 0; i < num_elements; ++i) { - so->hw[i].index = elements[i].vertex_buffer_index; + so->hw[i].index = i; enum pipe_format fmt = elements[i].src_format; const struct util_format_description *desc = util_format_description(fmt); diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h index f83083be0fc..a913c8581ef 100644 --- a/src/gallium/drivers/panfrost/pan_context.h +++ b/src/gallium/drivers/panfrost/pan_context.h @@ -152,6 +152,11 @@ struct panfrost_context { int dirty; unsigned vertex_count; + unsigned instance_count; + + /* If instancing is enabled, vertex count padded for instance; if + * it is disabled, just equal to plain vertex count */ + unsigned padded_count; union mali_attr attributes[PIPE_MAX_ATTRIBS]; @@ -364,6 +369,27 @@ panfrost_pack_work_groups_fused( unsigned size_y, unsigned size_z); +/* Instancing */ + +mali_ptr +panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i); + +void +panfrost_emit_vertex_data(struct panfrost_job *batch); + +struct pan_shift_odd { + unsigned shift; + unsigned odd; +}; + +struct pan_shift_odd +panfrost_padded_vertex_count( + unsigned vertex_count, + bool primitive_pot); + + +unsigned +pan_expand_shift_odd(struct pan_shift_odd o); #endif diff --git a/src/gallium/drivers/panfrost/pan_instancing.c b/src/gallium/drivers/panfrost/pan_instancing.c new file mode 100644 index 00000000000..2d38b75a200 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_instancing.c @@ -0,0 +1,341 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig + * Copyright (C) 2019 Collabora, Ltd. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "pan_context.h" + +/* See mali_job for notes on how this works. But basically, for small vertex + * counts, we have a lookup table, and for large vertex counts, we look at the + * high bits as a heuristic. This has to match exactly how the hardware + * calculates this (which is why the algorithm is so weird) or else instancing + * will break. 
*/ + +/* Given an odd number (of the form 2k + 1), compute k */ +#define ODD(odd) ((odd - 1) >> 1) + +/* Given the shift/odd pair, recover the original padded integer */ + +unsigned +pan_expand_shift_odd(struct pan_shift_odd o) +{ + unsigned odd = 2*o.odd + 1; + unsigned shift = 1 << o.shift; + return odd * shift; +} + +static inline struct pan_shift_odd +pan_factored(unsigned pot, unsigned odd) +{ + struct pan_shift_odd out; + + assert(util_is_power_of_two_or_zero(pot)); + assert(odd & 1); + + /* Odd is of the form (2k + 1) = (k << 1) + 1 = (k << 1) | 1. + * + * So (odd >> 1) = ((k << 1) | 1) >> 1 = ((k << 1) >> 1) | (1 >> 1) + * = k | 0 = k */ + + out.odd = (odd >> 1); + + /* POT is the form (1 << shift) */ + out.shift = __builtin_ctz(pot); + + return out; +} + + +/* For small vertices. Second argument is whether the primitive takes a + * power-of-two argument, which determines how rounding works. True for POINTS + * and LINES, false for TRIANGLES. Presumably true for QUADS but you'd be crazy + * to try instanced quads on ES class hardware <3 */ + +static struct { + unsigned pot; + unsigned odd; +} small_lut[] = { + { 0, 1 }, + { 1, 1 }, + { 2, 1 }, + { 1, 3 }, + { 4, 1 }, + { 1, 5 }, + { 2, 3 }, + { 1, 7 }, + { 8, 1 }, + { 1, 9 }, + { 2, 5 }, + { 4, 3 }, /* 11 */ + { 4, 3 }, + { 2, 7 }, /* 13 */ + { 2, 7 }, + { 16, 1 }, /* 15 */ + { 16, 1 }, + { 2, 9 }, + { 4, 5 }, /* 20 */ + { 4, 5 } +}; + +static struct pan_shift_odd +panfrost_small_padded_vertex_count(unsigned idx) +{ + return pan_factored( + small_lut[idx].pot, + small_lut[idx].odd); +} + +static struct pan_shift_odd +panfrost_large_padded_vertex_count(uint32_t vertex_count) +{ + struct pan_shift_odd out = { 0 }; + + /* First, we have to find the highest set one */ + unsigned highest = 32 - __builtin_clz(vertex_count); + + /* Using that, we mask out the highest 4-bits */ + unsigned n = highest - 4; + unsigned nibble = (vertex_count >> n) & 0xF; + + /* Great, we have the nibble. 
Now we can just try possibilities. Note + * that we don't care about the bottom most bit in most cases, and we + * know the top bit must be 1 */ + + unsigned middle_two = (nibble >> 1) & 0x3; + + switch (middle_two) { + case 0b00: + if (nibble & 1) + return pan_factored(1 << n, 9); + else + return pan_factored(1 << (n + 1), 5); + case 0b01: + return pan_factored(1 << (n + 2), 3); + case 0b10: + return pan_factored(1 << (n + 1), 7); + case 0b11: + return pan_factored(1 << (n + 4), 1); + default: + unreachable("Invalid two bits"); + } + + return out; +} + +struct pan_shift_odd +panfrost_padded_vertex_count( + unsigned vertex_count, + bool pot) +{ + assert(vertex_count > 0); + + if (vertex_count < 20) { + /* Add an off-by-one if it won't align naturally (quirk of the hardware) */ + //if (!pot) + // vertex_count++; + + return panfrost_small_padded_vertex_count(vertex_count); + } else + return panfrost_large_padded_vertex_count(vertex_count); +} + +/* The much, much more irritating case -- instancing is enabled. See + * panfrost_job.h for notes on how this works */ + +static unsigned +panfrost_vertex_instanced( + struct panfrost_job *batch, + struct panfrost_resource *rsrc, + unsigned divisor, + union mali_attr *attrs, + mali_ptr addr, + unsigned vertex_count, + unsigned instance_count) +{ + /* First, grab the padded vertex count */ + + struct pan_shift_odd o = { + .shift = batch->ctx->payload_tiler.instance_shift, + .odd = batch->ctx->payload_tiler.instance_odd, + }; + + unsigned padded_count = batch->ctx->padded_count; + + /* Depending if there is an instance divisor or not, packing varies. + * When there is a divisor, the hardware-level divisor is actually the + * product of the instance divisor and the padded count */ + + unsigned hw_divisor = padded_count * divisor; + + if (divisor == 0) { + /* Per-vertex attributes use the MODULO mode. 
First, compute + * the modulus */ + + attrs->elements |= MALI_ATTR_MODULO; + attrs->shift = o.shift; + attrs->extra_flags = o.odd; + + return 1; + } else if (util_is_power_of_two_or_zero(hw_divisor)) { + /* If there is a divisor but the hardware divisor works out to + * a power of two (not terribly exceptional), we can use an + * easy path (just shifting) */ + + attrs->elements |= MALI_ATTR_POT_DIVIDE; + attrs->shift = __builtin_ctz(hw_divisor); + + return 1; + } else { + /* We have a NPOT divisor. Here's the fun one (multiplying by + * the inverse and shifting) */ + + /* floor(log2(d)) */ + unsigned shift = util_logbase2(hw_divisor); + + /* m = ceil(2^(32 + shift) / d) */ + uint64_t shift_hi = 32 + shift; + uint64_t t = 1ll << shift_hi; + double t_f = t; + double hw_divisor_d = hw_divisor; + double m_f = ceil(t_f / hw_divisor_d); + unsigned m = m_f; + + /* Default case */ + unsigned magic_divisor = m, extra_flags = 0; + + /* e = 2^(shift + 32) % d */ + uint64_t e = t % hw_divisor; + + /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob + * seems to use a different condition */ + if (e <= (1 << shift)) { + magic_divisor = m - 1; + extra_flags = 1; + } + + /* Top flag implicitly set */ + assert(magic_divisor & (1 << 31)); + magic_divisor &= ~(1 << 31); + + /* Upload to two different slots */ + + attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE; + attrs[0].shift = shift; + attrs[0].extra_flags = extra_flags; + + attrs[1].unk = 0x20; + attrs[1].magic_divisor = magic_divisor; + attrs[1].zero = 0; + attrs[1].divisor = divisor; + + return 2; + } +} + +void +panfrost_emit_vertex_data(struct panfrost_job *batch) +{ + struct panfrost_context *ctx = batch->ctx; + struct panfrost_vertex_state *so = ctx->vertex; + + /* Staged mali_attr, and index into them. i =/= k, depending on the + * vertex buffer mask and instancing.
Twice as much room is allocated, + * for a worst case of NPOT_DIVIDEs which take up extra slot */ + union mali_attr attrs[PIPE_MAX_ATTRIBS * 2]; + unsigned k = 0; + + unsigned vertex_count = ctx->vertex_count; + unsigned instanced_count = ctx->instance_count; + + for (unsigned i = 0; i < so->num_elements; ++i) { + /* We map a mali_attr to be 1:1 with the mali_attr_meta, which + * means duplicating some vertex buffers (who cares? aside from + * maybe some caching implications but I somehow doubt that + * matters) */ + + struct pipe_vertex_element *elem = &so->pipe[i]; + unsigned vbi = elem->vertex_buffer_index; + + /* The exception to 1:1 mapping is that we can have multiple + * entries (NPOT divisors), so we fixup anyways */ + + so->hw[i].index = k; + + if (!(ctx->vb_mask & (1 << vbi))) continue; + + struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; + struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource); + + if (!rsrc) continue; + + /* Align to 64 bytes by masking off the lower bits. This + * will be adjusted back when we fixup the src_offset in + * mali_attr_meta */ + + mali_ptr raw_addr = panfrost_vertex_buffer_address(ctx, vbi); + mali_ptr addr = raw_addr & ~63; + unsigned chopped_addr = raw_addr - addr; + + /* Add a dependency of the batch on the vertex buffer */ + panfrost_job_add_bo(batch, rsrc->bo); + + /* Set common fields */ + attrs[k].elements = addr; + attrs[k].stride = buf->stride; + attrs[k].size = rsrc->base.width0; + + /* We need to add the extra size we masked off (for + * correctness) so the data doesn't get clamped away */ + attrs[k].size += chopped_addr; + + /* Instancing uses a dramatically different code path than + * linear, so dispatch for the actual emission now that the + * common code is finished */ + + unsigned divisor = elem->instance_divisor; + + if (divisor && instanced_count == 1) { + /* Silly corner case where there's a divisor(=1) but + * there's no legitimate instancing. 
So we want *every* + * attribute to be the same. So set stride to zero so + * we don't go anywhere. */ + + attrs[k].size = attrs[k].stride + chopped_addr; + attrs[k].stride = 0; + attrs[k++].elements |= MALI_ATTR_LINEAR; + } else if (instanced_count <= 1) { + /* Normal, non-instanced attributes */ + attrs[k++].elements |= MALI_ATTR_LINEAR; + } else { + k += panfrost_vertex_instanced( + batch, rsrc, divisor, &attrs[k], addr, vertex_count, instanced_count); + } + } + + /* Upload whatever we emitted and go */ + + ctx->payload_vertex.postfix.attributes = + panfrost_upload_transient(ctx, attrs, k * sizeof(union mali_attr)); +} + + diff --git a/src/gallium/drivers/panfrost/pan_invocation.c b/src/gallium/drivers/panfrost/pan_invocation.c index 0d4945d05b1..1d1a301d67c 100644 --- a/src/gallium/drivers/panfrost/pan_invocation.c +++ b/src/gallium/drivers/panfrost/pan_invocation.c @@ -120,6 +120,7 @@ panfrost_pack_work_groups_fused( tiler->size_y_shift = vertex->size_y_shift; tiler->size_z_shift = vertex->size_z_shift; tiler->workgroups_x_shift = vertex->workgroups_x_shift; + tiler->workgroups_x_shift_2 = vertex->workgroups_x_shift_2; tiler->workgroups_y_shift = vertex->workgroups_y_shift; tiler->workgroups_z_shift = vertex->workgroups_z_shift; diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c index f8463d2b3f8..d6b1bc89fc1 100644 --- a/src/gallium/drivers/panfrost/pan_screen.c +++ b/src/gallium/drivers/panfrost/pan_screen.c @@ -118,6 +118,10 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: return 1; + /* TODO: Where does this req come from in practice? 
*/ + case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: + return 1; + case PIPE_CAP_MAX_TEXTURE_2D_SIZE: return 4096; case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: diff --git a/src/gallium/drivers/panfrost/pandecode/decode.c b/src/gallium/drivers/panfrost/pandecode/decode.c index 4cc7ca03995..189c2482316 100644 --- a/src/gallium/drivers/panfrost/pandecode/decode.c +++ b/src/gallium/drivers/panfrost/pandecode/decode.c @@ -859,6 +859,100 @@ pandecode_replay_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets) return MALI_NEGATIVE(fb->rt_count_1); } +/* Just add a comment decoding the shift/odd fields forming the padded vertices + * count */ + +static void +pandecode_padded_vertices(unsigned shift, unsigned k) +{ + unsigned odd = 2*k + 1; + unsigned pot = 1 << shift; + pandecode_msg("padded_num_vertices = %d\n", odd * pot); +} + +/* Given a magic divisor, recover what we were trying to divide by. + * + * Let m represent the magic divisor. By definition, m is an element of Z, where + * 0 <= m < 2^N, for N bits in m. + * + * Let q represent the number we would like to divide by. + * + * By definition of a magic divisor for N-bit unsigned integers (a number you + * multiply by to magically get division), m is a number such that: + * + * (m * x) & (2^N - 1) = floor(x/q). + * for all x in Z where 0 <= x < 2^N + * + * Ignore the case where any of the above values equals zero; it is irrelevant + * for our purposes (instanced arrays). + * + * Choose x = q. Then: + * + * (m * x) & (2^N - 1) = floor(x/q). + * (m * q) & (2^N - 1) = floor(q/q). + * + * floor(q/q) = floor(1) = 1, therefore: + * + * (m * q) & (2^N - 1) = 1 + * + * Recall the identity that the bitwise AND of one less than a power-of-two + * equals the modulo with that power of two, i.e.
for all x: + * + * x & (2^N - 1) = x % (2^N) + * + * Therefore: + * + * mq % (2^N) = 1 + * + * By definition, a modular multiplicative inverse of a number m is the number + * q such that with respect to a modulus M: + * + * mq % M = 1 + * + * Therefore, q is the modular multiplicative inverse of m with modulus 2^N. + * + */ + +static void +pandecode_magic_divisor(uint32_t magic, unsigned shift, unsigned orig_divisor, unsigned extra) +{ + /* Compute the modular inverse of `magic` with respect to 2^(32 - + * shift) the most lame way possible... just repeatedly add. + * Asymptotically slow but nobody cares in practice, unless you have + * massive numbers of vertices or high divisors. */ + + unsigned inverse = 0; + + /* Magic implicitly has the highest bit set */ + magic |= (1 << 31); + + /* Depending on rounding direction */ + if (extra) + magic++; + + for (;;) { + uint32_t product = magic * inverse; + + if (shift) { + product >>= shift; + } + + if (product == 1) + break; + + ++inverse; + } + + pandecode_msg("dividing by %d (maybe off by two)\n", inverse); + + /* Recall we're supposed to divide by (gl_level_divisor * + * padded_num_vertices) */ + + unsigned padded_num_vertices = inverse / orig_divisor; + + pandecode_msg("padded_num_vertices = %d\n", padded_num_vertices); +} + static void pandecode_replay_attributes(const struct pandecode_mapped_memory *mem, mali_ptr addr, int job_no, char *suffix, @@ -905,9 +999,9 @@ pandecode_replay_attributes(const struct pandecode_mapped_memory *mem, /* Decode further where possible */ if (mode == MALI_ATTR_MODULO) { - unsigned odd = (2 * attr[i].extra_flags) + 1; - unsigned pot = (1 << attr[i].shift); - pandecode_msg("padded_num_vertices = %d\n", odd * pot); + pandecode_padded_vertices( + attr[i].shift, + attr[i].extra_flags); } pandecode_indent--; @@ -922,6 +1016,7 @@ pandecode_replay_attributes(const struct pandecode_mapped_memory *mem, if (attr[i].zero != 0) pandecode_prop("zero = 0x%x /* XXX zero tripped */", attr[i].zero);
pandecode_prop("divisor = %d", attr[i].divisor); + pandecode_magic_divisor(attr[i].magic_divisor, attr[i - 1].shift, attr[i].divisor, attr[i - 1].extra_flags); pandecode_indent--; pandecode_log("}, \n"); } @@ -1114,7 +1209,7 @@ pandecode_replay_attribute_meta(int job_no, int count, const struct mali_vertex_ pandecode_prop("unknown1 = 0x%" PRIx64, (u64) attr_meta->unknown1); pandecode_prop("unknown3 = 0x%" PRIx64, (u64) attr_meta->unknown3); - pandecode_prop("src_offset = 0x%" PRIx64, (u64) attr_meta->src_offset); + pandecode_prop("src_offset = %d", attr_meta->src_offset); pandecode_indent--; pandecode_log("},\n"); @@ -2040,6 +2135,15 @@ pandecode_replay_vertex_or_tiler_job_mdg(const struct mali_job_descriptor_header pandecode_replay_gl_enables(v->gl_enables, h->job_type); + if (v->instance_shift || v->instance_odd) { + pandecode_prop("instance_shift = 0x%d /* %d */", + v->instance_shift, 1 << v->instance_shift); + pandecode_prop("instance_odd = 0x%X /* %d */", + v->instance_odd, (2 * v->instance_odd) + 1); + + pandecode_padded_vertices(v->instance_shift, v->instance_odd); + } + if (v->draw_start) pandecode_prop("draw_start = %d", v->draw_start); -- 2.30.2