From: Alyssa Rosenzweig Date: Wed, 27 Nov 2019 13:31:16 +0000 (-0500) Subject: panfrost: Implement pan_tiler for non-hierarchy GPUs X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=9fb0904712a5d64ecd012f65dfda4f36819716d9;p=mesa.git panfrost: Implement pan_tiler for non-hierarchy GPUs The algorithm is as described. Nothing fancy here, just need to add some new code paths depending on which model we're running on. Tomeu: - Also disable tiling when !hierarchy and !vertex_count - Avoid creating polygon lists smaller than the minimum when vertex_count > 0 but tile size smaller than 16 byte - Take into account tile size when calculating polygon list size for !hierarchy - Allow 0-sized tiles in a single dimension Signed-off-by: Alyssa Rosenzweig Signed-off-by: Tomeu Vizoso --- diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index af8f94354e9..7ae4b692189 100644 --- a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -59,24 +59,25 @@ static struct midgard_tiler_descriptor panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count) { struct panfrost_screen *screen = pan_screen(batch->ctx->base.screen); + bool hierarchy = !(screen->quirks & MIDGARD_NO_HIER_TILING); struct midgard_tiler_descriptor t = {0}; unsigned height = batch->key.height; unsigned width = batch->key.width; t.hierarchy_mask = - panfrost_choose_hierarchy_mask(width, height, vertex_count); + panfrost_choose_hierarchy_mask(width, height, vertex_count, hierarchy); /* Compute the polygon header size and use that to offset the body */ unsigned header_size = panfrost_tiler_header_size( - width, height, t.hierarchy_mask); + width, height, t.hierarchy_mask, hierarchy); t.polygon_list_size = panfrost_tiler_full_size( - width, height, t.hierarchy_mask); + width, height, t.hierarchy_mask, hierarchy); /* Sanity check */ - if (t.hierarchy_mask) { + if (vertex_count) { struct panfrost_bo 
*tiler_heap; tiler_heap = panfrost_batch_get_tiler_heap(batch); @@ -92,6 +93,7 @@ panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count) struct panfrost_bo *tiler_dummy; tiler_dummy = panfrost_batch_get_tiler_dummy(batch); + header_size = MALI_TILER_MINIMUM_HEADER_SIZE; /* The tiler is disabled, so don't allow the tiler heap */ t.heap_start = tiler_dummy->gpu; @@ -101,11 +103,11 @@ panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count) t.polygon_list = tiler_dummy->gpu; /* Disable the tiler */ - t.hierarchy_mask |= MALI_TILER_DISABLED; - - if (screen->quirks & MIDGARD_SFBD) { - t.hierarchy_mask = 0xFFF; /* TODO: What's this? */ - t.polygon_list_size = 0x200; + if (hierarchy) + t.hierarchy_mask |= MALI_TILER_DISABLED; + else { + t.hierarchy_mask = MALI_TILER_USER; + t.polygon_list_size = MALI_TILER_MINIMUM_HEADER_SIZE + 4; /* We don't have a SET_VALUE job, so write the polygon list manually */ uint32_t *polygon_list_body = (uint32_t *) (tiler_dummy->cpu + header_size); diff --git a/src/gallium/drivers/panfrost/pan_scoreboard.c b/src/gallium/drivers/panfrost/pan_scoreboard.c index f340bb62662..02dd7f0b669 100644 --- a/src/gallium/drivers/panfrost/pan_scoreboard.c +++ b/src/gallium/drivers/panfrost/pan_scoreboard.c @@ -302,7 +302,8 @@ panfrost_scoreboard_set_value(struct panfrost_batch *batch) /* Okay, we do. Let's generate it. We'll need the job's polygon list * regardless of size. 
*/ - mali_ptr polygon_list = panfrost_batch_get_polygon_list(batch, 0); + mali_ptr polygon_list = panfrost_batch_get_polygon_list(batch, + MALI_TILER_MINIMUM_HEADER_SIZE); struct panfrost_transfer job = panfrost_set_value_job(batch, polygon_list); diff --git a/src/panfrost/encoder/pan_encoder.h b/src/panfrost/encoder/pan_encoder.h index 8aa2df7240b..ceff2e949de 100644 --- a/src/panfrost/encoder/pan_encoder.h +++ b/src/panfrost/encoder/pan_encoder.h @@ -56,14 +56,14 @@ panfrost_pack_work_groups_fused( /* Tiler structure size computation */ unsigned -panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask); +panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy); unsigned -panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask); +panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy); unsigned panfrost_choose_hierarchy_mask( unsigned width, unsigned height, - unsigned vertex_count); + unsigned vertex_count, bool hierarchy); #endif diff --git a/src/panfrost/encoder/pan_tiler.c b/src/panfrost/encoder/pan_tiler.c index 98ef6827a80..fc42724a1e5 100644 --- a/src/panfrost/encoder/pan_tiler.c +++ b/src/panfrost/encoder/pan_tiler.c @@ -218,13 +218,6 @@ /* Likewise, each tile per level has 512 bytes of body */ #define FULL_BYTES_PER_TILE 0x200 -/* Absent any geometry, the minimum size of the header */ -#define MINIMUM_HEADER_SIZE 0x200 - -/* Mask of valid hierarchy levels: one bit for each level from min...max - * inclusive */ -#define HIERARCHY_MASK (((MAX_TILE_SIZE / MIN_TILE_SIZE) << 1) - 1) - /* If the width-x-height framebuffer is divided into tile_size-x-tile_size * tiles, how many tiles are there? Rounding up in each direction. For the * special case of tile_size=16, this aligns with the usual Midgard count. 
@@ -233,108 +226,86 @@ * a a fixed-tile size (not any of a number of power-of-twos) */ static unsigned -pan_tile_count(unsigned width, unsigned height, unsigned tile_size) +pan_tile_count(unsigned width, unsigned height, unsigned tile_width, unsigned tile_height) { - unsigned aligned_width = ALIGN_POT(width, tile_size); - unsigned aligned_height = ALIGN_POT(height, tile_size); + unsigned aligned_width = ALIGN_POT(width, tile_width); + unsigned aligned_height = ALIGN_POT(height, tile_height); - unsigned tile_count_x = aligned_width / tile_size; - unsigned tile_count_y = aligned_height / tile_size; + unsigned tile_count_x = aligned_width / tile_width; + unsigned tile_count_y = aligned_height / tile_height; return tile_count_x * tile_count_y; } /* For `masked_count` of the smallest tile sizes masked out, computes how the * size of the polygon list header. We iterate the tile sizes (16x16 through - * 2048x2048, if nothing is masked; (16*2^masked_count)x(16*2^masked_count) - * through 2048x2048 more generally. For each tile size, we figure out how many - * tiles there are at this hierarchy level and therefore many bytes this level - * is, leaving us with a byte count for each level. We then just sum up the - * byte counts across the levels to find a byte count for all levels. */ + * 2048x2048). For each tile size, we figure out how many tiles there are at + * this hierarchy level and therefore many bytes this level is, leaving us with + * a byte count for each level. We then just sum up the byte counts across the + * levels to find a byte count for all levels. 
*/ static unsigned -panfrost_raw_segment_size( +panfrost_hierarchy_size( unsigned width, unsigned height, - unsigned masked_count, - unsigned end_level, + unsigned mask, unsigned bytes_per_tile) { unsigned size = PROLOGUE_SIZE; - /* Normally we start at 16x16 tiles (MIN_TILE_SHIFT), but we add more - * if anything is masked off */ - - unsigned start_level = MIN_TILE_SHIFT + masked_count; + /* Iterate hierarchy levels */ - /* Iterate hierarchy levels / tile sizes */ + for (unsigned b = 0; b < (MAX_TILE_SHIFT - MIN_TILE_SHIFT); ++b) { + /* Check if this level is enabled */ + if (!(mask & (1 << b))) + continue; - for (unsigned i = start_level; i <= end_level; ++i) { /* Shift from a level to a tile size */ - unsigned tile_size = (1 << i); + unsigned tile_size = (1 << b) * MIN_TILE_SIZE; - unsigned tile_count = pan_tile_count(width, height, tile_size); + unsigned tile_count = pan_tile_count(width, height, tile_size, tile_size); unsigned level_count = bytes_per_tile * tile_count; size += level_count; } /* This size will be used as an offset, so ensure it's aligned */ - return ALIGN_POT(size, 512); + return ALIGN_POT(size, 0x200); } -/* Given a hierarchy mask and a framebuffer size, compute the size of one of - * the segments (header or body) */ +/* Implement the formula: + * + * 0x200 + bytes_per_tile * ceil(W / w) * ceil(H / h) + * + * rounding down the answer to the nearest 0x200. This is used to compute both + * header and body sizes for GPUs without hierarchical tiling. Essentially, + * computing a single hierarchy level, since there isn't any hierarchy! + */ static unsigned -panfrost_segment_size( - unsigned width, unsigned height, - unsigned mask, unsigned bytes_per_tile) +panfrost_flat_size(unsigned width, unsigned height, unsigned dim, unsigned bytes_per_tile) { - /* The tiler-disabled case should have been handled by the caller */ - assert(mask); + /* First, extract the tile dimensions */ - /* Some levels are enabled. 
Ensure that only smaller levels are - * disabled and there are no gaps. Theoretically the hardware is more - * flexible, but there's no known reason to use other configurations - * and this keeps the code simple. Since we know the 0x80 or 0x100 bit - * is set, ctz(mask) will return the number of masked off levels. */ + unsigned tw = (1 << (dim & 0b111)) * 8; + unsigned th = (1 << ((dim & (0b111 << 6)) >> 6)) * 8; - unsigned masked_count = __builtin_ctz(mask); + /* tile_count is ceil(W/w) * ceil(H/h) */ + unsigned raw = pan_tile_count(width, height, tw, th) * bytes_per_tile; - assert(mask & (0x80 | 0x100)); - assert(((mask >> masked_count) & ((mask >> masked_count) + 1)) == 0); - - /* Figure out the top level */ - unsigned unused_count = __builtin_clz(mask); - unsigned top_bit = ((8 * sizeof(mask)) - 1) - unused_count; - - /* We don't have bits for nonexistant levels below 16x16 */ - unsigned top_level = top_bit + 4; - - /* Everything looks good. Use the number of trailing zeroes we found to - * figure out how many smaller levels are disabled to compute the - * actual header size */ - - return panfrost_raw_segment_size(width, height, - masked_count, top_level, bytes_per_tile); + /* Round down and add offset */ + return 0x200 + ((raw / 0x200) * 0x200); } - /* Given a hierarchy mask and a framebuffer size, compute the header size */ unsigned -panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask) +panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy) { - mask &= HIERARCHY_MASK; - - /* If no hierarchy levels are enabled, that means there is no geometry - * for the tiler to process, so use a minimum size. 
Used for clears */ - - if (mask == 0x00) - return MINIMUM_HEADER_SIZE; - - return panfrost_segment_size(width, height, mask, HEADER_BYTES_PER_TILE); + if (hierarchy) + return panfrost_hierarchy_size(width, height, mask, HEADER_BYTES_PER_TILE); + else + return panfrost_flat_size(width, height, mask, HEADER_BYTES_PER_TILE); } /* The combined header/body is sized similarly (but it is significantly @@ -343,14 +314,38 @@ panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask) */ unsigned -panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask) +panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy) +{ + if (hierarchy) + return panfrost_hierarchy_size(width, height, mask, FULL_BYTES_PER_TILE); + else + return panfrost_flat_size(width, height, mask, FULL_BYTES_PER_TILE); +} + +/* On GPUs without hierarchical tiling, we choose a tile size directly and + * stuff it into the field otherwise known as hierarchy mask (not a mask). */ + +static unsigned +panfrost_choose_tile_size( + unsigned width, unsigned height, unsigned vertex_count) { - mask &= HIERARCHY_MASK; + /* Figure out the ideal tile size. Eventually a heuristic should be + * used for this */ + + unsigned best_w = 16; + unsigned best_h = 16; + + /* Clamp so there are less than 64 tiles in each direction */ - if (mask == 0x00) - return MINIMUM_HEADER_SIZE; + best_w = MAX2(best_w, util_next_power_of_two(width / 63)); + best_h = MAX2(best_h, util_next_power_of_two(height / 63)); - return panfrost_segment_size(width, height, mask, FULL_BYTES_PER_TILE); + /* We have our ideal tile size, so encode */ + + unsigned exp_w = util_logbase2(best_w / 16); + unsigned exp_h = util_logbase2(best_h / 16); + + return exp_w | (exp_h << 6); } /* In the future, a heuristic to choose a tiler hierarchy mask would go here. 
@@ -362,13 +357,16 @@ panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask) unsigned panfrost_choose_hierarchy_mask( unsigned width, unsigned height, - unsigned vertex_count) + unsigned vertex_count, bool hierarchy) { /* If there is no geometry, we don't bother enabling anything */ if (!vertex_count) return 0x00; + if (!hierarchy) + return panfrost_choose_tile_size(width, height, vertex_count); + /* Otherwise, default everything on. TODO: Proper tests */ return 0xFF; diff --git a/src/panfrost/include/panfrost-job.h b/src/panfrost/include/panfrost-job.h index 06a45d186d3..ff0d9aa9036 100644 --- a/src/panfrost/include/panfrost-job.h +++ b/src/panfrost/include/panfrost-job.h @@ -1392,9 +1392,17 @@ struct mali_payload_fragment { /* See pan_tiler.c for derivation */ #define MALI_HIERARCHY_MASK ((1 << 9) - 1) -/* Flag disabling the tiler for clear-only jobs */ +/* Flag disabling the tiler for clear-only jobs, with + hierarchical tiling */ #define MALI_TILER_DISABLED (1 << 12) +/* Flag selecting userspace-generated polygon list, for clear-only jobs without + * hierarchical tiling. */ +#define MALI_TILER_USER 0xFFF + +/* Absent any geometry, the minimum size of the polygon list header */ +#define MALI_TILER_MINIMUM_HEADER_SIZE 0x200 + struct midgard_tiler_descriptor { /* Size of the entire polygon list; see pan_tiler.c for the * computation. 
It's based on hierarchical tiling */ diff --git a/src/panfrost/pandecode/decode.c b/src/panfrost/pandecode/decode.c index 847a0d7e7b0..89992a76a79 100644 --- a/src/panfrost/pandecode/decode.c +++ b/src/panfrost/pandecode/decode.c @@ -513,7 +513,8 @@ pandecode_midgard_tiler_descriptor( const struct midgard_tiler_descriptor *t, unsigned width, unsigned height, - bool is_fragment) + bool is_fragment, + bool has_hierarchy) { pandecode_log(".tiler = {\n"); pandecode_indent++; @@ -546,8 +547,8 @@ pandecode_midgard_tiler_descriptor( /* Now that we've sanity checked, we'll try to calculate the sizes * ourselves for comparison */ - unsigned ref_header = panfrost_tiler_header_size(width, height, t->hierarchy_mask); - unsigned ref_size = panfrost_tiler_full_size(width, height, t->hierarchy_mask); + unsigned ref_header = panfrost_tiler_header_size(width, height, t->hierarchy_mask, has_hierarchy); + unsigned ref_size = panfrost_tiler_full_size(width, height, t->hierarchy_mask, has_hierarchy); if (!((ref_header == body_offset) && (ref_size == t->polygon_list_size))) { pandecode_msg("XXX: bad polygon list size (expected %d / 0x%x)\n", @@ -630,44 +631,6 @@ pandecode_midgard_tiler_descriptor( pandecode_log("}\n"); } -static void -pandecode_midgard_tiler_descriptor_0x20( - const struct midgard_tiler_descriptor *t) -{ - pandecode_log(".tiler = {\n"); - pandecode_indent++; - - pandecode_prop("hierarchy_mask = 0x%" PRIx16, t->hierarchy_mask); - pandecode_prop("flags = 0x%" PRIx16, t->flags); - MEMORY_PROP(t, polygon_list); - MEMORY_PROP(t, polygon_list_body); - pandecode_prop("polygon_list_size = 0x%x", t->polygon_list_size); - MEMORY_PROP(t, heap_start); - MEMORY_PROP(t, heap_end); - - /* We've never seen weights used in practice, but we know from the - * kernel these fields are there */ - - bool nonzero_weights = false; - - for (unsigned w = 0; w < ARRAY_SIZE(t->weights); ++w) { - nonzero_weights |= t->weights[w] != 0x0; - } - - if (nonzero_weights) { - pandecode_log(".weights = {"); 
- - for (unsigned w = 0; w < ARRAY_SIZE(t->weights); ++w) { - pandecode_log("%d, ", t->weights[w]); - } - - pandecode_log("},"); - } - - pandecode_indent--; - pandecode_log("}\n"); -} - /* Information about the framebuffer passed back for * additional analysis */ @@ -792,11 +755,9 @@ pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id) MEMORY_PROP(s, unknown_address_0); const struct midgard_tiler_descriptor t = s->tiler; - if (gpu_id == 0x0720 || gpu_id == 0x0820 || gpu_id == 0x0830) - /* These ones don't have an "Advanced Tiling Unit" */ - pandecode_midgard_tiler_descriptor_0x20(&t); - else - pandecode_midgard_tiler_descriptor(&t, s->width + 1, s->height + 1, is_fragment); + + bool has_hierarchy = !(gpu_id == 0x0720 || gpu_id == 0x0820 || gpu_id == 0x0830); + pandecode_midgard_tiler_descriptor(&t, s->width + 1, s->height + 1, is_fragment, has_hierarchy); pandecode_indent--; pandecode_log("};\n"); @@ -1157,7 +1118,7 @@ pandecode_mfbd_bfr(uint64_t gpu_va, int job_no, bool is_fragment) pandecode_prop("unknown2 = 0x%x", fb->unknown2); MEMORY_PROP(fb, scratchpad); const struct midgard_tiler_descriptor t = fb->tiler; - pandecode_midgard_tiler_descriptor(&t, fb->width1 + 1, fb->height1 + 1, is_fragment); + pandecode_midgard_tiler_descriptor(&t, fb->width1 + 1, fb->height1 + 1, is_fragment, true); if (fb->zero3 || fb->zero4) { pandecode_msg("XXX: framebuffer zeros tripped\n");