From: Alyssa Rosenzweig Date: Mon, 19 Aug 2019 18:19:15 +0000 (-0700) Subject: panfrost: Move pan_tiler.c outside of Gallium X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b45eb2775ea180fde01d0abe50f6ccf8ddb5b84d;p=mesa.git panfrost: Move pan_tiler.c outside of Gallium The routines in this file may be shared with Vulkan. Signed-off-by: Alyssa Rosenzweig --- diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build index d4e91c6812a..8d4facec982 100644 --- a/src/gallium/drivers/panfrost/meson.build +++ b/src/gallium/drivers/panfrost/meson.build @@ -49,7 +49,6 @@ files_panfrost = files( 'pan_scoreboard.c', 'pan_sfbd.c', 'pan_mfbd.c', - 'pan_tiler.c', 'pan_varyings.c', ) diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index e6ef85b1a8a..b6de11c10eb 100644 --- a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -50,7 +50,6 @@ #include "pan_blending.h" #include "pan_blend_shaders.h" #include "pan_util.h" -#include "pan_tiler.h" /* Do not actually send anything to the GPU; merely generate the cmdstream as fast as possible. Disables framebuffer writes */ //#define DRY_RUN diff --git a/src/gallium/drivers/panfrost/pan_tiler.c b/src/gallium/drivers/panfrost/pan_tiler.c deleted file mode 100644 index 25f8490cb43..00000000000 --- a/src/gallium/drivers/panfrost/pan_tiler.c +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Alyssa Rosenzweig - */ - -#include "util/u_math.h" -#include "util/macros.h" -#include "pan_tiler.h" - -/* Mali GPUs are tiled-mode renderers, rather than immediate-mode. - * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run. - * Then, a fixed-function hardware block (the tiler) consumes the gl_Position - * results. For each triangle specified, it marks each containing tile as - * containing that triangle. This set of "triangles per tile" form the "polygon - * list". Finally, the rasterization unit consumes the polygon list to invoke - * the fragment shader. - * - * In practice, it's a bit more complicated than this. 
16x16 is the logical - * tile size, but Midgard features "hierarchical tiling", where power-of-two - * multiples of the base tile size can be used: hierarchy level 0 (16x16), - * level 1 (32x32), level 2 (64x64), per public information about Midgard's - * tiling. In fact, tiling goes up to 2048x2048 (!), although in practice - * 128x128 is the largest usually used (though higher modes are enabled). The - * idea behind hierarchical tiling is to use low tiling levels for small - * triangles and high levels for large triangles, to minimize memory bandwidth - * and repeated fragment shader invocations (the former issue inherent to - * immediate-mode rendering and the latter common in traditional tilers). - * - * The tiler itself works by reading varyings in and writing a polygon list - * out. Unfortunately (for us), both of these buffers are managed in main - * memory; although they ideally will be cached, it is the drivers' - * responsibility to allocate these buffers. Varying buffer allocation is - * handled elsewhere, as it is not tiler specific; the real issue is allocating - * the polygon list. - * - * This is hard, because from the driver's perspective, we have no information - * about what geometry will actually look like on screen; that information is - * only gained from running the vertex shader. (Theoretically, we could run the - * vertex shaders in software as a prepass, or in hardware with transform - * feedback as a prepass, but either idea is ludicrous on so many levels). - * - * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list - * into three distinct pieces. First, the driver statically determines which - * tile hierarchy levels to use (more on that later). At this point, we know the - * framebuffer dimensions and all the possible tilings of the framebuffer, so - * we know exactly how many tiles exist across all hierarchy levels. 
The first - * piece of the polygon list is the header, which is exactly 8 bytes per tile, - * plus padding and a small 64-byte prologue. (If that doesn't remind you of - * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is - * the polygon list body, which seems to contain 512 bytes per tile, again - * across every level of the hierarchy. These two parts form the polygon list - * buffer. This buffer has a statically determinable size, approximately equal - * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus - * alignment / minimum restrictions / etc. - * - * The third piece is the easy one (for us): the tiler heap. In essence, the - * tiler heap is a gigantic slab that's as big as could possibly be necessary - * in the worst case imaginable. Just... a gigantic allocation that we give a - * start and end pointer to. What's the catch? The tiler heap is lazily - * allocated; that is, a huge amount of memory is _reserved_, but only a tiny - * bit is actually allocated upfront. The GPU just keeps using the - * unallocated-but-reserved portions as it goes along, generating page faults - * if it goes beyond the allocation, and then the kernel is instructed to - * expand the allocation on page fault (known in the vendor kernel as growable - * memory). This is quite a bit of bookkeeping of its own, but that task is - * pushed to kernel space and we can mostly ignore it here, just remembering to - * set the GROWABLE flag so the kernel actually uses this path rather than - * allocating a gigantic amount up front and burning a hole in RAM. - * - * As far as determining which hierarchy levels to use, the simple answer is - * that right now, we don't. In the tiler configuration fields (consistent from - * the earliest Midgard's SFBD through the latest Bifrost traces we have), - * there is a hierarchy_mask field, controlling which levels (tile sizes) are - * enabled. 
Ideally, the hierarchical tiling dream -- mapping big polygons to - * big tiles and small polygons to small tiles -- would be realized here as - * well. As long as there are polygons at all needing tiling, we always have to - * have big tiles available, in case there are big polygons. But we don't - * necessarily need small tiles available. Ideally, when there are small - * polygons, small tiles are enabled (to avoid waste from putting small - * triangles in the big tiles); when there are not, small tiles are disabled to - * avoid enabling more levels than necessary, which potentially costs in memory - * bandwidth / power / tiler performance. - * - * Of course, the driver has to figure this out statically. When tile - * hiearchies are actually established, this occurs by the tiler in - * fixed-function hardware, after the vertex shaders have run and there is - * sufficient information to figure out the size of triangles. The driver has - * no such luxury, again barring insane hacks like additionally running the - * vertex shaders in software or in hardware via transform feedback. Thus, for - * the driver, we need a heuristic approach. - * - * There are lots of heuristics to guess triangle size statically you could - * imagine, but one approach shines as particularly simple-stupid: assume all - * on-screen triangles are equal size and spread equidistantly throughout the - * screen. Let's be clear, this is NOT A VALID ASSUMPTION. 
But if we roll with - * it, then we see: - * - * Triangle Area = (Screen Area / # of triangles) - * = (Width * Height) / (# of triangles) - * - * Or if you prefer, we can also make a third CRAZY assumption that we only draw - * right triangles with edges parallel/perpendicular to the sides of the screen - * with no overdraw, forming a triangle grid across the screen: - * - * |--w--| - * _____ | - * | /| /| | - * |/_|/_| h - * | /| /| | - * |/_|/_| | - * - * Then you can use some middle school geometry and algebra to work out the - * triangle dimensions. I started working on this, but realised I didn't need - * to to make my point, but couldn't bare to erase that ASCII art. Anyway. - * - * POINT IS, by considering the ratio of screen area and triangle count, we can - * estimate the triangle size. For a small size, use small bins; for a large - * size, use large bins. Intuitively, this metric makes sense: when there are - * few triangles on a large screen, you're probably compositing a UI and - * therefore the triangles are large; when there are a lot of triangles on a - * small screen, you're probably rendering a 3D mesh and therefore the - * triangles are tiny. (Or better said -- there will be tiny triangles, even if - * there are also large triangles. There have to be unless you expect crazy - * overdraw. Generally, it's better to allow more small bin sizes than - * necessary than not allow enough.) - * - * From this heuristic (or whatever), we determine the minimum allowable tile - * size, and we use that to decide the hierarchy masking, selecting from the - * minimum "ideal" tile size to the maximum tile size (2048x2048). - * - * Once we have that mask and the framebuffer dimensions, we can compute the - * size of the statically-sized polygon list structures, allocate them, and go! 
- * - */ - -/* Hierarchical tiling spans from 16x16 to 2048x2048 tiles */ - -#define MIN_TILE_SIZE 16 -#define MAX_TILE_SIZE 2048 - -/* Constants as shifts for easier power-of-two iteration */ - -#define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE) -#define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE) - -/* The hierarchy has a 64-byte prologue */ -#define PROLOGUE_SIZE 0x40 - -/* For each tile (across all hierarchy levels), there is 8 bytes of header */ -#define HEADER_BYTES_PER_TILE 0x8 - -/* Absent any geometry, the minimum size of the header */ -#define MINIMUM_HEADER_SIZE 0x200 - -/* If the width-x-height framebuffer is divided into tile_size-x-tile_size - * tiles, how many tiles are there? Rounding up in each direction. For the - * special case of tile_size=16, this aligns with the usual Midgard count. - * tile_size must be a power-of-two. Not really repeat code from AFBC/checksum, - * because those care about the stride (not just the overall count) and only at - * a a fixed-tile size (not any of a number of power-of-twos) */ - -static unsigned -pan_tile_count(unsigned width, unsigned height, unsigned tile_size) -{ - unsigned aligned_width = ALIGN_POT(width, tile_size); - unsigned aligned_height = ALIGN_POT(height, tile_size); - - unsigned tile_count_x = aligned_width / tile_size; - unsigned tile_count_y = aligned_height / tile_size; - - return tile_count_x * tile_count_y; -} - -/* For `masked_count` of the smallest tile sizes masked out, computes how the - * size of the polygon list header. We iterate the tile sizes (16x16 through - * 2048x2048, if nothing is masked; (16*2^masked_count)x(16*2^masked_count) - * through 2048x2048 more generally. For each tile size, we figure out how many - * tiles there are at this hierarchy level and therefore many bytes this level - * is, leaving us with a byte count for each level. We then just sum up the - * byte counts across the levels to find a byte count for all levels. 
*/ - -static unsigned -panfrost_raw_header_size(unsigned width, unsigned height, unsigned masked_count) -{ - unsigned size = PROLOGUE_SIZE; - - /* Normally we start at 16x16 tiles (MIN_TILE_SHIFT), but we add more - * if anything is masked off */ - - unsigned start_level = MIN_TILE_SHIFT + masked_count; - - /* Iterate hierarchy levels / tile sizes */ - - for (unsigned i = start_level; i < MAX_TILE_SHIFT; ++i) { - /* Shift from a level to a tile size */ - unsigned tile_size = (1 << i); - - unsigned tile_count = pan_tile_count(width, height, tile_size); - unsigned header_bytes = HEADER_BYTES_PER_TILE * tile_count; - - size += header_bytes; - } - - /* This size will be used as an offset, so ensure it's aligned */ - return ALIGN_POT(size, 512); -} - -/* Given a hierarchy mask and a framebuffer size, compute the header size */ - -unsigned -panfrost_tiler_header_size(unsigned width, unsigned height, uint8_t mask) -{ - /* If no hierarchy levels are enabled, that means there is no geometry - * for the tiler to process, so use a minimum size. Used for clears */ - - if (mask == 0x00) - return MINIMUM_HEADER_SIZE; - - /* Some levels are enabled. Ensure that only smaller levels are - * disabled and there are no gaps. Theoretically the hardware is more - * flexible, but there's no known reason to use other configurations - * and this keeps the code simple. Since we know the 0x80 bit is set, - * ctz(mask) will return the number of masked off levels. */ - - unsigned masked_count = __builtin_ctz(mask); - - assert(mask & 0x80); - assert(((mask >> masked_count) & ((mask >> masked_count) + 1)) == 0); - - /* Everything looks good. Use the number of trailing zeroes we found to - * figure out how many smaller levels are disabled to compute the - * actual header size */ - - return panfrost_raw_header_size(width, height, masked_count); -} - -/* The body seems to be about 512 bytes per tile. 
Noting that the header is - * about 8 bytes per tile, we can be a little sloppy and estimate the body size - * to be equal to the header size * (512/8). Given the header size is a - * considerable overestimate, this is fine. Eventually, we should maybe figure - * out how to actually implement this. */ - -unsigned -panfrost_tiler_body_size(unsigned width, unsigned height, uint8_t mask) -{ - /* No levels means no body */ - if (!mask) - return 0x00; - - unsigned header_size = panfrost_tiler_header_size(width, height, mask); - return ALIGN_POT(header_size * 512 / 8, 512); -} - - -/* In the future, a heuristic to choose a tiler hierarchy mask would go here. - * At the moment, we just default to 0xFF, which enables all possible hierarchy - * levels. Overall this yields good performance but presumably incurs a cost in - * memory bandwidth / power consumption / etc, at least on smaller scenes that - * don't really need all the smaller levels enabled */ - -unsigned -panfrost_choose_hierarchy_mask( - unsigned width, unsigned height, - unsigned vertex_count) -{ - /* If there is no geometry, we don't bother enabling anything */ - - if (!vertex_count) - return 0x00; - - /* Otherwise, default everything on. TODO: Proper tests */ - - return 0xFF; -} diff --git a/src/gallium/drivers/panfrost/pan_tiler.h b/src/gallium/drivers/panfrost/pan_tiler.h deleted file mode 100644 index 8d7f6f29de0..00000000000 --- a/src/gallium/drivers/panfrost/pan_tiler.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Authors: - * Alyssa Rosenzweig - * - */ - -#ifndef __PAN_TILER_H__ -#define __PAN_TILER_H__ - -unsigned -panfrost_tiler_header_size(unsigned width, unsigned height, uint8_t mask); - -unsigned -panfrost_tiler_body_size(unsigned width, unsigned height, uint8_t mask); - -unsigned -panfrost_choose_hierarchy_mask( - unsigned width, unsigned height, - unsigned vertex_count); - -#endif - - diff --git a/src/panfrost/encoder/meson.build b/src/panfrost/encoder/meson.build index 9f26139c913..007785769af 100644 --- a/src/panfrost/encoder/meson.build +++ b/src/panfrost/encoder/meson.build @@ -23,6 +23,7 @@ libpanfrost_encoder_files = files( 'pan_encoder.h', 'pan_invocation.c', + 'pan_tiler.c', ) libpanfrost_encoder = static_library( diff --git a/src/panfrost/encoder/pan_encoder.h b/src/panfrost/encoder/pan_encoder.h index aba3ebacf2c..0e135d0f511 100644 --- a/src/panfrost/encoder/pan_encoder.h +++ b/src/panfrost/encoder/pan_encoder.h @@ -53,4 +53,17 @@ panfrost_pack_work_groups_fused( unsigned size_y, unsigned size_z); +/* Tiler structure size computation */ + +unsigned +panfrost_tiler_header_size(unsigned width, unsigned height, uint8_t mask); + +unsigned +panfrost_tiler_body_size(unsigned width, unsigned height, uint8_t mask); + +unsigned +panfrost_choose_hierarchy_mask( + unsigned width, unsigned height, + unsigned vertex_count); + #endif diff --git a/src/panfrost/encoder/pan_tiler.c b/src/panfrost/encoder/pan_tiler.c new file mode 100644 index 00000000000..7718ad9fe48 --- /dev/null +++ b/src/panfrost/encoder/pan_tiler.c @@ -0,0 +1,295 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Alyssa Rosenzweig + */ + +#include "util/u_math.h" +#include "util/macros.h" +#include "pan_encoder.h" + +/* Mali GPUs are tiled-mode renderers, rather than immediate-mode. + * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run. + * Then, a fixed-function hardware block (the tiler) consumes the gl_Position + * results. For each triangle specified, it marks each containing tile as + * containing that triangle. This set of "triangles per tile" form the "polygon + * list". Finally, the rasterization unit consumes the polygon list to invoke + * the fragment shader. + * + * In practice, it's a bit more complicated than this. 
16x16 is the logical + * tile size, but Midgard features "hierarchical tiling", where power-of-two + * multiples of the base tile size can be used: hierarchy level 0 (16x16), + * level 1 (32x32), level 2 (64x64), per public information about Midgard's + * tiling. In fact, tiling goes up to 2048x2048 (!), although in practice + * 128x128 is the largest usually used (though higher modes are enabled). The + * idea behind hierarchical tiling is to use low tiling levels for small + * triangles and high levels for large triangles, to minimize memory bandwidth + * and repeated fragment shader invocations (the former issue inherent to + * immediate-mode rendering and the latter common in traditional tilers). + * + * The tiler itself works by reading varyings in and writing a polygon list + * out. Unfortunately (for us), both of these buffers are managed in main + * memory; although they ideally will be cached, it is the drivers' + * responsibility to allocate these buffers. Varying buffer allocation is + * handled elsewhere, as it is not tiler specific; the real issue is allocating + * the polygon list. + * + * This is hard, because from the driver's perspective, we have no information + * about what geometry will actually look like on screen; that information is + * only gained from running the vertex shader. (Theoretically, we could run the + * vertex shaders in software as a prepass, or in hardware with transform + * feedback as a prepass, but either idea is ludicrous on so many levels). + * + * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list + * into three distinct pieces. First, the driver statically determines which + * tile hierarchy levels to use (more on that later). At this point, we know the + * framebuffer dimensions and all the possible tilings of the framebuffer, so + * we know exactly how many tiles exist across all hierarchy levels. 
The first + * piece of the polygon list is the header, which is exactly 8 bytes per tile, + * plus padding and a small 64-byte prologue. (If that doesn't remind you of + * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is + * the polygon list body, which seems to contain 512 bytes per tile, again + * across every level of the hierarchy. These two parts form the polygon list + * buffer. This buffer has a statically determinable size, approximately equal + * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus + * alignment / minimum restrictions / etc. + * + * The third piece is the easy one (for us): the tiler heap. In essence, the + * tiler heap is a gigantic slab that's as big as could possibly be necessary + * in the worst case imaginable. Just... a gigantic allocation that we give a + * start and end pointer to. What's the catch? The tiler heap is lazily + * allocated; that is, a huge amount of memory is _reserved_, but only a tiny + * bit is actually allocated upfront. The GPU just keeps using the + * unallocated-but-reserved portions as it goes along, generating page faults + * if it goes beyond the allocation, and then the kernel is instructed to + * expand the allocation on page fault (known in the vendor kernel as growable + * memory). This is quite a bit of bookkeeping of its own, but that task is + * pushed to kernel space and we can mostly ignore it here, just remembering to + * set the GROWABLE flag so the kernel actually uses this path rather than + * allocating a gigantic amount up front and burning a hole in RAM. + * + * As far as determining which hierarchy levels to use, the simple answer is + * that right now, we don't. In the tiler configuration fields (consistent from + * the earliest Midgard's SFBD through the latest Bifrost traces we have), + * there is a hierarchy_mask field, controlling which levels (tile sizes) are + * enabled. 
Ideally, the hierarchical tiling dream -- mapping big polygons to + * big tiles and small polygons to small tiles -- would be realized here as + * well. As long as there are polygons at all needing tiling, we always have to + * have big tiles available, in case there are big polygons. But we don't + * necessarily need small tiles available. Ideally, when there are small + * polygons, small tiles are enabled (to avoid waste from putting small + * triangles in the big tiles); when there are not, small tiles are disabled to + * avoid enabling more levels than necessary, which potentially costs in memory + * bandwidth / power / tiler performance. + * + * Of course, the driver has to figure this out statically. When tile + * hierarchies are actually established, this is done by the tiler in + * fixed-function hardware, after the vertex shaders have run and there is + * sufficient information to figure out the size of triangles. The driver has + * no such luxury, again barring insane hacks like additionally running the + * vertex shaders in software or in hardware via transform feedback. Thus, for + * the driver, we need a heuristic approach. + * + * There are lots of heuristics to guess triangle size statically you could + * imagine, but one approach shines as particularly simple-stupid: assume all + * on-screen triangles are equal size and spread equidistantly throughout the + * screen. Let's be clear, this is NOT A VALID ASSUMPTION. 
But if we roll with + * it, then we see: + * + * Triangle Area = (Screen Area / # of triangles) + * = (Width * Height) / (# of triangles) + * + * Or if you prefer, we can also make a third CRAZY assumption that we only draw + * right triangles with edges parallel/perpendicular to the sides of the screen + * with no overdraw, forming a triangle grid across the screen: + * + * |--w--| + * _____ | + * | /| /| | + * |/_|/_| h + * | /| /| | + * |/_|/_| | + * + * Then you can use some middle school geometry and algebra to work out the + * triangle dimensions. I started working on this, but realised I didn't need + * to in order to make my point, but couldn't bear to erase that ASCII art. Anyway. + * + * POINT IS, by considering the ratio of screen area and triangle count, we can + * estimate the triangle size. For a small size, use small bins; for a large + * size, use large bins. Intuitively, this metric makes sense: when there are + * few triangles on a large screen, you're probably compositing a UI and + * therefore the triangles are large; when there are a lot of triangles on a + * small screen, you're probably rendering a 3D mesh and therefore the + * triangles are tiny. (Or better said -- there will be tiny triangles, even if + * there are also large triangles. There have to be unless you expect crazy + * overdraw. Generally, it's better to allow more small bin sizes than + * necessary than not allow enough.) + * + * From this heuristic (or whatever), we determine the minimum allowable tile + * size, and we use that to decide the hierarchy masking, selecting from the + * minimum "ideal" tile size to the maximum tile size (2048x2048). + * + * Once we have that mask and the framebuffer dimensions, we can compute the + * size of the statically-sized polygon list structures, allocate them, and go! 
+ * + */ + +/* Hierarchical tiling spans from 16x16 to 2048x2048 tiles */ + +#define MIN_TILE_SIZE 16 +#define MAX_TILE_SIZE 2048 + +/* Constants as shifts for easier power-of-two iteration */ + +#define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE) +#define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE) + +/* The hierarchy has a 64-byte prologue */ +#define PROLOGUE_SIZE 0x40 + +/* For each tile (across all hierarchy levels), there is 8 bytes of header */ +#define HEADER_BYTES_PER_TILE 0x8 + +/* Absent any geometry, the minimum size of the header */ +#define MINIMUM_HEADER_SIZE 0x200 + +/* If the width-x-height framebuffer is divided into tile_size-x-tile_size + * tiles, how many tiles are there? Rounding up in each direction. For the + * special case of tile_size=16, this aligns with the usual Midgard count. + * tile_size must be a power-of-two. Not really repeat code from AFBC/checksum, + * because those care about the stride (not just the overall count) and only at + * a a fixed-tile size (not any of a number of power-of-twos) */ + +static unsigned +pan_tile_count(unsigned width, unsigned height, unsigned tile_size) +{ + unsigned aligned_width = ALIGN_POT(width, tile_size); + unsigned aligned_height = ALIGN_POT(height, tile_size); + + unsigned tile_count_x = aligned_width / tile_size; + unsigned tile_count_y = aligned_height / tile_size; + + return tile_count_x * tile_count_y; +} + +/* For `masked_count` of the smallest tile sizes masked out, computes how the + * size of the polygon list header. We iterate the tile sizes (16x16 through + * 2048x2048, if nothing is masked; (16*2^masked_count)x(16*2^masked_count) + * through 2048x2048 more generally. For each tile size, we figure out how many + * tiles there are at this hierarchy level and therefore many bytes this level + * is, leaving us with a byte count for each level. We then just sum up the + * byte counts across the levels to find a byte count for all levels. 
*/ + +static unsigned +panfrost_raw_header_size(unsigned width, unsigned height, unsigned masked_count) +{ + unsigned size = PROLOGUE_SIZE; + + /* Normally we start at 16x16 tiles (MIN_TILE_SHIFT), but we add more + * if anything is masked off */ + + unsigned start_level = MIN_TILE_SHIFT + masked_count; + + /* Iterate hierarchy levels / tile sizes */ + + for (unsigned i = start_level; i < MAX_TILE_SHIFT; ++i) { + /* Shift from a level to a tile size */ + unsigned tile_size = (1 << i); + + unsigned tile_count = pan_tile_count(width, height, tile_size); + unsigned header_bytes = HEADER_BYTES_PER_TILE * tile_count; + + size += header_bytes; + } + + /* This size will be used as an offset, so ensure it's aligned */ + return ALIGN_POT(size, 512); +} + +/* Given a hierarchy mask and a framebuffer size, compute the header size */ + +unsigned +panfrost_tiler_header_size(unsigned width, unsigned height, uint8_t mask) +{ + /* If no hierarchy levels are enabled, that means there is no geometry + * for the tiler to process, so use a minimum size. Used for clears */ + + if (mask == 0x00) + return MINIMUM_HEADER_SIZE; + + /* Some levels are enabled. Ensure that only smaller levels are + * disabled and there are no gaps. Theoretically the hardware is more + * flexible, but there's no known reason to use other configurations + * and this keeps the code simple. Since we know the 0x80 bit is set, + * ctz(mask) will return the number of masked off levels. */ + + unsigned masked_count = __builtin_ctz(mask); + + assert(mask & 0x80); + assert(((mask >> masked_count) & ((mask >> masked_count) + 1)) == 0); + + /* Everything looks good. Use the number of trailing zeroes we found to + * figure out how many smaller levels are disabled to compute the + * actual header size */ + + return panfrost_raw_header_size(width, height, masked_count); +} + +/* The body seems to be about 512 bytes per tile. 
Noting that the header is + * about 8 bytes per tile, we can be a little sloppy and estimate the body size + * to be equal to the header size * (512/8). Given the header size is a + * considerable overestimate, this is fine. Eventually, we should maybe figure + * out how to actually implement this. */ + +unsigned +panfrost_tiler_body_size(unsigned width, unsigned height, uint8_t mask) +{ + /* No levels means no body */ + if (!mask) + return 0x00; + + unsigned header_size = panfrost_tiler_header_size(width, height, mask); + return ALIGN_POT(header_size * 512 / 8, 512); +} + + +/* In the future, a heuristic to choose a tiler hierarchy mask would go here. + * At the moment, we just default to 0xFF, which enables all possible hierarchy + * levels. Overall this yields good performance but presumably incurs a cost in + * memory bandwidth / power consumption / etc, at least on smaller scenes that + * don't really need all the smaller levels enabled */ + +unsigned +panfrost_choose_hierarchy_mask( + unsigned width, unsigned height, + unsigned vertex_count) +{ + /* If there is no geometry, we don't bother enabling anything */ + + if (!vertex_count) + return 0x00; + + /* Otherwise, default everything on. TODO: Proper tests */ + + return 0xFF; +}