src/panfrost/encoder/pan_tiler.c

   1 /*
   2  * Copyright (C) 2019 Collabora, Ltd.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  *
  23  * Authors:
  24  *   Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
  25  */
  26
  27 #include "util/u_math.h"
  28 #include "util/macros.h"
  29 #include "pan_encoder.h"
  30
  31 /* Mali GPUs are tiled-mode renderers, rather than immediate-mode.
  32  * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run.
  33  * Then, a fixed-function hardware block (the tiler) consumes the gl_Position
  34  * results. For each triangle specified, it marks each containing tile as
  35  * containing that triangle. This set of "triangles per tile" form the "polygon
  36  * list". Finally, the rasterization unit consumes the polygon list to invoke
  37  * the fragment shader.
  38  *
  39  * In practice, it's a bit more complicated than this. On Midgard chips with an
  40  * "advanced tiling unit" (all except T720/T820/T830), 16x16 is the logical
  41  * tile size, but Midgard features "hierarchical tiling", where power-of-two
  42  * multiples of the base tile size can be used: hierarchy level 0 (16x16),
  43  * level 1 (32x32), level 2 (64x64), per public information about Midgard's
  44  * tiling. In fact, tiling goes up to 4096x4096 (!), although in practice
  45  * 128x128 is the largest usually used (though higher modes are enabled).  The
  46  * idea behind hierarchical tiling is to use low tiling levels for small
  47  * triangles and high levels for large triangles, to minimize memory bandwidth
  48  * and repeated fragment shader invocations (the former issue inherent to
  49  * immediate-mode rendering and the latter common in traditional tilers).
  50  *
  51  * The tiler itself works by reading varyings in and writing a polygon list
  52  * out. Unfortunately (for us), both of these buffers are managed in main
  53  * memory; although they ideally will be cached, it is the drivers'
  54  * responsibility to allocate these buffers. Varying buffer allocation is
  55  * handled elsewhere, as it is not tiler specific; the real issue is allocating
  56  * the polygon list.
  57  *
  58  * This is hard, because from the driver's perspective, we have no information
  59  * about what geometry will actually look like on screen; that information is
  60  * only gained from running the vertex shader. (Theoretically, we could run the
  61  * vertex shaders in software as a prepass, or in hardware with transform
  62  * feedback as a prepass, but either idea is ludicrous on so many levels).
  63  *
  64  * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list
  65  * into three distinct pieces. First, the driver statically determines which
  66  * tile hierarchy levels to use (more on that later). At this point, we know the
  67  * framebuffer dimensions and all the possible tilings of the framebuffer, so
  68  * we know exactly how many tiles exist across all hierarchy levels. The first
  69  * piece of the polygon list is the header, which is exactly 8 bytes per tile,
  70  * plus padding and a small 64-byte prologue. (If that doesn't remind you of
  71  * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is
  72  * the polygon list body, which seems to contain 512 bytes per tile, again
  73  * across every level of the hierarchy. These two parts form the polygon list
  74  * buffer. This buffer has a statically determinable size, approximately equal
  75  * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus
  76  * alignment / minimum restrictions / etc.
  77  *
  78  * The third piece is the easy one (for us): the tiler heap. In essence, the
  79  * tiler heap is a gigantic slab that's as big as could possibly be necessary
  80  * in the worst case imaginable. Just... a gigantic allocation that we give a
  81  * start and end pointer to. What's the catch? The tiler heap is lazily
  82  * allocated; that is, a huge amount of memory is _reserved_, but only a tiny
  83  * bit is actually allocated upfront. The GPU just keeps using the
  84  * unallocated-but-reserved portions as it goes along, generating page faults
  85  * if it goes beyond the allocation, and then the kernel is instructed to
  86  * expand the allocation on page fault (known in the vendor kernel as growable
  87  * memory). This is quite a bit of bookkeeping of its own, but that task is
  88  * pushed to kernel space and we can mostly ignore it here, just remembering to
  89  * set the GROWABLE flag so the kernel actually uses this path rather than
  90  * allocating a gigantic amount up front and burning a hole in RAM.
  91  *
  92  * As far as determining which hierarchy levels to use, the simple answer is
  93  * that right now, we don't. In the tiler configuration fields (consistent from
  94  * the earliest Midgard's SFBD through the latest Bifrost traces we have),
  95  * there is a hierarchy_mask field, controlling which levels (tile sizes) are
  96  * enabled. Ideally, the hierarchical tiling dream -- mapping big polygons to
  97  * big tiles and small polygons to small tiles -- would be realized here as
  98  * well. As long as there are polygons at all needing tiling, we always have to
  99  * have big tiles available, in case there are big polygons. But we don't
 100  * necessarily need small tiles available. Ideally, when there are small
 101  * polygons, small tiles are enabled (to avoid waste from putting small
 102  * triangles in the big tiles); when there are not, small tiles are disabled to
 103  * avoid enabling more levels than necessary, which potentially costs in memory
 104  * bandwidth / power / tiler performance.
 105  *
 106  * Of course, the driver has to figure this out statically. When tile
  107  * hierarchies are actually established, this occurs by the tiler in
 108  * fixed-function hardware, after the vertex shaders have run and there is
 109  * sufficient information to figure out the size of triangles. The driver has
 110  * no such luxury, again barring insane hacks like additionally running the
 111  * vertex shaders in software or in hardware via transform feedback. Thus, for
 112  * the driver, we need a heuristic approach.
 113  *
 114  * There are lots of heuristics to guess triangle size statically you could
 115  * imagine, but one approach shines as particularly simple-stupid: assume all
 116  * on-screen triangles are equal size and spread equidistantly throughout the
 117  * screen. Let's be clear, this is NOT A VALID ASSUMPTION. But if we roll with
 118  * it, then we see:
 119  *
 120  *      Triangle Area   = (Screen Area / # of triangles)
 121  *                      = (Width * Height) / (# of triangles)
 122  *
 123  * Or if you prefer, we can also make a third CRAZY assumption that we only draw
 124  * right triangles with edges parallel/perpendicular to the sides of the screen
 125  * with no overdraw, forming a triangle grid across the screen:
 126  *
 127  * |--w--|
 128  *  _____   |
 129  * | /| /|  |
 130  * |/_|/_|  h
 131  * | /| /|  |
 132  * |/_|/_|  |
 133  *
 134  * Then you can use some middle school geometry and algebra to work out the
  135  * triangle dimensions. I started working on this, but realised I didn't need
  136  * to in order to make my point, but couldn't bear to erase that ASCII art. Anyway.
 137  *
 138  * POINT IS, by considering the ratio of screen area and triangle count, we can
 139  * estimate the triangle size. For a small size, use small bins; for a large
 140  * size, use large bins. Intuitively, this metric makes sense: when there are
 141  * few triangles on a large screen, you're probably compositing a UI and
 142  * therefore the triangles are large; when there are a lot of triangles on a
 143  * small screen, you're probably rendering a 3D mesh and therefore the
 144  * triangles are tiny. (Or better said -- there will be tiny triangles, even if
 145  * there are also large triangles. There have to be unless you expect crazy
 146  * overdraw. Generally, it's better to allow more small bin sizes than
 147  * necessary than not allow enough.)
 148  *
 149  * From this heuristic (or whatever), we determine the minimum allowable tile
 150  * size, and we use that to decide the hierarchy masking, selecting from the
 151  * minimum "ideal" tile size to the maximum tile size (2048x2048 in practice).
 152  *
 153  * Once we have that mask and the framebuffer dimensions, we can compute the
 154  * size of the statically-sized polygon list structures, allocate them, and go!
 155  *
 156  * -----
 157  *
 158  * On T720, T820, and T830, there is no support for hierarchical tiling.
 159  * Instead, the hardware allows the driver to select the tile size dynamically
 160  * on a per-framebuffer basis, including allowing rectangular/non-square tiles.
 161  * Rules for tile size selection are as follows:
 162  *
 163  *  - Dimensions must be powers-of-two.
 164  *  - The smallest tile is 16x16.
 165  *  - The tile width/height is at most the framebuffer w/h (clamp up to 16 pix)
 166  *  - There must be no more than 64 tiles in either dimension.
 167  *
 168  * Within these constraints, the driver is free to pick a tile size according
 169  * to some heuristic, similar to units with an advanced tiling unit.
 170  *
 171  * To pick a size without any heuristics, we may satisfy the constraints by
 172  * defaulting to 16x16 (a power-of-two). This fits the minimum. For the size
 173  * constraint, consider:
 174  *
 175  *      # of tiles < 64
 176  *      ceil (fb / tile) < 64
 177  *      (fb / tile) <= (64 - 1)
 178  *      tile <= fb / (64 - 1) <= next_power_of_two(fb / (64 - 1))
 179  *
 180  * Hence we clamp up to align_pot(fb / (64 - 1)).
  181  *
 182  * Extending to use a selection heuristic left for future work.
 183  *
 184  * Once the tile size (w, h) is chosen, we compute the hierarchy "mask":
 185  *
 186  *      hierarchy_mask = (log2(h / 16) << 6) | log2(w / 16)
 187  *
 188  * Of course with no hierarchical tiling, this is not a mask; it's just a field
 189  * specifying the tile size. But I digress.
 190  *
  191  * We also compute the polygon list sizes (with framebuffer size W, H) as:
 192  *
 193  *      full_size = 0x200 + 0x200 * ceil(W / w) * ceil(H / h)
 194  *      offset = 8 * ceil(W / w) * ceil(H / h)
 195  *
 196  * It further appears necessary to round down offset to the nearest 0x200.
 197  * Possibly we would also round down full_size to the nearest 0x200 but
 198  * full_size/0x200 = (1 + ceil(W / w) * ceil(H / h)) is an integer so there's
 199  * nothing to do.
 200  */
 201
 202 /* Hierarchical tiling spans from 16x16 to 4096x4096 tiles */
 203
 204 #define MIN_TILE_SIZE 16
 205 #define MAX_TILE_SIZE 4096
 206
 207 /* Constants as shifts for easier power-of-two iteration */
 208
 209 #define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE)
 210 #define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE)
 211
 212 /* The hierarchy has a 64-byte prologue */
 213 #define PROLOGUE_SIZE 0x40
 214
 215 /* For each tile (across all hierarchy levels), there is 8 bytes of header */
 216 #define HEADER_BYTES_PER_TILE 0x8
 217
 218 /* Likewise, each tile per level has 512 bytes of body */
 219 #define FULL_BYTES_PER_TILE 0x200
 220
 221 /* Absent any geometry, the minimum size of the header */
 222 #define MINIMUM_HEADER_SIZE 0x200
 223
 224 /* Mask of valid hierarchy levels: one bit for each level from min...max
 225  * inclusive */
 226 #define HIERARCHY_MASK (((MAX_TILE_SIZE / MIN_TILE_SIZE) << 1) - 1)
 227
 228 /* If the width-x-height framebuffer is divided into tile_size-x-tile_size
 229  * tiles, how many tiles are there? Rounding up in each direction. For the
 230  * special case of tile_size=16, this aligns with the usual Midgard count.
 231  * tile_size must be a power-of-two. Not really repeat code from AFBC/checksum,
 232  * because those care about the stride (not just the overall count) and only at
 233  * a a fixed-tile size (not any of a number of power-of-twos) */
 234
 235 static unsigned
 236 pan_tile_count(unsigned width, unsigned height, unsigned tile_size)
 237 {
 238         unsigned aligned_width = ALIGN_POT(width, tile_size);
 239         unsigned aligned_height = ALIGN_POT(height, tile_size);
 240
 241         unsigned tile_count_x = aligned_width / tile_size;
 242         unsigned tile_count_y = aligned_height / tile_size;
 243
 244         return tile_count_x * tile_count_y;
 245 }
 246
 247 /* For `masked_count` of the smallest tile sizes masked out, computes how the
 248  * size of the polygon list header. We iterate the tile sizes (16x16 through
 249  * 2048x2048, if nothing is masked; (16*2^masked_count)x(16*2^masked_count)
 250  * through 2048x2048 more generally. For each tile size, we figure out how many
 251  * tiles there are at this hierarchy level and therefore many bytes this level
 252  * is, leaving us with a byte count for each level. We then just sum up the
 253  * byte counts across the levels to find a byte count for all levels. */
 254
 255 static unsigned
 256 panfrost_raw_segment_size(
 257                 unsigned width,
 258                 unsigned height,
 259                 unsigned masked_count,
 260                 unsigned end_level,
 261                 unsigned bytes_per_tile)
 262 {
 263         unsigned size = PROLOGUE_SIZE;
 264
 265         /* Normally we start at 16x16 tiles (MIN_TILE_SHIFT), but we add more
 266          * if anything is masked off */
 267
 268         unsigned start_level = MIN_TILE_SHIFT + masked_count;
 269
 270         /* Iterate hierarchy levels / tile sizes */
 271
 272         for (unsigned i = start_level; i <= end_level; ++i) {
 273                 /* Shift from a level to a tile size */
 274                 unsigned tile_size = (1 << i);
 275
 276                 unsigned tile_count = pan_tile_count(width, height, tile_size);
 277                 unsigned level_count = bytes_per_tile * tile_count;
 278
 279                 size += level_count;
 280         }
 281
 282         /* This size will be used as an offset, so ensure it's aligned */
 283         return ALIGN_POT(size, 512);
 284 }
 285
 286 /* Given a hierarchy mask and a framebuffer size, compute the size of one of
 287  * the segments (header or body) */
 288
 289 static unsigned
 290 panfrost_segment_size(
 291                 unsigned width, unsigned height,
 292                 unsigned mask, unsigned bytes_per_tile)
 293 {
 294         /* The tiler-disabled case should have been handled by the caller */
 295         assert(mask);
 296
 297         /* Some levels are enabled. Ensure that only smaller levels are
 298          * disabled and there are no gaps. Theoretically the hardware is more
 299          * flexible, but there's no known reason to use other configurations
 300          * and this keeps the code simple. Since we know the 0x80 or 0x100 bit
 301          * is set, ctz(mask) will return the number of masked off levels. */
 302
 303         unsigned masked_count = __builtin_ctz(mask);
 304
 305         assert(mask & (0x80 | 0x100));
 306         assert(((mask >> masked_count) & ((mask >> masked_count) + 1)) == 0);
 307
 308         /* Figure out the top level */
 309         unsigned unused_count = __builtin_clz(mask);
 310         unsigned top_bit = ((8 * sizeof(mask)) - 1) - unused_count;
 311
 312         /* We don't have bits for nonexistant levels below 16x16 */
 313         unsigned top_level = top_bit + 4;
 314
 315         /* Everything looks good. Use the number of trailing zeroes we found to
 316          * figure out how many smaller levels are disabled to compute the
 317          * actual header size */
 318
 319         return panfrost_raw_segment_size(width, height,
 320                         masked_count, top_level, bytes_per_tile);
 321 }
 322
 323
 324 /* Given a hierarchy mask and a framebuffer size, compute the header size */
 325
 326 unsigned
 327 panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask)
 328 {
 329         mask &= HIERARCHY_MASK;
 330
 331         /* If no hierarchy levels are enabled, that means there is no geometry
 332          * for the tiler to process, so use a minimum size. Used for clears */
 333
 334         if (mask == 0x00)
 335                 return MINIMUM_HEADER_SIZE;
 336
 337         return panfrost_segment_size(width, height, mask, HEADER_BYTES_PER_TILE);
 338 }
 339
 340 /* The combined header/body is sized similarly (but it is significantly
 341  * larger), except that it can be empty when the tiler disabled, rather than
 342  * getting clamped to a minimum size.
 343  */
 344
 345 unsigned
 346 panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask)
 347 {
 348         mask &= HIERARCHY_MASK;
 349
 350         if (mask == 0x00)
 351                 return MINIMUM_HEADER_SIZE;
 352
 353         return panfrost_segment_size(width, height, mask, FULL_BYTES_PER_TILE);
 354 }
 355
 356 /* In the future, a heuristic to choose a tiler hierarchy mask would go here.
 357  * At the moment, we just default to 0xFF, which enables all possible hierarchy
 358  * levels. Overall this yields good performance but presumably incurs a cost in
 359  * memory bandwidth / power consumption / etc, at least on smaller scenes that
 360  * don't really need all the smaller levels enabled */
 361
 362 unsigned
 363 panfrost_choose_hierarchy_mask(
 364         unsigned width, unsigned height,
 365         unsigned vertex_count)
 366 {
 367         /* If there is no geometry, we don't bother enabling anything */
 368
 369         if (!vertex_count)
 370                 return 0x00;
 371
 372         /* Otherwise, default everything on. TODO: Proper tests */
 373
 374         return 0xFF;
 375 }