98ef6827a8045a0b762b4a4e116680e92943cbf7
2 * Copyright (C) 2019 Collabora, Ltd.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
27 #include "util/u_math.h"
28 #include "util/macros.h"
29 #include "pan_encoder.h"
31 /* Mali GPUs are tiled-mode renderers, rather than immediate-mode.
32 * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run.
33 * Then, a fixed-function hardware block (the tiler) consumes the gl_Position
34 * results. For each triangle specified, it marks each containing tile as
35 * containing that triangle. This set of "triangles per tile" form the "polygon
36 * list". Finally, the rasterization unit consumes the polygon list to invoke
37 * the fragment shader.
39 * In practice, it's a bit more complicated than this. On Midgard chips with an
40 * "advanced tiling unit" (all except T720/T820/T830), 16x16 is the logical
41 * tile size, but Midgard features "hierarchical tiling", where power-of-two
42 * multiples of the base tile size can be used: hierarchy level 0 (16x16),
43 * level 1 (32x32), level 2 (64x64), per public information about Midgard's
44 * tiling. In fact, tiling goes up to 4096x4096 (!), although in practice
45 * 128x128 is the largest usually used (though higher modes are enabled). The
46 * idea behind hierarchical tiling is to use low tiling levels for small
47 * triangles and high levels for large triangles, to minimize memory bandwidth
48 * and repeated fragment shader invocations (the former issue inherent to
49 * immediate-mode rendering and the latter common in traditional tilers).
51 * The tiler itself works by reading varyings in and writing a polygon list
52 * out. Unfortunately (for us), both of these buffers are managed in main
53 * memory; although they ideally will be cached, it is the drivers'
54 * responsibility to allocate these buffers. Varying buffer allocation is
55 * handled elsewhere, as it is not tiler specific; the real issue is allocating the polygon list.
58 * This is hard, because from the driver's perspective, we have no information
59 * about what geometry will actually look like on screen; that information is
60 * only gained from running the vertex shader. (Theoretically, we could run the
61 * vertex shaders in software as a prepass, or in hardware with transform
62 * feedback as a prepass, but either idea is ludicrous on so many levels).
64 * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list
65 * into three distinct pieces. First, the driver statically determines which
66 * tile hierarchy levels to use (more on that later). At this point, we know the
67 * framebuffer dimensions and all the possible tilings of the framebuffer, so
68 * we know exactly how many tiles exist across all hierarchy levels. The first
69 * piece of the polygon list is the header, which is exactly 8 bytes per tile,
70 * plus padding and a small 64-byte prologue. (If that doesn't remind you of
71 * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is
72 * the polygon list body, which seems to contain 512 bytes per tile, again
73 * across every level of the hierarchy. These two parts form the polygon list
74 * buffer. This buffer has a statically determinable size, approximately equal
75 * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus
76 * alignment / minimum restrictions / etc.
78 * The third piece is the easy one (for us): the tiler heap. In essence, the
79 * tiler heap is a gigantic slab that's as big as could possibly be necessary
80 * in the worst case imaginable. Just... a gigantic allocation that we give a
81 * start and end pointer to. What's the catch? The tiler heap is lazily
82 * allocated; that is, a huge amount of memory is _reserved_, but only a tiny
83 * bit is actually allocated upfront. The GPU just keeps using the
84 * unallocated-but-reserved portions as it goes along, generating page faults
85 * if it goes beyond the allocation, and then the kernel is instructed to
86 * expand the allocation on page fault (known in the vendor kernel as growable
87 * memory). This is quite a bit of bookkeeping of its own, but that task is
88 * pushed to kernel space and we can mostly ignore it here, just remembering to
89 * set the GROWABLE flag so the kernel actually uses this path rather than
90 * allocating a gigantic amount up front and burning a hole in RAM.
92 * As far as determining which hierarchy levels to use, the simple answer is
93 * that right now, we don't. In the tiler configuration fields (consistent from
94 * the earliest Midgard's SFBD through the latest Bifrost traces we have),
95 * there is a hierarchy_mask field, controlling which levels (tile sizes) are
96 * enabled. Ideally, the hierarchical tiling dream -- mapping big polygons to
97 * big tiles and small polygons to small tiles -- would be realized here as
98 * well. As long as there are polygons at all needing tiling, we always have to
99 * have big tiles available, in case there are big polygons. But we don't
100 * necessarily need small tiles available. Ideally, when there are small
101 * polygons, small tiles are enabled (to avoid waste from putting small
102 * triangles in the big tiles); when there are not, small tiles are disabled to
103 * avoid enabling more levels than necessary, which potentially costs in memory
104 * bandwidth / power / tiler performance.
106 * Of course, the driver has to figure this out statically. When tile
107 * hierarchies are actually established, this occurs by the tiler in
108 * fixed-function hardware, after the vertex shaders have run and there is
109 * sufficient information to figure out the size of triangles. The driver has
110 * no such luxury, again barring insane hacks like additionally running the
111 * vertex shaders in software or in hardware via transform feedback. Thus, for
112 * the driver, we need a heuristic approach.
114 * There are lots of heuristics to guess triangle size statically you could
115 * imagine, but one approach shines as particularly simple-stupid: assume all
116 * on-screen triangles are equal size and spread equidistantly throughout the
117 * screen. Let's be clear, this is NOT A VALID ASSUMPTION. But if we roll with it, then:
120 * Triangle Area = (Screen Area / # of triangles)
121 * = (Width * Height) / (# of triangles)
123 * Or if you prefer, we can also make a third CRAZY assumption that we only draw
124 * right triangles with edges parallel/perpendicular to the sides of the screen
125 * with no overdraw, forming a triangle grid across the screen:
134 * Then you can use some middle school geometry and algebra to work out the
135 * triangle dimensions. I started working on this, but realised I didn't need
136 * to in order to make my point, but couldn't bear to erase that ASCII art. Anyway.
138 * POINT IS, by considering the ratio of screen area and triangle count, we can
139 * estimate the triangle size. For a small size, use small bins; for a large
140 * size, use large bins. Intuitively, this metric makes sense: when there are
141 * few triangles on a large screen, you're probably compositing a UI and
142 * therefore the triangles are large; when there are a lot of triangles on a
143 * small screen, you're probably rendering a 3D mesh and therefore the
144 * triangles are tiny. (Or better said -- there will be tiny triangles, even if
145 * there are also large triangles. There have to be unless you expect crazy
146 * overdraw. Generally, it's better to allow more small bin sizes than
147 * necessary than not allow enough.)
149 * From this heuristic (or whatever), we determine the minimum allowable tile
150 * size, and we use that to decide the hierarchy masking, selecting from the
151 * minimum "ideal" tile size to the maximum tile size (2048x2048 in practice).
153 * Once we have that mask and the framebuffer dimensions, we can compute the
154 * size of the statically-sized polygon list structures, allocate them, and go!
158 * On T720, T820, and T830, there is no support for hierarchical tiling.
159 * Instead, the hardware allows the driver to select the tile size dynamically
160 * on a per-framebuffer basis, including allowing rectangular/non-square tiles.
161 * Rules for tile size selection are as follows:
163 * - Dimensions must be powers-of-two.
164 * - The smallest tile is 16x16.
165 * - The tile width/height is at most the framebuffer w/h (clamp up to 16 pix)
166 * - There must be no more than 64 tiles in either dimension.
168 * Within these constraints, the driver is free to pick a tile size according
169 * to some heuristic, similar to units with an advanced tiling unit.
171 * To pick a size without any heuristics, we may satisfy the constraints by
172 * defaulting to 16x16 (a power-of-two). This fits the minimum. For the size
173 * constraint, consider:
176 * ceil (fb / tile) < 64
177 * (fb / tile) <= (64 - 1)
178 * tile >= fb / (64 - 1), satisfied by tile = next_power_of_two(fb / (64 - 1))
180 * Hence we clamp up to align_pot(fb / (64 - 1)).
182 * Extending to use a selection heuristic left for future work.
184 * Once the tile size (w, h) is chosen, we compute the hierarchy "mask":
186 * hierarchy_mask = (log2(h / 16) << 6) | log2(w / 16)
188 * Of course with no hierarchical tiling, this is not a mask; it's just a field
189 * specifying the tile size. But I digress.
191 * We also compute the polygon list sizes (with framebuffer size W, H) as:
193 * full_size = 0x200 + 0x200 * ceil(W / w) * ceil(H / h)
194 * offset = 8 * ceil(W / w) * ceil(H / h)
196 * It further appears necessary to round down offset to the nearest 0x200.
197 * Possibly we would also round down full_size to the nearest 0x200 but
198 * full_size/0x200 = (1 + ceil(W / w) * ceil(H / h)) is an integer so there's nothing to do. */
/* Hierarchical tiling spans from 16x16 to 4096x4096 tiles */

#define MIN_TILE_SIZE 16
#define MAX_TILE_SIZE 4096

/* Constants as shifts for easier power-of-two iteration */

#define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE)
#define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE)

/* The hierarchy has a 64-byte prologue */
#define PROLOGUE_SIZE 0x40

/* For each tile (across all hierarchy levels), there is 8 bytes of header */
#define HEADER_BYTES_PER_TILE 0x8

/* Likewise, each tile per level has 512 bytes of body */
#define FULL_BYTES_PER_TILE 0x200

/* Absent any geometry, the minimum size of the header */
#define MINIMUM_HEADER_SIZE 0x200

/* Mask of valid hierarchy levels: one bit for each level from min...max
 * inclusive. With the 16..4096 range above this evaluates to 0x1FF, i.e.
 * nine bits, where bit 0 is the 16x16 level and bit 8 is 4096x4096. */
#define HIERARCHY_MASK (((MAX_TILE_SIZE / MIN_TILE_SIZE) << 1) - 1)
/* If the width-x-height framebuffer is divided into tile_size-x-tile_size
 * tiles, how many tiles are there? The count is rounded up in each direction,
 * so a partially covered row or column of tiles still counts in full. For the
 * special case of tile_size=16, this aligns with the usual Midgard count.
 *
 * tile_size must be non-zero; callers in this file always pass a power of
 * two. Not really repeat code from AFBC/checksum, because those care about
 * the stride (not just the overall count) and only at a fixed tile size (not
 * any of a number of powers of two).
 *
 * Returns tiles_x * tiles_y; a zero width or height yields zero tiles. */

static unsigned
pan_tile_count(unsigned width, unsigned height, unsigned tile_size)
{
        assert(tile_size > 0);

        /* Ceiling division written so that the intermediate cannot wrap,
         * unlike aligning the dimension up first and then dividing. */
        unsigned tile_count_x = (width / tile_size) + ((width % tile_size) != 0);
        unsigned tile_count_y = (height / tile_size) + ((height % tile_size) != 0);

        return tile_count_x * tile_count_y;
}
247 /* For `masked_count` of the smallest tile sizes masked out, computes how the
248 * size of the polygon list header. We iterate the tile sizes (16x16 through
249 * 2048x2048, if nothing is masked; (16*2^masked_count)x(16*2^masked_count)
250 * through 2048x2048 more generally. For each tile size, we figure out how many
251 * tiles there are at this hierarchy level and therefore many bytes this level
252 * is, leaving us with a byte count for each level. We then just sum up the
253 * byte counts across the levels to find a byte count for all levels. */
256 panfrost_raw_segment_size(
259 unsigned masked_count
,
261 unsigned bytes_per_tile
)
263 unsigned size
= PROLOGUE_SIZE
;
265 /* Normally we start at 16x16 tiles (MIN_TILE_SHIFT), but we add more
266 * if anything is masked off */
268 unsigned start_level
= MIN_TILE_SHIFT
+ masked_count
;
270 /* Iterate hierarchy levels / tile sizes */
272 for (unsigned i
= start_level
; i
<= end_level
; ++i
) {
273 /* Shift from a level to a tile size */
274 unsigned tile_size
= (1 << i
);
276 unsigned tile_count
= pan_tile_count(width
, height
, tile_size
);
277 unsigned level_count
= bytes_per_tile
* tile_count
;
282 /* This size will be used as an offset, so ensure it's aligned */
283 return ALIGN_POT(size
, 512);
286 /* Given a hierarchy mask and a framebuffer size, compute the size of one of
287 * the segments (header or body) */
290 panfrost_segment_size(
291 unsigned width
, unsigned height
,
292 unsigned mask
, unsigned bytes_per_tile
)
294 /* The tiler-disabled case should have been handled by the caller */
297 /* Some levels are enabled. Ensure that only smaller levels are
298 * disabled and there are no gaps. Theoretically the hardware is more
299 * flexible, but there's no known reason to use other configurations
300 * and this keeps the code simple. Since we know the 0x80 or 0x100 bit
301 * is set, ctz(mask) will return the number of masked off levels. */
303 unsigned masked_count
= __builtin_ctz(mask
);
305 assert(mask
& (0x80 | 0x100));
306 assert(((mask
>> masked_count
) & ((mask
>> masked_count
) + 1)) == 0);
308 /* Figure out the top level */
309 unsigned unused_count
= __builtin_clz(mask
);
310 unsigned top_bit
= ((8 * sizeof(mask
)) - 1) - unused_count
;
312 /* We don't have bits for nonexistant levels below 16x16 */
313 unsigned top_level
= top_bit
+ 4;
315 /* Everything looks good. Use the number of trailing zeroes we found to
316 * figure out how many smaller levels are disabled to compute the
317 * actual header size */
319 return panfrost_raw_segment_size(width
, height
,
320 masked_count
, top_level
, bytes_per_tile
);
324 /* Given a hierarchy mask and a framebuffer size, compute the header size */
327 panfrost_tiler_header_size(unsigned width
, unsigned height
, unsigned mask
)
329 mask
&= HIERARCHY_MASK
;
331 /* If no hierarchy levels are enabled, that means there is no geometry
332 * for the tiler to process, so use a minimum size. Used for clears */
335 return MINIMUM_HEADER_SIZE
;
337 return panfrost_segment_size(width
, height
, mask
, HEADER_BYTES_PER_TILE
);
340 /* The combined header/body is sized similarly (but it is significantly
341 * larger), except that it can be empty when the tiler disabled, rather than
342 * getting clamped to a minimum size.
346 panfrost_tiler_full_size(unsigned width
, unsigned height
, unsigned mask
)
348 mask
&= HIERARCHY_MASK
;
351 return MINIMUM_HEADER_SIZE
;
353 return panfrost_segment_size(width
, height
, mask
, FULL_BYTES_PER_TILE
);
356 /* In the future, a heuristic to choose a tiler hierarchy mask would go here.
357 * At the moment, we just default to 0xFF, which enables all possible hierarchy
358 * levels. Overall this yields good performance but presumably incurs a cost in
359 * memory bandwidth / power consumption / etc, at least on smaller scenes that
360 * don't really need all the smaller levels enabled */
363 panfrost_choose_hierarchy_mask(
364 unsigned width
, unsigned height
,
365 unsigned vertex_count
)
367 /* If there is no geometry, we don't bother enabling anything */
372 /* Otherwise, default everything on. TODO: Proper tests */