panfrost: Stub out hierarchy mask selection
[mesa.git] / src / gallium / drivers / panfrost / pan_tiler.c
1 /*
2 * Copyright (C) 2019 Collabora
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25 */
26
27 #include "util/u_math.h"
28 #include "util/macros.h"
29 #include "pan_tiler.h"
30
31 /* Mali GPUs are tiled-mode renderers, rather than immediate-mode.
32 * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run.
33 * Then, a fixed-function hardware block (the tiler) consumes the gl_Position
34 * results. For each triangle specified, it marks each containing tile as
35 * containing that triangle. This set of "triangles per tile" form the "polygon
36 * list". Finally, the rasterization unit consumes the polygon list to invoke
37 * the fragment shader.
38 *
39 * In practice, it's a bit more complicated than this. 16x16 is the logical
40 * tile size, but Midgard features "hierarchical tiling", where power-of-two
41 * multiples of the base tile size can be used: hierarchy level 0 (16x16),
42 * level 1 (32x32), level 2 (64x64), per public information about Midgard's
43 * tiling. In fact, tiling goes up to 2048x2048 (!), although in practice
44 * 128x128 is the largest usually used (though higher modes are enabled). The
45 * idea behind hierarchical tiling is to use low tiling levels for small
46 * triangles and high levels for large triangles, to minimize memory bandwidth
47 * and repeated fragment shader invocations (the former issue inherent to
48 * immediate-mode rendering and the latter common in traditional tilers).
49 *
50 * The tiler itself works by reading varyings in and writing a polygon list
51 * out. Unfortunately (for us), both of these buffers are managed in main
52 * memory; although they ideally will be cached, it is the driver's
53 * responsibility to allocate these buffers. Varying buffer allocation is
54 * handled elsewhere, as it is not tiler specific; the real issue is allocating
55 * the polygon list.
56 *
57 * This is hard, because from the driver's perspective, we have no information
58 * about what geometry will actually look like on screen; that information is
59 * only gained from running the vertex shader. (Theoretically, we could run the
60 * vertex shaders in software as a prepass, or in hardware with transform
61 * feedback as a prepass, but either idea is ludicrous on so many levels).
62 *
63 * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list
64 * into three distinct pieces. First, the driver statically determines which
65 * tile hierarchy levels to use (more on that later). At this point, we know the
66 * framebuffer dimensions and all the possible tilings of the framebuffer, so
67 * we know exactly how many tiles exist across all hierarchy levels. The first
68 * piece of the polygon list is the header, which is exactly 8 bytes per tile,
69 * plus padding and a small 64-byte prologue. (If that doesn't remind you of
70 * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is
71 * the polygon list body, which seems to contain 512 bytes per tile, again
72 * across every level of the hierarchy. These two parts form the polygon list
73 * buffer. This buffer has a statically determinable size, approximately equal
74 * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus
75 * alignment / minimum restrictions / etc.
76 *
77 * The third piece is the easy one (for us): the tiler heap. In essence, the
78 * tiler heap is a gigantic slab that's as big as could possibly be necessary
79 * in the worst case imaginable. Just... a gigantic allocation that we give a
80 * start and end pointer to. What's the catch? The tiler heap is lazily
81 * allocated; that is, a huge amount of memory is _reserved_, but only a tiny
82 * bit is actually allocated upfront. The GPU just keeps using the
83 * unallocated-but-reserved portions as it goes along, generating page faults
84 * if it goes beyond the allocation, and then the kernel is instructed to
85 * expand the allocation on page fault (known in the vendor kernel as growable
86 * memory). This is quite a bit of bookkeeping of its own, but that task is
87 * pushed to kernel space and we can mostly ignore it here, just remembering to
88 * set the GROWABLE flag so the kernel actually uses this path rather than
89 * allocating a gigantic amount up front and burning a hole in RAM.
90 *
91 * As far as determining which hierarchy levels to use, the simple answer is
92 * that right now, we don't. In the tiler configuration fields (consistent from
93 * the earliest Midgard's SFBD through the latest Bifrost traces we have),
94 * there is a hierarchy_mask field, controlling which levels (tile sizes) are
95 * enabled. Ideally, the hierarchical tiling dream -- mapping big polygons to
96 * big tiles and small polygons to small tiles -- would be realized here as
97 * well. As long as there are polygons at all needing tiling, we always have to
98 * have big tiles available, in case there are big polygons. But we don't
99 * necessarily need small tiles available. Ideally, when there are small
100 * polygons, small tiles are enabled (to avoid waste from putting small
101 * triangles in the big tiles); when there are not, small tiles are disabled to
102 * avoid enabling more levels than necessary, which potentially costs in memory
103 * bandwidth / power / tiler performance.
104 *
105 * Of course, the driver has to figure this out statically. When tile
106 * hierarchies are actually established, this is done by the tiler in
107 * fixed-function hardware, after the vertex shaders have run and there is
108 * sufficient information to figure out the size of triangles. The driver has
109 * no such luxury, again barring insane hacks like additionally running the
110 * vertex shaders in software or in hardware via transform feedback. Thus, for
111 * the driver, we need a heuristic approach.
112 *
113 * There are lots of heuristics to guess triangle size statically you could
114 * imagine, but one approach shines as particularly simple-stupid: assume all
115 * on-screen triangles are equal size and spread equidistantly throughout the
116 * screen. Let's be clear, this is NOT A VALID ASSUMPTION. But if we roll with
117 * it, then we see:
118 *
119 * Triangle Area = (Screen Area / # of triangles)
120 * = (Width * Height) / (# of triangles)
121 *
122 * Or if you prefer, we can also make a third CRAZY assumption that we only draw
123 * right triangles with edges parallel/perpendicular to the sides of the screen
124 * with no overdraw, forming a triangle grid across the screen:
125 *
126 * |--w--|
127 * _____ |
128 * | /| /| |
129 * |/_|/_| h
130 * | /| /| |
131 * |/_|/_| |
132 *
133 * Then you can use some middle school geometry and algebra to work out the
134 * triangle dimensions. I started working on this, but realised I didn't need
135 * it to make my point, but couldn't bear to erase that ASCII art. Anyway.
136 *
137 * POINT IS, by considering the ratio of screen area and triangle count, we can
138 * estimate the triangle size. For a small size, use small bins; for a large
139 * size, use large bins. Intuitively, this metric makes sense: when there are
140 * few triangles on a large screen, you're probably compositing a UI and
141 * therefore the triangles are large; when there are a lot of triangles on a
142 * small screen, you're probably rendering a 3D mesh and therefore the
143 * triangles are tiny. (Or better said -- there will be tiny triangles, even if
144 * there are also large triangles. There have to be unless you expect crazy
145 * overdraw. Generally, it's better to allow more small bin sizes than
146 * necessary than not allow enough.)
147 *
148 * From this heuristic (or whatever), we determine the minimum allowable tile
149 * size, and we use that to decide the hierarchy masking, selecting from the
150 * minimum "ideal" tile size to the maximum tile size (2048x2048).
151 *
152 * Once we have that mask and the framebuffer dimensions, we can compute the
153 * size of the statically-sized polygon list structures, allocate them, and go!
154 *
155 */
156
157 /* Hierarchical tiling spans from 16x16 to 2048x2048 tiles. These two
 * constants are the edge lengths of the smallest and largest tile sizes. */
158
159 #define MIN_TILE_SIZE 16
160 #define MAX_TILE_SIZE 2048
161
162 /* Constants as shifts for easier power-of-two iteration: the code below
 * walks the shifts and recovers each tile size as (1 << shift). */
163
164 #define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE)
165 #define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE)
166
167 /* The hierarchy has a 64-byte prologue */
168 #define PROLOGUE_SIZE 0x40
169
170 /* For each tile (across all hierarchy levels), there is 8 bytes of header */
171 #define HEADER_BYTES_PER_TILE 0x8
172
173 /* Absent any geometry, the minimum size of the header (0x200 = 512 bytes).
 * panfrost_tiler_header_size returns this when the hierarchy mask is 0. */
174 #define MINIMUM_HEADER_SIZE 0x200
175
/* Counts the tile_size-x-tile_size tiles needed to cover a width-x-height
 * framebuffer. Each dimension is rounded up, so tiles that are only partially
 * covered along the right and bottom edges are included. tile_size must be a
 * power-of-two. For the special case of tile_size=16, this aligns with the
 * usual Midgard count. This is not really repeated code from AFBC/checksum,
 * because those care about the stride (not just the overall count) and only
 * work at a fixed tile size (not any of a number of power-of-twos). */

static unsigned
pan_tile_count(unsigned width, unsigned height, unsigned tile_size)
{
        /* Pad each dimension to a whole number of tiles, then convert the
         * padded extent into a tile count along that axis */
        unsigned tiles_across = ALIGN_POT(width, tile_size) / tile_size;
        unsigned tiles_down = ALIGN_POT(height, tile_size) / tile_size;

        return tiles_across * tiles_down;
}
194
195 /* For `masked_count` of the smallest tile sizes masked out, computes how the
196 * size of the polygon list header. We iterate the tile sizes (16x16 through
197 * 2048x2048, if nothing is masked; (16*2^masked_count)x(16*2^masked_count)
198 * through 2048x2048 more generally. For each tile size, we figure out how many
199 * tiles there are at this hierarchy level and therefore many bytes this level
200 * is, leaving us with a byte count for each level. We then just sum up the
201 * byte counts across the levels to find a byte count for all levels. */
202
203 static unsigned
204 panfrost_raw_header_size(unsigned width, unsigned height, unsigned masked_count)
205 {
206 unsigned size = PROLOGUE_SIZE;
207
208 /* Normally we start at 16x16 tiles (MIN_TILE_SHIFT), but we add more
209 * if anything is masked off */
210
211 unsigned start_level = MIN_TILE_SHIFT + masked_count;
212
213 /* Iterate hierarchy levels / tile sizes */
214
215 for (unsigned i = start_level; i < MAX_TILE_SHIFT; ++i) {
216 /* Shift from a level to a tile size */
217 unsigned tile_size = (1 << i);
218
219 unsigned tile_count = pan_tile_count(width, height, tile_size);
220 unsigned header_bytes = HEADER_BYTES_PER_TILE * tile_count;
221
222 size += header_bytes;
223 }
224
225 /* This size will be used as an offset, so ensure it's aligned */
226 return ALIGN_POT(size, 512);
227 }
228
229 /* Given a hierarchy mask and a framebuffer size, compute the header size */
230
231 unsigned
232 panfrost_tiler_header_size(unsigned width, unsigned height, uint8_t mask)
233 {
234 /* If no hierarchy levels are enabled, that means there is no geometry
235 * for the tiler to process, so use a minimum size. Used for clears */
236
237 if (mask == 0x00)
238 return MINIMUM_HEADER_SIZE;
239
240 /* Some levels are enabled. Ensure that only smaller levels are
241 * disabled and there are no gaps. Theoretically the hardware is more
242 * flexible, but there's no known reason to use other configurations
243 * and this keeps the code simple. Since we know the 0x80 bit is set,
244 * ctz(mask) will return the number of masked off levels. */
245
246 unsigned masked_count = __builtin_ctz(mask);
247
248 assert(mask & 0x80);
249 assert(((mask >> masked_count) & ((mask >> masked_count) + 1)) == 0);
250
251 /* Everything looks good. Use the number of trailing zeroes we found to
252 * figure out how many smaller levels are disabled to compute the
253 * actual header size */
254
255 return panfrost_raw_header_size(width, height, masked_count);
256 }
257
/* Estimates the polygon list body size, in bytes, for a width-x-height
 * framebuffer and a hierarchy mask. The body seems to be about 512 bytes per
 * tile. Noting that the header is about 8 bytes per tile, we can be a little
 * sloppy and estimate the body size to be equal to the header size * (512/8).
 * Given the header size is a considerable overestimate, this is fine.
 * Eventually, we should maybe figure out how to actually implement this.
 *
 * Returns the estimate rounded up to a multiple of 512. */

unsigned
panfrost_tiler_body_size(unsigned width, unsigned height, uint8_t mask)
{
        /* Body bytes per header byte: 512 bytes of body against 8 bytes of
         * header for each tile */
        const unsigned body_per_header_byte = 512 / 8;

        unsigned header_size = panfrost_tiler_header_size(width, height, mask);

        /* Scale by the reduced ratio instead of computing
         * header_size * 512 / 8. The results are identical whenever the
         * latter does not wrap, but its intermediate product is 8x larger
         * than the final value and would wrap around the range of unsigned
         * for very large framebuffers, producing a far-too-small size. */

        return ALIGN_POT(header_size * body_per_header_byte, 512);
}
270
271
/* Picks the tiler hierarchy mask for a draw. This is where a real heuristic
 * would eventually live; for now it is a stub that ignores the framebuffer
 * dimensions. With geometry present we default to 0xFF, which enables all
 * possible hierarchy levels. Overall this yields good performance but
 * presumably incurs a cost in memory bandwidth / power consumption / etc, at
 * least on smaller scenes that don't really need all the smaller levels
 * enabled. */

unsigned
panfrost_choose_hierarchy_mask(
        unsigned width, unsigned height,
        unsigned vertex_count)
{
        /* If there is no geometry, we don't bother enabling anything.
         * Otherwise, default everything on. TODO: Proper tests */

        return vertex_count ? 0xFF : 0x00;
}