panfrost: Extend the tiled store fast-path to loads
[mesa.git] / src/panfrost/shared/pan_tiling.c
/*
 * Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be>
 * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 * Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com>
 * Copyright (c) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include "pan_tiling.h"
#include <stdbool.h>
#include "util/macros.h"

/* This file implements software encode/decode of the tiling format used for
 * textures and framebuffers primarily on Utgard GPUs. Names for this format
 * include "Utgard-style tiling", "(Mali) swizzled textures", and
 * "U-interleaved" (the former two names being used in the community
 * Lima/Panfrost drivers; the latter name used internally at Arm).
 * Conceptually, like any tiling scheme, the pixel reordering attempts to
 * preserve 2D spatial locality, improving cache behaviour in both the
 * horizontal and vertical directions.
 *
 * This format is tiled: first, the image dimensions must be aligned to 16
 * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
 * This size harmonizes with other properties of the GPU; on Midgard,
 * framebuffer tiles are logically 16x16 (this is the tile size used in
 * Transaction Elimination and the minimum tile size used in Hierarchical
 * Tiling). Additionally, for a standard 4 bytes-per-pixel format (like
 * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
 * size.
 *
 * Within each 16x16 block, the bits are reordered according to this pattern:
 *
 * | y3 | (y3 ^ x3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
 *
 * Basically, the X and Y bits are interleaved, with an XOR thrown in for
 * every adjacent bit pair.
 *
 * Both encode and decode are cheap to implement, in hardware and in software.
 * In hardware, the address lines are simply rerouted to perform the
 * reordering, with a few XOR gates thrown in. Software has to be a bit more
 * clever.
 *
 * In software, the trick is to divide the pattern into two lines:
 *
 *      | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
 *    ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
 *
 * That is, duplicate the bits of the Y and space out the bits of the X. The
 * top line is a function only of Y, so it can be calculated once per row and
 * stored in a register. The bottom line is simply X with the bits spaced out.
 * Spacing out the X is easy enough with a LUT, or by subtracting+ANDing the
 * mask pattern (abusing carry bits; a sketch of this trick follows the lookup
 * tables below).
 *
 * This format is also supported on Midgard GPUs, where it *can* be used for
 * textures and framebuffers. That said, in practice it usually serves as a
 * fallback layout; Midgard introduces Arm FrameBuffer Compression (AFBC),
 * which is significantly more efficient than Utgard-style tiling and
 * preferred for both textures and framebuffers, where possible. For textures
 * and framebuffers AFBC cannot handle, for instance sRGB formats, this tiling
 * scheme is used instead, at a performance penalty.
 */

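/* To make the pattern concrete: the index of a pixel within a 16x16 tile can
 * be computed directly from its (x, y) coordinates inside the tile. The
 * helpers below are an illustrative sketch only; they are not used by the
 * driver (the pan_example_* names are made up), which uses the lookup tables
 * that follow instead. */

static inline unsigned
pan_example_space_bits(unsigned v)
{
   /* Spread the low 4 bits of v into the even bit positions:
    * 0b wxyz -> 0b 0w0x0y0z */
   return ((v & 1) << 0) | ((v & 2) << 1) | ((v & 4) << 2) | ((v & 8) << 3);
}

static inline unsigned
pan_example_tile_index(unsigned x, unsigned y)
{
   unsigned spaced_x = pan_example_space_bits(x & 0xF);
   unsigned spaced_y = pan_example_space_bits(y & 0xF);

   /* Multiplying a spaced value by 0b11 duplicates each bit into the
    * neighbouring (empty) position, since no carries can occur. That gives
    * the | y3 y3 | y2 y2 | y1 y1 | y0 y0 | line; XORing in the spaced X
    * yields the interleaved pattern above. */
   return (spaced_y * 0b11) ^ spaced_x;
}
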
/* Given the lower 4 bits of the Y coordinate, we would like to
 * duplicate every bit. So instead of 0b1010, we would like
 * 0b11001100. The idea is that the bits in the Y-only positions get
 * the Y value, and the bits in the XOR positions *also* get a copy of
 * Y, ready to be XORed with X. */

const uint32_t bit_duplication[16] = {
   0b00000000,
   0b00000011,
   0b00001100,
   0b00001111,
   0b00110000,
   0b00110011,
   0b00111100,
   0b00111111,
   0b11000000,
   0b11000011,
   0b11001100,
   0b11001111,
   0b11110000,
   0b11110011,
   0b11111100,
   0b11111111,
};

/* Space the bits out of a 4-bit nibble */

const unsigned space_4[16] = {
   0b0000000,
   0b0000001,
   0b0000100,
   0b0000101,
   0b0010000,
   0b0010001,
   0b0010100,
   0b0010101,
   0b1000000,
   0b1000001,
   0b1000100,
   0b1000101,
   0b1010000,
   0b1010001,
   0b1010100,
   0b1010101
};

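/* The "subtracting + ANDing the mask pattern" trick mentioned above can stand
 * in for the space_4 table when walking in X order: subtracting the mask from
 * an already-spaced value increments the counter embedded in the masked bit
 * positions, with the borrow rippling across the zero gaps, and the AND then
 * clears the junk left behind in those gaps. An illustrative sketch, not used
 * by the code below (the names are made up): */

#define PAN_EXAMPLE_SPACE_MASK 0b1010101

static inline unsigned
pan_example_space_next(unsigned spaced_x)
{
   /* E.g. space_4[3] = 0b0000101:
    * (0b0000101 - 0b1010101) & 0b1010101 = 0b0010000 = space_4[4]. */
   return (spaced_x - PAN_EXAMPLE_SPACE_MASK) & PAN_EXAMPLE_SPACE_MASK;
}
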
/* The scheme uses 16x16 tiles */

#define TILE_WIDTH 16
#define TILE_HEIGHT 16
#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)

/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
 * need only support copies and sizeof, so emulating it with a packed structure
 * works well enough, but if there's a native 128-bit type we may as well
 * prefer that. */

#ifdef __SIZEOF_INT128__
typedef __uint128_t pan_uint128_t;
#else
typedef struct {
   uint64_t lo;
   uint64_t hi;
} __attribute__((packed)) pan_uint128_t;
#endif

/* Optimized routine to tile or detile an aligned ((w & 0xF) == 0) texture
 * region. Explanation:
 *
 * dest_start precomputes the offset to the beginning of the first horizontal
 * tile we're accessing, knowing that x is 16-aligned. Tiles themselves are
 * stored linearly, so we get the X tile number by shifting and then
 * multiplying by the bytes per tile.
 *
 * We iterate across the pixels we're trying to access in source (linear)
 * order. For each row, we figure out which row of 16x16 blocks we're in by
 * masking off the lower 4 bits of y (block_y).
 *
 * dest then precomputes the location of the top-left corner of the block the
 * row starts in. In pixel coordinates (where the origin is the top-left),
 * (0, block_y) is the top-left corner of the leftmost tile in this row. While
 * pixels are reordered within a block, the blocks themselves are stored
 * linearly, so multiplying block_y by the byte stride of the destination
 * image gives the byte offset of the top-left corner of the block this row
 * is in.
 *
 * On the other hand, the source is linear, so we compute the locations of the
 * start and end of the row in the source by simple linear addressing.
 *
 * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
 * y0] value. Since this is constant across a row, we look it up per-row and
 * store it in expanded_y.
 *
 * Finally, we iterate each row in source order. In the outer loop, we iterate
 * over each 16-pixel tile. Within each tile, we iterate the 16 pixels (this
 * should be unrolled), calculating the index within the tile and copying.
 */

#define TILED_ACCESS_TYPE(pixel_t, shift) \
static ALWAYS_INLINE void \
panfrost_access_tiled_image_##pixel_t \
                              (void *dst, void *src, \
                               uint16_t sx, uint16_t sy, \
                               uint16_t w, uint16_t h, \
                               uint32_t dst_stride, \
                               uint32_t src_stride, \
                               bool is_store) \
{ \
   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      uint16_t block_y = y & ~0x0f; \
      uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
      pixel_t *source = src + (src_y * src_stride); \
      pixel_t *source_end = source + w; \
      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
         for (uint8_t i = 0; i < 16; ++i) { \
            unsigned index = expanded_y ^ (space_4[i] << shift); \
            if (is_store) \
               *((pixel_t *) (dest + index)) = *(source++); \
            else \
               *(source++) = *((pixel_t *) (dest + index)); \
         } \
      } \
   } \
} \

TILED_ACCESS_TYPE(uint8_t, 0);
TILED_ACCESS_TYPE(uint16_t, 1);
TILED_ACCESS_TYPE(uint32_t, 2);
TILED_ACCESS_TYPE(uint64_t, 3);
TILED_ACCESS_TYPE(pan_uint128_t, 4);

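/* Usage sketch for the generated fast paths (illustrative only; the sizes,
 * strides and coordinates are made up). In the driver these helpers are only
 * reached through panfrost_access_tiled_image below, which guarantees the
 * 16-pixel alignment they assume. */

static inline void
pan_example_store_aligned_rgba8(void *tiled_dst, void *linear_src)
{
   /* Store a 32x32, 16-aligned region of a 256-pixel-wide RGBA8888 image:
    * at 4 bytes per pixel the tiled image advances 256 * 4 bytes per pixel
    * row, and the linear staging buffer 32 * 4 bytes per row. */
   panfrost_access_tiled_image_uint32_t(tiled_dst, linear_src,
                                        16, 32,   /* sx, sy */
                                        32, 32,   /* w, h */
                                        256 * 4,  /* dst_stride */
                                        32 * 4,   /* src_stride */
                                        true);    /* is_store */
}
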
#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
   const unsigned mask = (1 << tile_shift) - 1; \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      unsigned block_y = y & ~mask; \
      unsigned block_start_s = block_y * dst_stride; \
      unsigned source_start = src_y * src_stride; \
      unsigned expanded_y = bit_duplication[y & mask]; \
\
      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
         unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
         unsigned index = expanded_y ^ space_4[x & mask]; \
         uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
         uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
\
         pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
         pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
         *outp = *inp; \
      } \
   } \
}

#define TILED_UNALIGNED_TYPES(store, shift) { \
   if (bpp == 8) \
      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
   else if (bpp == 16) \
      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
   else if (bpp == 32) \
      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
   else if (bpp == 64) \
      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
   else if (bpp == 128) \
      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
}

static void
panfrost_access_tiled_image_generic(void *dst, void *src,
                                    unsigned sx, unsigned sy,
                                    unsigned w, unsigned h,
                                    uint32_t dst_stride,
                                    uint32_t src_stride,
                                    const struct util_format_description *desc,
                                    bool _is_store)
{
   unsigned bpp = desc->block.bits;

   if (desc->block.width > 1) {
      w = DIV_ROUND_UP(w, desc->block.width);
      h = DIV_ROUND_UP(h, desc->block.height);

      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 2)
      else
         TILED_UNALIGNED_TYPES(false, 2)
   } else {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 4)
      else
         TILED_UNALIGNED_TYPES(false, 4)
   }
}

#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))

static ALWAYS_INLINE void
panfrost_access_tiled_image(void *dst, void *src,
                            unsigned x, unsigned y,
                            unsigned w, unsigned h,
                            uint32_t dst_stride,
                            uint32_t src_stride,
                            enum pipe_format format,
                            bool is_store)
{
   const struct util_format_description *desc = util_format_description(format);

   if (desc->block.width > 1) {
      panfrost_access_tiled_image_generic(dst, (void *) src,
                                          x, y, w, h,
                                          dst_stride, src_stride, desc, is_store);

      return;
   }

   unsigned bpp = desc->block.bits;
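
   /* The aligned fast path below requires the region to be tile-aligned on
    * every side. Carve off any unaligned top, bottom, left and right strips
    * and push them through the generic per-pixel path; whatever remains is a
    * fully tile-aligned interior handled by the optimized routines above. */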
   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;

   /* First, tile the top portion */

   unsigned orig_x = x, orig_y = y;

   if (first_full_tile_y != y) {
      unsigned dist = MIN2(first_full_tile_y - y, h);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == h)
         return;

      y += dist;
      h -= dist;
   }

   /* Next, the bottom portion */
   if (last_full_tile_y != (y + h)) {
      unsigned dist = (y + h) - last_full_tile_y;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
                                          x, last_full_tile_y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      h -= dist;
   }

   /* The left portion */
   if (first_full_tile_x != x) {
      unsigned dist = MIN2(first_full_tile_x - x, w);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == w)
         return;

      x += dist;
      w -= dist;
   }

   /* Finally, the right portion */
   if (last_full_tile_x != (x + w)) {
      unsigned dist = (x + w) - last_full_tile_x;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
                                          last_full_tile_x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      w -= dist;
   }

   if (bpp == 8)
      panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 16)
      panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 32)
      panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 64)
      panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 128)
      panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
}

void
panfrost_store_tiled_image(void *dst, const void *src,
                           unsigned x, unsigned y,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
                           enum pipe_format format)
{
   panfrost_access_tiled_image(dst, (void *) src,
                               x, y, w, h,
                               dst_stride, src_stride, format, true);
}

void
panfrost_load_tiled_image(void *dst, const void *src,
                          unsigned x, unsigned y,
                          unsigned w, unsigned h,
                          uint32_t dst_stride,
                          uint32_t src_stride,
                          enum pipe_format format)
{
   panfrost_access_tiled_image((void *) src, dst,
                               x, y, w, h,
                               src_stride, dst_stride, format, false);
}
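
/* Usage sketch for the public entry points (illustrative only; the format,
 * sizes and strides below are made up). panfrost_store_tiled_image copies a
 * linear staging buffer into a tiled image; panfrost_load_tiled_image is the
 * inverse. Note that dst_stride always describes the destination of the copy,
 * so the tiled image's stride is passed as dst_stride on store and as
 * src_stride on load. */

static inline void
pan_example_readback_rect(void *linear_dst, const void *tiled_src)
{
   /* Read back a 100x50 RGBA8888 rectangle at (10, 20) from a tiled image
    * that is 512 pixels wide, into a tightly packed linear buffer. */
   panfrost_load_tiled_image(linear_dst, tiled_src,
                             10, 20,   /* x, y */
                             100, 50,  /* w, h */
                             100 * 4,  /* dst_stride: linear buffer */
                             512 * 4,  /* src_stride: tiled image */
                             PIPE_FORMAT_R8G8B8A8_UNORM);
}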