2 * Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be>
3 * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
4 * Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com>
5 * Copyright (c) 2019 Collabora, Ltd.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sub license,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 * DEALINGS IN THE SOFTWARE.
28 #include "pan_tiling.h"
30 #include "util/macros.h"
32 /* This file implements software encode/decode of the tiling format used for
33 * textures and framebuffers primarily on Utgard GPUs. Names for this format
34 * include "Utgard-style tiling", "(Mali) swizzled textures", and
35 * "U-interleaved" (the first two names being used in the community
36 * Lima/Panfrost drivers; the last name being used internally at Arm).
37 * Conceptually, like any tiling scheme, the pixel reordering attempts to preserve 2D
38 * spatial locality, to improve cache locality in both horizontal and vertical
41 * This format is tiled: first, the image dimensions must be aligned to 16
42 * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
43 * This size harmonizes with other properties of the GPU; on Midgard,
44 * framebuffer tiles are logically 16x16 (this is the tile size used in
45 * Transaction Elimination and the minimum tile size used in Hierarchical
46 * Tiling). Additionally, for a standard 4 bytes-per-pixel format (like
47 * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
50 * Within each 16x16 block, the bits are reordered according to this pattern:
52 * | y3 | (x3 ^ y3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
54 * Basically, interleaving the X and Y bits, with XORs thrown in for every
57 * Both encode and decode are cheap to implement, in hardware and software alike.
58 * In hardware, lines are simply rerouted to reorder and some XOR gates are
59 * thrown in. Software has to be a bit more clever.
61 * In software, the trick is to divide the pattern into two lines:
63 * | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
64 * ^ | 0 | x3 | 0 | x2 | 0 | x1 | 0 | x0 |
66 * That is, duplicate the bits of the Y and space out the bits of the X. The
67 * top line is a function only of Y, so it can be calculated once per row and
68 * stored in a register. The bottom line is simply X with the bits spaced out.
69 * Spacing out the X is easy enough with a LUT, or by subtracting+ANDing the
70 * mask pattern (abusing carry bits).
72 * This format is also supported on Midgard GPUs, where it *can* be used for
73 * textures and framebuffers. That said, in practice it is usually used as a
74 * fallback layout; Midgard introduces Arm FrameBuffer Compression, which is
75 * significantly more efficient than Utgard-style tiling and preferred for both
76 * textures and framebuffers, where possible. For unsupported texture types,
77 * for instance sRGB textures and framebuffers, this tiling scheme is used at a
78 * performance penalty, as AFBC is not compatible.
81 /* Given the lower 4-bits of the Y coordinate, we would like to
82 * duplicate every bit over. So instead of 0b1010, we would like
83 * 0b11001100. The idea is that for the bits in the solely Y place, we
84 * get a Y place, and the bits in the XOR place *also* get a Y. */
86 const uint32_t bit_duplication
[16] = {
105 /* Space the bits out of a 4-bit nibble */
107 const unsigned space_4
[16] = {
126 /* The scheme uses 16x16 tiles */
/* Width of one tile, in pixels */
128 #define TILE_WIDTH 16
/* Height of one tile, in pixels */
129 #define TILE_HEIGHT 16
/* Number of pixels in one full tile; multiplied by the pixel size to get the
 * byte size of a tile when stepping from one tile to the next */
130 #define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)
132 /* We need a 128-bit type for idiomatically tiling bpp128 formats. The type must
133 * only support copies and sizeof, so emulating with a packed structure works
134 * well enough, but if there's a native 128-bit type we may as well prefer
137 #ifdef __SIZEOF_INT128__
138 typedef __uint128_t pan_uint128_t
;
143 } __attribute__((packed
)) pan_uint128_t
;
149 } __attribute__((packed
)) pan_uint24_t
;
151 /* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
153 * dest_start precomputes the offset to the beginning of the first horizontal
154 * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
155 * stored linearly, so we get the X tile number by shifting and then multiply
156 * by the bytes per tile.
158 * We iterate across the pixels we're trying to store in source-order. For each
159 * row in the destination image, we figure out which row of 16x16 blocks we're
160 * in, by slicing off the lower 4-bits (block_y).
162 * dest then precomputes the location of the top-left corner of the block the
163 * row starts in. In pixel coordinates (where the origin is the top-left),
164 * (block_y, 0) is the top-left corner of the leftmost tile in this row. While
165 * pixels are reordered within a block, the blocks themselves are stored
166 * linearly, so multiplying block_y by the pixel stride of the destination
167 * image equals the byte offset of that top-left corner of the block this row
170 * On the other hand, the source is linear so we compute the locations of the
171 * start and end of the row in the source by a simple linear addressing.
173 * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
174 * y0] value. Since this is constant across a row, we look it up per-row and
175 * store in expanded_y.
177 * Finally, we iterate each row in source order. In the outer loop, we iterate
178 * each 16 pixel tile. Within each tile, we iterate the 16 pixels (this should
179 * be unrolled), calculating the index within the tile and writing.
182 #define TILED_ACCESS_TYPE(pixel_t, shift) \
183 static ALWAYS_INLINE void \
184 panfrost_access_tiled_image_##pixel_t \
185 (void *dst, void *src, \
186 uint16_t sx, uint16_t sy, \
187 uint16_t w, uint16_t h, \
188 uint32_t dst_stride, \
189 uint32_t src_stride, \
192 uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
193 for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
194 uint16_t block_y = y & ~0x0f; \
195 uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
196 pixel_t *source = src + (src_y * src_stride); \
197 pixel_t *source_end = source + w; \
198 unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
199 for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
200 for (uint8_t i = 0; i < 16; ++i) { \
201 unsigned index = expanded_y ^ (space_4[i] << shift); \
203 *((pixel_t *) (dest + index)) = *(source++); \
205 *(source++) = *((pixel_t *) (dest + index)); \
/* Instantiate the aligned access routine once per supported pixel size, each
 * named panfrost_access_tiled_image_<type>. The second argument is log2 of the
 * pixel size in bytes: the macro uses it as a shift to scale pixel indices
 * within a tile (and the per-tile step) into byte offsets. */
211 TILED_ACCESS_TYPE(uint8_t, 0);
212 TILED_ACCESS_TYPE(uint16_t, 1);
213 TILED_ACCESS_TYPE(uint32_t, 2);
214 TILED_ACCESS_TYPE(uint64_t, 3);
215 TILED_ACCESS_TYPE(pan_uint128_t
, 4);
217 #define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
218 const unsigned mask = (1 << tile_shift) - 1; \
219 for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
220 unsigned block_y = y & ~mask; \
221 unsigned block_start_s = block_y * dst_stride; \
222 unsigned source_start = src_y * src_stride; \
223 unsigned expanded_y = bit_duplication[y & mask]; \
225 for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
226 unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
227 unsigned index = expanded_y ^ space_4[x & mask]; \
228 uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
229 uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
231 pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
232 pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
238 #define TILED_UNALIGNED_TYPES(store, shift) { \
240 TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
241 else if (bpp == 16) \
242 TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
243 else if (bpp == 24) \
244 TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \
245 else if (bpp == 32) \
246 TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
247 else if (bpp == 64) \
248 TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
249 else if (bpp == 128) \
250 TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
254 panfrost_access_tiled_image_generic(void *dst
, void *src
,
255 unsigned sx
, unsigned sy
,
256 unsigned w
, unsigned h
,
259 const struct util_format_description
*desc
,
262 unsigned bpp
= desc
->block
.bits
;
264 if (desc
->block
.width
> 1) {
265 w
= DIV_ROUND_UP(w
, desc
->block
.width
);
266 h
= DIV_ROUND_UP(h
, desc
->block
.height
);
269 TILED_UNALIGNED_TYPES(true, 2)
271 TILED_UNALIGNED_TYPES(false, 2)
274 TILED_UNALIGNED_TYPES(true, 4)
276 TILED_UNALIGNED_TYPES(false, 4)
/* Byte address of pixel (_x, _y) inside the linear buffer `src`, whose first
 * byte corresponds to pixel (orig_x, orig_y).
 *
 * This macro is not self-contained: it reads orig_x, orig_y, src_stride and
 * bpp from the scope where it is expanded. bpp is in bits, so (bpp / 8) is the
 * pixel size in bytes, and src_stride is applied as a byte stride per row.
 *
 * The `src` argument and the whole expansion are parenthesized so that an
 * expression argument (e.g. `p + 1`) is evaluated before the byte-pointer
 * cast rather than having the cast bind to its first operand only. */
#define OFFSET(src, _x, _y) \
   ((void *) ((uint8_t *) (src) + ((_y) - orig_y) * src_stride + \
              (((_x) - orig_x) * (bpp / 8))))
282 static ALWAYS_INLINE
void
283 panfrost_access_tiled_image(void *dst
, void *src
,
284 unsigned x
, unsigned y
,
285 unsigned w
, unsigned h
,
288 enum pipe_format format
,
291 const struct util_format_description
*desc
= util_format_description(format
);
293 if (desc
->block
.width
> 1 || desc
->block
.bits
== 24) {
294 panfrost_access_tiled_image_generic(dst
, (void *) src
,
296 dst_stride
, src_stride
, desc
, is_store
);
301 unsigned bpp
= desc
->block
.bits
;
302 unsigned first_full_tile_x
= DIV_ROUND_UP(x
, TILE_WIDTH
) * TILE_WIDTH
;
303 unsigned first_full_tile_y
= DIV_ROUND_UP(y
, TILE_HEIGHT
) * TILE_HEIGHT
;
304 unsigned last_full_tile_x
= ((x
+ w
) / TILE_WIDTH
) * TILE_WIDTH
;
305 unsigned last_full_tile_y
= ((y
+ h
) / TILE_HEIGHT
) * TILE_HEIGHT
;
307 /* First, tile the top portion */
309 unsigned orig_x
= x
, orig_y
= y
;
311 if (first_full_tile_y
!= y
) {
312 unsigned dist
= MIN2(first_full_tile_y
- y
, h
);
314 panfrost_access_tiled_image_generic(dst
, OFFSET(src
, x
, y
),
316 dst_stride
, src_stride
, desc
, is_store
);
325 /* Next, the bottom portion */
326 if (last_full_tile_y
!= (y
+ h
)) {
327 unsigned dist
= (y
+ h
) - last_full_tile_y
;
329 panfrost_access_tiled_image_generic(dst
, OFFSET(src
, x
, last_full_tile_y
),
330 x
, last_full_tile_y
, w
, dist
,
331 dst_stride
, src_stride
, desc
, is_store
);
336 /* The left portion */
337 if (first_full_tile_x
!= x
) {
338 unsigned dist
= MIN2(first_full_tile_x
- x
, w
);
340 panfrost_access_tiled_image_generic(dst
, OFFSET(src
, x
, y
),
342 dst_stride
, src_stride
, desc
, is_store
);
351 /* Finally, the right portion */
352 if (last_full_tile_x
!= (x
+ w
)) {
353 unsigned dist
= (x
+ w
) - last_full_tile_x
;
355 panfrost_access_tiled_image_generic(dst
, OFFSET(src
, last_full_tile_x
, y
),
356 last_full_tile_x
, y
, dist
, h
,
357 dst_stride
, src_stride
, desc
, is_store
);
363 panfrost_access_tiled_image_uint8_t(dst
, OFFSET(src
, x
, y
), x
, y
, w
, h
, dst_stride
, src_stride
, is_store
);
365 panfrost_access_tiled_image_uint16_t(dst
, OFFSET(src
, x
, y
), x
, y
, w
, h
, dst_stride
, src_stride
, is_store
);
367 panfrost_access_tiled_image_uint32_t(dst
, OFFSET(src
, x
, y
), x
, y
, w
, h
, dst_stride
, src_stride
, is_store
);
369 panfrost_access_tiled_image_uint64_t(dst
, OFFSET(src
, x
, y
), x
, y
, w
, h
, dst_stride
, src_stride
, is_store
);
371 panfrost_access_tiled_image_pan_uint128_t(dst
, OFFSET(src
, x
, y
), x
, y
, w
, h
, dst_stride
, src_stride
, is_store
);
375 panfrost_store_tiled_image(void *dst
, const void *src
,
376 unsigned x
, unsigned y
,
377 unsigned w
, unsigned h
,
380 enum pipe_format format
)
382 panfrost_access_tiled_image(dst
, (void *) src
,
384 dst_stride
, src_stride
, format
, true);
388 panfrost_load_tiled_image(void *dst
, const void *src
,
389 unsigned x
, unsigned y
,
390 unsigned w
, unsigned h
,
393 enum pipe_format format
)
395 panfrost_access_tiled_image((void *) src
, dst
,
397 src_stride
, dst_stride
, format
, false);