/*
 * Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be>
 * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 * Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com>
 * Copyright (c) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "pan_tiling.h"

/* This file implements software encode/decode of the tiling format used for
 * textures and framebuffers primarily on Utgard GPUs. Names for this format
 * include "Utgard-style tiling", "(Mali) swizzled textures", and
 * "U-interleaved" (the former two names being used in the community
 * Lima/Panfrost drivers; the latter name used internally at Arm).
 * Conceptually, like any tiling scheme, the pixel reordering exploits 2D
 * spatial locality, improving cache behaviour in both the horizontal and
 * vertical directions.
 *
 * This format is tiled: first, the image dimensions must be aligned to 16
 * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
 * This size harmonizes with other properties of the GPU; on Midgard,
 * framebuffer tiles are logically 16x16 (this is the tile size used in
 * Transaction Elimination and the minimum tile size used in Hierarchical
 * Tiling). Conversely, for a standard 4 bytes-per-pixel format (like
 * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
 * size of the GPU.
 *
 * Within each 16x16 block, the bits are reordered according to this pattern:
 *
 * | y3 | (x3 ^ y3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
 *
 * Basically, interleaving the X and Y bits, with XORs thrown in for every
 * adjacent bit pair.
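 *
 * For example, the pixel at (x, y) = (5, 3) within a tile has x = 0b0101 and
 * y = 0b0011, giving an index of
 *
 * | 0 | (0 ^ 0) | 0 | (0 ^ 1) | 1 | (1 ^ 0) | 1 | (1 ^ 1) | = 0b00011110 = 30
 *
 * so that pixel is stored 30 pixels into the tile's 256-pixel block.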
 *
 * This is cheap to implement, for both encode and decode, in hardware and in
 * software. In hardware, lines are simply rerouted to perform the reordering,
 * and some XOR gates are thrown in. Software has to be a bit more clever.
 *
 * In software, the trick is to divide the pattern into two lines:
 *
 *   | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
 * ^ | 0  | x3 | 0  | x2 | 0  | x1 | 0  | x0 |
 *
 * That is, duplicate the bits of the Y and space out the bits of the X. The
 * top line is a function only of Y, so it can be calculated once per row and
 * stored in a register. The bottom line is simply X with the bits spaced out.
 * Spacing out the X is easy enough with a LUT, or by subtracting+ANDing the
 * mask pattern (abusing carry bits).
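 *
 * (A sketch of that subtract+AND trick, as an aside: with mask = 0b01010101
 * and starting from spaced = 0, the update spaced = (spaced - mask) & mask
 * steps through the spaced-out values for x = 0, 1, 2, ... (0b00000000,
 * 0b00000001, 0b00000100, 0b00000101, ...), because the subtraction borrows
 * through the always-zero gap bits. This file uses the LUT approach instead.)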
 *
 * This format is also supported on Midgard GPUs, where it *can* be used for
 * textures and framebuffers. That said, in practice it is usually used as a
 * fallback layout; Midgard introduces Arm FrameBuffer Compression (AFBC),
 * which is significantly more efficient than Utgard-style tiling and
 * preferred for both textures and framebuffers where possible. For texture
 * and framebuffer types AFBC does not support, for instance sRGB textures,
 * this tiling scheme is used instead, at a performance penalty.
 */
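
/* For reference, a direct bit-by-bit computation of the pattern above (a
 * sketch for documentation only; the routines below use the LUT-based fast
 * path instead, and this helper is not part of the driver):
 *
 *    static unsigned
 *    tiled_index_slow(unsigned x, unsigned y)
 *    {
 *       unsigned index = 0;
 *
 *       for (unsigned b = 0; b < 4; ++b) {
 *          unsigned xb = (x >> b) & 1, yb = (y >> b) & 1;
 *          index |= ((xb ^ yb) << (2 * b)) | (yb << (2 * b + 1));
 *       }
 *
 *       return index;
 *    }
 *
 * e.g. tiled_index_slow(5, 3) returns 30, matching the worked example. */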

/* Given the lower 4-bits of the Y coordinate, we would like to
 * duplicate every bit over. So instead of 0b1010, we would like
 * 0b11001100. The idea is that for the bits in the solely Y place, we
 * get a Y place, and the bits in the XOR place *also* get a Y. */

uint32_t bit_duplication[16] = {
   0b00000000,
   0b00000011,
   0b00001100,
   0b00001111,
   0b00110000,
   0b00110011,
   0b00111100,
   0b00111111,
   0b11000000,
   0b11000011,
   0b11001100,
   0b11001111,
   0b11110000,
   0b11110011,
   0b11111100,
   0b11111111,
};

/* Space the bits out of a 4-bit nibble, e.g. 0b1011 becomes 0b1000101 */

unsigned space_4[16] = {
   0b0000000,
   0b0000001,
   0b0000100,
   0b0000101,
   0b0010000,
   0b0010001,
   0b0010100,
   0b0010101,
   0b1000000,
   0b1000001,
   0b1000100,
   0b1000101,
   0b1010000,
   0b1010001,
   0b1010100,
   0b1010101,
};

/* The scheme uses 16x16 tiles */

#define TILE_WIDTH 16
#define TILE_HEIGHT 16
#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)
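
/* For the bpp4 fast path below, one tile therefore occupies
 * PIXELS_PER_TILE * 4 = 1024 bytes. */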

/* An optimized routine to tile an aligned ((width & 0xF) == 0) bpp4 texture */

static void
panfrost_store_tiled_image_bpp4(void *dst, const void *src,
                                const struct pipe_box *box,
                                uint32_t dst_stride, uint32_t src_stride)
{
   /* Precompute the offset to the beginning of the first horizontal tile we're
    * writing to, knowing that box->x is 16-aligned. Tiles themselves are
    * stored linearly, so we get the X tile number by shifting and then
    * multiply by the bytes per tile */

   uint8_t *dest_start = dst + ((box->x >> 4) * PIXELS_PER_TILE * 4);

   /* Iterate across the pixels we're trying to store in source-order */

   for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) {
      /* For each pixel in the destination image, figure out the part
       * corresponding to the 16x16 block index */

      int block_y = y & ~0x0f;

      /* In pixel coordinates (where the origin is the top-left), (block_y, 0)
       * is the top-left corner of the leftmost tile in this row. While pixels
       * are reordered within a block, the blocks themselves are stored
       * linearly, so multiplying block_y by the pixel stride of the
       * destination image equals the byte offset of that top-left corner of
       * the block this row is in */

      uint32_t *dest = (uint32_t *) (dest_start + (block_y * dst_stride));

      /* The source is actually linear, so compute the byte offset to the start
       * and end of this row in the source */

      const uint32_t *source = src + (src_y * src_stride);
      const uint32_t *source_end = source + box->width;

      /* We want to duplicate the bits of the bottom nibble of Y */
      unsigned expanded_y = bit_duplication[y & 0xF];

      /* Iterate the row in source order. In the outer loop, we step through
       * the row one 16-pixel tile at a time. After each tile, we increment
       * dest by the size of that tile in pixels. */

      for (; source < source_end; dest += PIXELS_PER_TILE) {
         /* Within each tile, we iterate each of the 16 pixels in the row of
          * the tile. This loop should be unrolled. */

         for (int i = 0; i < 16; ++i) {
            /* We have the X component spaced out in space_4 and we have the Y
             * component duplicated. So we just XOR them together. The X bits
             * get the XOR like the pattern needs. The Y bits are XORed with
             * zero, so this is a no-op */

            unsigned index = expanded_y ^ space_4[i];

            /* Copy over the pixel */
            dest[index] = *(source++);
         }
      }
   }
}

static void
panfrost_access_tiled_image_generic(void *dst, void *src,
                                    const struct pipe_box *box,
                                    uint32_t dst_stride, uint32_t src_stride,
                                    uint32_t bpp, bool is_store)
{
   uint8_t *src8 = (uint8_t *) src;

   for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) {
      int block_y = y & ~0x0f;
      int block_start_s = block_y * dst_stride;
      int source_start = src_y * src_stride;

      unsigned expanded_y = bit_duplication[y & 0xF];

      for (int x = box->x, src_x = 0; src_x < box->width; ++x, ++src_x) {
         int block_x_s = (x >> 4) * 256;

         unsigned index = expanded_y ^ space_4[x & 0xF];

         uint8_t *source = &src8[source_start + bpp * src_x];
         uint8_t *dest = dst + block_start_s + bpp * (block_x_s + index);

         uint8_t *out = is_store ? dest : source;
         uint8_t *in = is_store ? source : dest;

         uint16_t *out16 = (uint16_t *) out;
         uint16_t *in16 = (uint16_t *) in;

         uint32_t *out32 = (uint32_t *) out;
         uint32_t *in32 = (uint32_t *) in;

         uint64_t *out64 = (uint64_t *) out;
         uint64_t *in64 = (uint64_t *) in;

         /* Write out 1-16 bytes. Written like this rather than a loop so the
          * compiler can see what's going on */

         switch (bpp) {
         case 1:
            *out = *in;
            break;

         case 2:
            *out16 = *in16;
            break;

         case 4:
            *out32 = *in32;
            break;

         case 8:
            *out64 = *in64;
            break;

         case 16:
            out64[0] = in64[0];
            out64[1] = in64[1];
            break;

         default:
            unreachable("Invalid bpp in software tiling");
         }
      }
   }
}

void
panfrost_store_tiled_image(void *dst, const void *src,
                           const struct pipe_box *box,
                           uint32_t dst_stride, uint32_t src_stride,
                           uint32_t bpp)
{
   /* The optimized path is for aligned writes specifically */

   if (box->x & 0xF || box->width & 0xF) {
      panfrost_access_tiled_image_generic(dst, (void *) src, box,
                                          dst_stride, src_stride, bpp, TRUE);
      return;
   }

   /* Attempt to use an optimized path if we have one */

   switch (bpp) {
   case 4:
      panfrost_store_tiled_image_bpp4(dst, (void *) src, box,
                                      dst_stride, src_stride);
      break;

   default:
      panfrost_access_tiled_image_generic(dst, (void *) src, box,
                                          dst_stride, src_stride, bpp, TRUE);
      break;
   }
}

void
panfrost_load_tiled_image(void *dst, const void *src,
                          const struct pipe_box *box,
                          uint32_t dst_stride, uint32_t src_stride,
                          uint32_t bpp)
{
   panfrost_access_tiled_image_generic((void *) src, dst, box,
                                       src_stride, dst_stride, bpp, FALSE);
}
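
/* Example usage (a sketch for illustration only; "tiled" and "linear" are
 * hypothetical buffers for a 64x64 RGBA8888 surface, so bpp = 4 and both
 * strides are 64 * 4 = 256 bytes per row):
 *
 *    struct pipe_box box = { .x = 0, .y = 0, .width = 64, .height = 64 };
 *    panfrost_store_tiled_image(tiled, linear, &box, 256, 256, 4);
 *    panfrost_load_tiled_image(linear, tiled, &box, 256, 256, 4);
 *
 * Since box->x and box->width are 16-aligned and bpp is 4, the store takes
 * the optimized bpp4 path; loads always go through the generic path. */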