panfrost: Compile tiling routines with -O3
[mesa.git] src/panfrost/shared/pan_tiling.c
/*
 * Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be>
 * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 * Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com>
 * Copyright (c) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include <stdbool.h>
#include "pan_tiling.h"

/* This file implements software encode/decode of the tiling format used for
 * textures and framebuffers primarily on Utgard GPUs. Names for this format
 * include "Utgard-style tiling", "(Mali) swizzled textures", and
 * "U-interleaved" (the former two names being used in the community
 * Lima/Panfrost drivers; the latter name used internally at Arm).
 * Conceptually, like any tiling scheme, the pixel reordering aims to improve
 * 2D spatial locality, benefiting cache behaviour in both the horizontal and
 * vertical directions.
 *
 * This format is tiled: first, the image dimensions must be aligned to 16
 * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
 * This size harmonizes with other properties of the GPU; on Midgard,
 * framebuffer tiles are logically 16x16 (this is the tile size used in
 * Transaction Elimination and the minimum tile size used in Hierarchical
 * Tiling). Conversely, for a standard 4 bytes-per-pixel format (like
 * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
 * size.
 *
 * Within each 16x16 tile, pixels are addressed by an 8-bit index whose bits
 * are built from the low 4 bits of the X and Y coordinates:
 *
 *   | y3 | (y3 ^ x3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
 *
 * Essentially, the X and Y bits are interleaved, with an XOR thrown in for
 * every adjacent bit pair.
 *
 * This scheme is cheap to encode and decode in both hardware and software.
 * In hardware, lines are simply rerouted to reorder and some XOR gates are
 * thrown in. Software has to be a bit more clever.
 *
 * In software, the trick is to divide the pattern into two lines:
 *
 *     | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
 *   ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
 *
 * That is, duplicate the bits of the Y and space out the bits of the X. The
 * top line is a function only of Y, so it can be calculated once per row and
 * stored in a register. The bottom line is simply X with the bits spaced out.
 * Spacing out the X is easy enough with a LUT, or by subtracting and ANDing
 * with the mask pattern (abusing carry bits).
 *
 * This format is also supported on Midgard GPUs, where it *can* be used for
 * textures and framebuffers. That said, in practice it is usually used as a
 * fallback layout; Midgard introduces Arm FrameBuffer Compression, which is
 * significantly more efficient than Utgard-style tiling and preferred for
 * both textures and framebuffers, where possible. For formats AFBC does not
 * support, for instance sRGB textures and framebuffers, this tiling scheme
 * is used instead, at a performance penalty.
 */

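/* For illustration, the in-tile index described above can also be computed
 * bit by bit. This reference helper is a minimal sketch of the mapping that
 * the LUT-based fast paths below implement; it is not used by the driver and
 * the name pan_tiled_index_ref is purely hypothetical. */

static inline unsigned
pan_tiled_index_ref(unsigned x, unsigned y)
{
   unsigned index = 0;

   /* Only the low 4 bits of each coordinate select a pixel within a tile */
   x &= 0xF;
   y &= 0xF;

   for (unsigned i = 0; i < 4; ++i) {
      unsigned xi = (x >> i) & 1;
      unsigned yi = (y >> i) & 1;

      /* Even bits hold (y ^ x), odd bits hold y, matching
       * | y3 | y3^x3 | ... | y0 | y0^x0 | */
      index |= (xi ^ yi) << (2 * i);
      index |= yi << (2 * i + 1);
   }

   return index;
}
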
/* Given the lower 4 bits of the Y coordinate, we would like to duplicate
 * every bit over. So instead of 0b1010, we would like 0b11001100. The idea
 * is that the bits in the solely-Y places get a Y, and the bits in the XOR
 * places *also* get a Y. */

uint32_t bit_duplication[16] = {
   0b00000000,
   0b00000011,
   0b00001100,
   0b00001111,
   0b00110000,
   0b00110011,
   0b00111100,
   0b00111111,
   0b11000000,
   0b11000011,
   0b11001100,
   0b11001111,
   0b11110000,
   0b11110011,
   0b11111100,
   0b11111111,
};

/* Space out the bits of a 4-bit nibble, so 0b1011 becomes 0b1000101 */

unsigned space_4[16] = {
   0b0000000,
   0b0000001,
   0b0000100,
   0b0000101,
   0b0010000,
   0b0010001,
   0b0010100,
   0b0010101,
   0b1000000,
   0b1000001,
   0b1000100,
   0b1000101,
   0b1010000,
   0b1010001,
   0b1010100,
   0b1010101
};

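/* The two tables are related: duplicating a bit is the same as spacing it
 * out and then multiplying by 0b11. As an illustrative sketch (not used by
 * the driver, and the helper names are hypothetical), both tables could be
 * generated with the classic shift-and-mask bit-spreading trick instead of
 * being written out by hand: */

static inline unsigned
pan_space_nibble_ref(unsigned nibble)
{
   unsigned x = nibble & 0xF;

   /* Spread abcd out to 0a0b0c0d */
   x = (x | (x << 2)) & 0b00110011;
   x = (x | (x << 1)) & 0b01010101;

   return x;
}

static inline unsigned
pan_duplicate_nibble_ref(unsigned nibble)
{
   /* aabbccdd = 0a0b0c0d * 0b11 (the spaced bits never overlap, so no
    * carries are generated) */
   return pan_space_nibble_ref(nibble) * 0b11;
}
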
/* The scheme uses 16x16 tiles */

#define TILE_WIDTH      16
#define TILE_HEIGHT     16
#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)

/* An optimized routine to tile an aligned ((width & 0xF) == 0) bpp4 texture */

static void
panfrost_store_tiled_image_bpp4(void *dst, const void *src,
                                const struct pipe_box *box,
                                uint32_t dst_stride,
                                uint32_t src_stride)
{
   /* Precompute the offset to the beginning of the first horizontal tile
    * we're writing to, knowing that box->x is 16-aligned. Tiles themselves
    * are stored linearly, so we get the X tile number by shifting and then
    * multiply by the bytes per tile */

   uint8_t *dest_start = dst + ((box->x >> 4) * PIXELS_PER_TILE * 4);

   /* Iterate across the pixels we're trying to store in source-order */

   for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) {
      /* For each pixel in the destination image, figure out the part
       * corresponding to the 16x16 block index */

      int block_y = y & ~0x0f;

      /* In pixel coordinates (where the origin is the top-left), (block_y, 0)
       * is the top-left corner of the leftmost tile in this row. While pixels
       * are reordered within a block, the blocks themselves are stored
       * linearly, so multiplying block_y by the destination's byte stride
       * gives the byte offset of the top-left corner of the block this row
       * is in */

      uint32_t *dest = (uint32_t *) (dest_start + (block_y * dst_stride));

      /* The source is actually linear, so compute the byte offset to the
       * start and end of this row in the source */

      const uint32_t *source = src + (src_y * src_stride);
      const uint32_t *source_end = source + box->width;

      /* We want to duplicate the bits of the bottom nibble of Y */
      unsigned expanded_y = bit_duplication[y & 0xF];

      /* Iterate the row in source order. The outer loop walks the row one
       * 16-pixel tile at a time; after each tile, dest advances by the size
       * of a whole tile in pixels. */

      for (; source < source_end; dest += PIXELS_PER_TILE) {
         /* Within each tile, we iterate each of the 16 pixels in the row of
          * the tile. This loop should be unrolled. */

         for (int i = 0; i < 16; ++i) {
            /* We have the X component spaced out in space_4 and we have the
             * Y component duplicated. So we just XOR them together. The X
             * bits get the XOR like the pattern needs. The Y bits are XORed
             * with zero, so this is a no-op */

            unsigned index = expanded_y ^ space_4[i];

            /* Copy over the pixel */
            dest[index] = *(source++);
         }
      }
   }
}

static void
panfrost_access_tiled_image_generic(void *dst, void *src,
                                    const struct pipe_box *box,
                                    uint32_t dst_stride,
                                    uint32_t src_stride,
                                    uint32_t bpp,
                                    bool is_store)
{
   for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) {
      int block_y = y & ~0x0f;
      int block_start_s = block_y * dst_stride;
      int source_start = src_y * src_stride;

      unsigned expanded_y = bit_duplication[y & 0xF];

      for (int x = box->x, src_x = 0; src_x < box->width; ++x, ++src_x) {
         /* Offset, in pixels, of the tile containing x within its tile row */
         int block_x_s = (x >> 4) * PIXELS_PER_TILE;

         unsigned index = expanded_y ^ space_4[x & 0xF];

         uint8_t *src8 = src;
         uint8_t *source = &src8[source_start + bpp * src_x];
         uint8_t *dest = dst + block_start_s + bpp * (block_x_s + index);

         uint8_t *out = is_store ? dest : source;
         uint8_t *in = is_store ? source : dest;

         uint16_t *out16 = (uint16_t *) out;
         uint16_t *in16 = (uint16_t *) in;

         uint32_t *out32 = (uint32_t *) out;
         uint32_t *in32 = (uint32_t *) in;

         uint64_t *out64 = (uint64_t *) out;
         uint64_t *in64 = (uint64_t *) in;

         /* Write out 1-16 bytes. Written like this rather than a loop so the
          * compiler can see what's going on */

         switch (bpp) {
         case 1:
            out[0] = in[0];
            break;

         case 2:
            out16[0] = in16[0];
            break;

         case 3:
            out16[0] = in16[0];
            out[2] = in[2];
            break;

         case 4:
            out32[0] = in32[0];
            break;

         case 6:
            out32[0] = in32[0];
            out16[2] = in16[2];
            break;

         case 8:
            out64[0] = in64[0];
            break;

         case 12:
            out64[0] = in64[0];
            out32[2] = in32[2];
            break;

         case 16:
            out64[0] = in64[0];
            out64[1] = in64[1];
            break;

         default:
            unreachable("Invalid bpp in software tiling");
         }
      }
   }
}

void
panfrost_store_tiled_image(void *dst, const void *src,
                           const struct pipe_box *box,
                           uint32_t dst_stride,
                           uint32_t src_stride,
                           uint32_t bpp)
{
   /* The optimized path is specifically for 16-pixel-aligned writes */

   if (box->x & 0xF || box->width & 0xF) {
      panfrost_access_tiled_image_generic(dst, (void *) src, box, dst_stride, src_stride, bpp, true);
      return;
   }

   /* Attempt to use an optimized path if we have one */

   switch (bpp) {
   case 4:
      panfrost_store_tiled_image_bpp4(dst, (void *) src, box, dst_stride, src_stride);
      break;
   default:
      panfrost_access_tiled_image_generic(dst, (void *) src, box, dst_stride, src_stride, bpp, true);
      break;
   }
}

void
panfrost_load_tiled_image(void *dst, const void *src,
                          const struct pipe_box *box,
                          uint32_t dst_stride,
                          uint32_t src_stride,
                          uint32_t bpp)
{
   /* For loads the tiled image is the source, so the generic accessor (whose
    * first argument is always the tiled image) is called with the pointers
    * and strides swapped */
   panfrost_access_tiled_image_generic((void *) src, dst, box, src_stride, dst_stride, bpp, false);
}
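
/* Usage sketch (illustrative only, not part of the driver): tiling a full
 * 64x64 RGBA8888 level from a linear staging buffer. The buffers, dimensions
 * and the helper name are hypothetical; both strides are assumed to be in
 * bytes per row of pixels, and 64x64 is already 16-aligned, so the fast bpp4
 * path above is taken. */

static inline void
pan_tile_whole_level_example(void *tiled, const void *linear)
{
   struct pipe_box box = {
      .x = 0, .y = 0,
      .width = 64, .height = 64,
   };

   /* 64 pixels * 4 bytes per pixel on both the linear and tiled sides */
   panfrost_store_tiled_image(tiled, linear, &box, 64 * 4, 64 * 4, 4);
}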