v3d: Load and store aligned utiles all at once.
[mesa.git] / src / gallium / drivers / v3d / v3d_tiling.c
1 /*
2 * Copyright © 2014-2017 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file v3d_tiling.c
25 *
26 * Handles information about the VC5 tiling formats, and loading and storing
27 * from them.
28 */
29
30 #include <stdint.h>
31 #include "v3d_screen.h"
32 #include "v3d_context.h"
33 #include "v3d_tiling.h"
34 #include "broadcom/common/v3d_cpu_tiling.h"
35
/**
 * Returns the width, in pixels, of a 64-byte microtile (utile) whose pixels
 * are \p cpp bytes each.
 *
 * Only 1, 2, 4, 8 and 16 bytes per pixel are handled; any other value ends up
 * in unreachable().
 */
uint32_t
v3d_utile_width(int cpp)
{
        if (cpp == 1 || cpp == 2)
                return 8;

        if (cpp == 4 || cpp == 8)
                return 4;

        if (cpp == 16)
                return 2;

        unreachable("unknown cpp");
}
53
/**
 * Returns the height, in pixels, of a 64-byte microtile (utile) whose pixels
 * are \p cpp bytes each.
 *
 * Only 1, 2, 4, 8 and 16 bytes per pixel are handled; any other value ends up
 * in unreachable().
 */
uint32_t
v3d_utile_height(int cpp)
{
        if (cpp == 1)
                return 8;

        if (cpp == 2 || cpp == 4)
                return 4;

        if (cpp == 8 || cpp == 16)
                return 2;

        unreachable("unknown cpp");
}
71
/**
 * Returns the byte offset of pixel (\p x, \p y) from the start of its utile.
 *
 * A utile is a 64-byte block of pixels stored in raster order (4x4 pixels at
 * 32bpp).  \p x and \p y are coordinates inside the utile and are asserted to
 * be smaller than the utile's width and height.
 */
static inline uint32_t
v3d_get_utile_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y)
{
        const uint32_t w = v3d_utile_width(cpp);
        const uint32_t h = v3d_utile_height(cpp);

        assert(x < w && y < h);

        /* y complete rows of w pixels, then x pixels into the current row. */
        return (y * w + x) * cpp;
}
88
/**
 * Returns the byte offset of pixel (\p x, \p y) in a LINEARTILE layout.
 *
 * LINEARTILE is a single line of utiles running in either the X or the Y
 * direction, so at least one of the two utile indices has to be zero.
 *
 * \p image_h is not used; it is only there so the function matches the
 * get_pixel_offset callback signature.
 */
static inline uint32_t
v3d_get_lt_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y)
{
        const uint32_t w = v3d_utile_width(cpp);
        const uint32_t h = v3d_utile_height(cpp);
        const uint32_t utile_col = x / w;
        const uint32_t utile_row = y / h;

        assert(utile_col == 0 || utile_row == 0);

        /* 64 bytes per utile; one of the two indices is always zero. */
        const uint32_t utile_base = 64 * (utile_col + utile_row);

        /* Utile dimensions are powers of two, so masking yields the
         * coordinates within the utile.
         */
        const uint32_t within_utile =
                v3d_get_utile_pixel_offset(cpp, x & (w - 1), y & (h - 1));

        return utile_base + within_utile;
}
109
/**
 * Returns the byte offset of pixel (\p x, \p y) in a UBLINEAR layout.
 *
 * UBLINEAR arranges pixels in UIF blocks (2x2 utiles, 256 bytes), with the
 * UIF blocks stored in raster order across \p ublinear_number columns (the
 * wrappers in this file pass 1 or 2).
 */
static inline uint32_t
v3d_get_ublinear_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y,
                              int ublinear_number)
{
        const uint32_t w = v3d_utile_width(cpp);
        const uint32_t h = v3d_utile_height(cpp);
        const uint32_t block_col = x / (w * 2);
        const uint32_t block_row = y / (h * 2);

        /* Start of the UIF block containing the pixel. */
        uint32_t offset = 256 * (block_row * ublinear_number + block_col);

        /* Pick the utile inside the block: the right-hand utiles sit 64 bytes
         * in, the bottom ones 128 bytes in.
         */
        if (x & w)
                offset += 64;
        if (y & h)
                offset += 128;

        return offset + v3d_get_utile_pixel_offset(cpp,
                                                   x & (w - 1),
                                                   y & (h - 1));
}
135
/* UBLINEAR with two columns of UIF blocks, exposed with the get_pixel_offset
 * callback signature.  \p image_h is not needed for this layout.
 */
static inline uint32_t
v3d_get_ublinear_2_column_pixel_offset(uint32_t cpp, uint32_t image_h,
                                       uint32_t x, uint32_t y)
{
        const int columns = 2;

        return v3d_get_ublinear_pixel_offset(cpp, x, y, columns);
}
142
/* UBLINEAR with a single column of UIF blocks, exposed with the
 * get_pixel_offset callback signature.  \p image_h is not needed for this
 * layout.
 */
static inline uint32_t
v3d_get_ublinear_1_column_pixel_offset(uint32_t cpp, uint32_t image_h,
                                       uint32_t x, uint32_t y)
{
        const int columns = 1;

        return v3d_get_ublinear_pixel_offset(cpp, x, y, columns);
}
149
/**
 * Returns the byte offset of pixel (\p x, \p y) in a UIF layout.
 *
 * UIF is the general VC5 tiling layout shared across 3D, media, and scanout.
 * Pixels live in UIF blocks (2x2 utiles, 256 bytes).  As computed here, the
 * blocks are ordered in columns four blocks wide: inside a column the blocks
 * are in raster order, and each column covers the image height \p image_h
 * rounded up to whole blocks.
 *
 * When \p do_xor is set, bit 4 of the block row is flipped in odd-numbered
 * columns.
 */
static inline uint32_t
v3d_get_uif_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y,
                         bool do_xor)
{
        const uint32_t utile_w = v3d_utile_width(cpp);
        const uint32_t utile_h = v3d_utile_height(cpp);
        /* A UIF block is two utiles wide and two utiles high. */
        const uint32_t block_w = utile_w * 2;
        const uint32_t block_h = utile_h * 2;
        const uint32_t log2_block_w = ffs(block_w) - 1;
        const uint32_t log2_block_h = ffs(block_h) - 1;

        /* Which UIF block the pixel falls in... */
        uint32_t block_x = x >> log2_block_w;
        uint32_t block_y = y >> log2_block_h;
        /* ...and where it is inside that block. */
        const uint32_t in_block_x = x - (block_x << log2_block_w);
        const uint32_t in_block_y = y - (block_y << log2_block_h);

        const uint32_t column = block_x / 4;

        if (do_xor && (column & 1))
                block_y ^= 0x10;

        /* Image height in UIF blocks, rounded up. */
        const uint32_t blocks_high =
                align(image_h, 1 << log2_block_h) >> log2_block_h;
        const uint32_t block_id =
                column * ((blocks_high - 1) * 4) + block_x + block_y * 4;

        /* Docs have this in pixels, we do bytes here. */
        uint32_t offset = block_id * 256;

        /* Bottom utiles start 128 bytes into the block, right-hand utiles a
         * further 64 bytes in.
         */
        if (in_block_y >= utile_h)
                offset += 128;
        if (in_block_x >= utile_w)
                offset += 64;

        return offset + v3d_get_utile_pixel_offset(cpp,
                                                   in_block_x & (utile_w - 1),
                                                   in_block_y & (utile_h - 1));
}
200
/* UIF addressing with the XOR step enabled, exposed with the get_pixel_offset
 * callback signature.
 */
static inline uint32_t
v3d_get_uif_xor_pixel_offset(uint32_t cpp, uint32_t image_h,
                             uint32_t x, uint32_t y)
{
        const bool do_xor = true;

        return v3d_get_uif_pixel_offset(cpp, image_h, x, y, do_xor);
}
207
/* UIF addressing with the XOR step disabled, exposed with the
 * get_pixel_offset callback signature.
 */
static inline uint32_t
v3d_get_uif_no_xor_pixel_offset(uint32_t cpp, uint32_t image_h,
                                uint32_t x, uint32_t y)
{
        const bool do_xor = false;

        return v3d_get_uif_pixel_offset(cpp, image_h, x, y, do_xor);
}
214
215 /* Loads/stores non-utile-aligned boxes by walking over the destination
216 * rectangle, computing the address on the GPU, and storing/loading a pixel at
217 * a time.
218 */
219 static inline void
220 v3d_move_pixels_unaligned(void *gpu, uint32_t gpu_stride,
221 void *cpu, uint32_t cpu_stride,
222 int cpp, uint32_t image_h,
223 const struct pipe_box *box,
224 uint32_t (*get_pixel_offset)(uint32_t cpp,
225 uint32_t image_h,
226 uint32_t x, uint32_t y),
227 bool is_load)
228 {
229 for (uint32_t y = 0; y < box->height; y++) {
230 void *cpu_row = cpu + y * cpu_stride;
231
232 for (int x = 0; x < box->width; x++) {
233 uint32_t pixel_offset = get_pixel_offset(cpp, image_h,
234 box->x + x,
235 box->y + y);
236
237 if (false) {
238 fprintf(stderr, "%3d,%3d -> %d\n",
239 box->x + x, box->y + y,
240 pixel_offset);
241 }
242
243 if (is_load) {
244 memcpy(cpu_row + x * cpp,
245 gpu + pixel_offset,
246 cpp);
247 } else {
248 memcpy(gpu + pixel_offset,
249 cpu_row + x * cpp,
250 cpp);
251 }
252 }
253 }
254 }
255
256 /* Breaks the image down into utiles and calls either the fast whole-utile
257 * load/store functions, or the unaligned fallback case.
258 */
259 static inline void
260 v3d_move_pixels_general_percpp(void *gpu, uint32_t gpu_stride,
261 void *cpu, uint32_t cpu_stride,
262 int cpp, uint32_t image_h,
263 const struct pipe_box *box,
264 uint32_t (*get_pixel_offset)(uint32_t cpp,
265 uint32_t image_h,
266 uint32_t x, uint32_t y),
267 bool is_load)
268 {
269 uint32_t utile_w = v3d_utile_width(cpp);
270 uint32_t utile_h = v3d_utile_height(cpp);
271 uint32_t utile_gpu_stride = utile_w * cpp;
272 uint32_t x1 = box->x;
273 uint32_t y1 = box->y;
274 uint32_t x2 = box->x + box->width;
275 uint32_t y2 = box->y + box->height;
276 uint32_t align_x1 = align(x1, utile_w);
277 uint32_t align_y1 = align(y1, utile_h);
278 uint32_t align_x2 = x2 & ~(utile_w - 1);
279 uint32_t align_y2 = y2 & ~(utile_h - 1);
280
281 /* Load/store all the whole utiles first. */
282 for (uint32_t y = align_y1; y < align_y2; y += utile_h) {
283 void *cpu_row = cpu + (y - box->y) * cpu_stride;
284
285 for (uint32_t x = align_x1; x < align_x2; x += utile_w) {
286 void *utile_gpu = (gpu +
287 get_pixel_offset(cpp, image_h, x, y));
288 void *utile_cpu = cpu_row + (x - box->x) * cpp;
289
290 if (is_load) {
291 v3d_load_utile(utile_cpu, cpu_stride,
292 utile_gpu, utile_gpu_stride);
293 } else {
294 v3d_store_utile(utile_gpu, utile_gpu_stride,
295 utile_cpu, cpu_stride);
296 }
297 }
298 }
299
300 /* If there were no aligned utiles in the middle, load/store the whole
301 * thing unaligned.
302 */
303 if (align_y2 <= align_y1 ||
304 align_x2 <= align_x1) {
305 v3d_move_pixels_unaligned(gpu, gpu_stride,
306 cpu, cpu_stride,
307 cpp, image_h,
308 box,
309 get_pixel_offset, is_load);
310 return;
311 }
312
313 /* Load/store the partial utiles. */
314 struct pipe_box partial_boxes[4] = {
315 /* Top */
316 {
317 .x = x1,
318 .width = x2 - x1,
319 .y = y1,
320 .height = align_y1 - y1,
321 },
322 /* Bottom */
323 {
324 .x = x1,
325 .width = x2 - x1,
326 .y = align_y2,
327 .height = y2 - align_y2,
328 },
329 /* Left */
330 {
331 .x = x1,
332 .width = align_x1 - x1,
333 .y = align_y1,
334 .height = align_y2 - align_y1,
335 },
336 /* Right */
337 {
338 .x = align_x2,
339 .width = x2 - align_x2,
340 .y = align_y1,
341 .height = align_y2 - align_y1,
342 },
343 };
344 for (int i = 0; i < ARRAY_SIZE(partial_boxes); i++) {
345 void *partial_cpu = (cpu +
346 (partial_boxes[i].y - y1) * cpu_stride +
347 (partial_boxes[i].x - x1) * cpp);
348
349 v3d_move_pixels_unaligned(gpu, gpu_stride,
350 partial_cpu, cpu_stride,
351 cpp, image_h,
352 &partial_boxes[i],
353 get_pixel_offset, is_load);
354 }
355 }
356
/* Dispatches a box copy to v3d_move_pixels_general_percpp() for the given
 * bytes per pixel.
 *
 * Supported \p cpp values are 1, 2, 4, 8 and 16.  Any other value is a caller
 * bug and hits unreachable(), in line with v3d_utile_width() and
 * v3d_utile_height(), rather than silently copying nothing.
 *
 * NOTE(review): each case passes cpp as a literal, presumably so the compiler
 * can specialize the inlined worker per pixel size -- confirm before
 * collapsing the switch into a single call.
 */
static inline void
v3d_move_pixels_general(void *gpu, uint32_t gpu_stride,
                        void *cpu, uint32_t cpu_stride,
                        int cpp, uint32_t image_h,
                        const struct pipe_box *box,
                        uint32_t (*get_pixel_offset)(uint32_t cpp,
                                                     uint32_t image_h,
                                                     uint32_t x, uint32_t y),
                        bool is_load)
{
        switch (cpp) {
        case 1:
                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                               cpu, cpu_stride,
                                               1, image_h, box,
                                               get_pixel_offset,
                                               is_load);
                break;
        case 2:
                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                               cpu, cpu_stride,
                                               2, image_h, box,
                                               get_pixel_offset,
                                               is_load);
                break;
        case 4:
                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                               cpu, cpu_stride,
                                               4, image_h, box,
                                               get_pixel_offset,
                                               is_load);
                break;
        case 8:
                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                               cpu, cpu_stride,
                                               8, image_h, box,
                                               get_pixel_offset,
                                               is_load);
                break;
        case 16:
                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                               cpu, cpu_stride,
                                               16, image_h, box,
                                               get_pixel_offset,
                                               is_load);
                break;
        default:
                /* Previously fell through and moved no pixels at all. */
                unreachable("unknown cpp");
        }
}
405
406 static inline void
407 v3d_move_tiled_image(void *gpu, uint32_t gpu_stride,
408 void *cpu, uint32_t cpu_stride,
409 enum v3d_tiling_mode tiling_format,
410 int cpp,
411 uint32_t image_h,
412 const struct pipe_box *box,
413 bool is_load)
414 {
415 switch (tiling_format) {
416 case VC5_TILING_UIF_XOR:
417 v3d_move_pixels_general(gpu, gpu_stride,
418 cpu, cpu_stride,
419 cpp, image_h, box,
420 v3d_get_uif_xor_pixel_offset,
421 is_load);
422 break;
423 case VC5_TILING_UIF_NO_XOR:
424 v3d_move_pixels_general(gpu, gpu_stride,
425 cpu, cpu_stride,
426 cpp, image_h, box,
427 v3d_get_uif_no_xor_pixel_offset,
428 is_load);
429 break;
430 case VC5_TILING_UBLINEAR_2_COLUMN:
431 v3d_move_pixels_general(gpu, gpu_stride,
432 cpu, cpu_stride,
433 cpp, image_h, box,
434 v3d_get_ublinear_2_column_pixel_offset,
435 is_load);
436 break;
437 case VC5_TILING_UBLINEAR_1_COLUMN:
438 v3d_move_pixels_general(gpu, gpu_stride,
439 cpu, cpu_stride,
440 cpp, image_h, box,
441 v3d_get_ublinear_1_column_pixel_offset,
442 is_load);
443 break;
444 case VC5_TILING_LINEARTILE:
445 v3d_move_pixels_general(gpu, gpu_stride,
446 cpu, cpu_stride,
447 cpp, image_h, box,
448 v3d_get_lt_pixel_offset,
449 is_load);
450 break;
451 default:
452 unreachable("Unsupported tiling format");
453 break;
454 }
455 }
456
457 /**
458 * Loads pixel data from the start (microtile-aligned) box in \p src to the
459 * start of \p dst according to the given tiling format.
460 */
461 void
462 v3d_load_tiled_image(void *dst, uint32_t dst_stride,
463 void *src, uint32_t src_stride,
464 enum v3d_tiling_mode tiling_format, int cpp,
465 uint32_t image_h,
466 const struct pipe_box *box)
467 {
468 v3d_move_tiled_image(src, src_stride,
469 dst, dst_stride,
470 tiling_format,
471 cpp,
472 image_h,
473 box,
474 true);
475 }
476
477 /**
478 * Stores pixel data from the start of \p src into a (microtile-aligned) box in
479 * \p dst according to the given tiling format.
480 */
481 void
482 v3d_store_tiled_image(void *dst, uint32_t dst_stride,
483 void *src, uint32_t src_stride,
484 enum v3d_tiling_mode tiling_format, int cpp,
485 uint32_t image_h,
486 const struct pipe_box *box)
487 {
488 v3d_move_tiled_image(dst, dst_stride,
489 src, src_stride,
490 tiling_format,
491 cpp,
492 image_h,
493 box,
494 false);
495 }