v3d: fix flushing of SSBOs and shader images
[mesa.git] / src / gallium / drivers / v3d / v3d_tiling.c
1 /*
2 * Copyright © 2014-2017 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file v3d_tiling.c
25 *
26 * Handles information about the VC5 tiling formats, and loading and storing
27 * from them.
28 */
29
30 #include <stdint.h>
31 #include "v3d_screen.h"
32 #include "v3d_context.h"
33 #include "v3d_tiling.h"
34 #include "broadcom/common/v3d_cpu_tiling.h"
35
/**
 * Width, in pixels, of one 64-byte microtile (utile) for a format that uses
 * \p cpp bytes per pixel.
 *
 * Only cpp values of 1, 2, 4, 8 and 16 are handled; anything else hits
 * unreachable().
 */
uint32_t
v3d_utile_width(int cpp)
{
        if (cpp == 1 || cpp == 2)
                return 8;

        if (cpp == 4 || cpp == 8)
                return 4;

        if (cpp == 16)
                return 2;

        unreachable("unknown cpp");
}
53
/**
 * Height, in pixels, of one 64-byte microtile (utile) for a format that uses
 * \p cpp bytes per pixel.
 *
 * Only cpp values of 1, 2, 4, 8 and 16 are handled; anything else hits
 * unreachable().
 */
uint32_t
v3d_utile_height(int cpp)
{
        if (cpp == 1)
                return 8;

        if (cpp == 2 || cpp == 4)
                return 4;

        if (cpp == 8 || cpp == 16)
                return 2;

        unreachable("unknown cpp");
}
71
/**
 * Byte offset of the pixel at (\p x, \p y) inside a single utile.
 *
 * A utile is a 64-byte block whose pixels are stored in raster order; at
 * 32bpp that is a 4x4 arrangement.  Both coordinates must already be
 * relative to the utile's origin (checked by the assert).
 */
static inline uint32_t
v3d_get_utile_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y)
{
        uint32_t utile_w = v3d_utile_width(cpp);

        assert(x < utile_w && y < v3d_utile_height(cpp));

        /* Raster order: pixel index within the utile, scaled to bytes. */
        return (y * utile_w + x) * cpp;
}
87
/**
 * Byte offset of the pixel at (\p x, \p y) in a LINEARTILE layout.
 *
 * LINEARTILE is a single line of utiles running in either the X or the Y
 * direction, so at most one of the two utile indices may be non-zero.
 *
 * \p image_h is not used; it is present so the function matches the
 * get_pixel_offset callback signature.
 */
static inline uint32_t
v3d_get_lt_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y)
{
        uint32_t utile_w = v3d_utile_width(cpp);
        uint32_t utile_h = v3d_utile_height(cpp);
        uint32_t utile_index_x = x / utile_w;
        uint32_t utile_index_y = y / utile_h;

        assert(utile_index_x == 0 || utile_index_y == 0);

        /* With one index being zero (asserted above), the sum is the
         * utile's position along the line; each utile is 64 bytes.
         */
        uint32_t utile_base = 64 * (utile_index_x + utile_index_y);
        uint32_t within_utile = v3d_get_utile_pixel_offset(cpp,
                                                           x % utile_w,
                                                           y % utile_h);

        return utile_base + within_utile;
}
108
/**
 * Byte offset of the pixel at (\p x, \p y) in a UBLINEAR layout.
 *
 * UBLINEAR arranges pixels in UIF blocks (2x2 utiles, i.e. 256 bytes), and
 * the UIF blocks themselves are laid out in raster order in rows that are
 * \p ublinear_number (1 or 2) blocks wide.
 */
static inline uint32_t
v3d_get_ublinear_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y,
                              int ublinear_number)
{
        const uint32_t utile_w = v3d_utile_width(cpp);
        const uint32_t utile_h = v3d_utile_height(cpp);

        /* Which UIF block the pixel lands in. */
        const uint32_t block_col = x / (utile_w * 2);
        const uint32_t block_row = y / (utile_h * 2);

        uint32_t offset = 256 * (block_row * ublinear_number + block_col);

        /* Which of the block's four utiles: right half adds one utile,
         * bottom half adds two.
         */
        if (x & utile_w)
                offset += 64;
        if (y & utile_h)
                offset += 128;

        offset += v3d_get_utile_pixel_offset(cpp,
                                             x & (utile_w - 1),
                                             y & (utile_h - 1));

        return offset;
}
134
/**
 * get_pixel_offset callback for UBLINEAR with rows two UIF blocks wide.
 *
 * \p image_h is unused; it only exists to match the callback signature.
 */
static inline uint32_t
v3d_get_ublinear_2_column_pixel_offset(uint32_t cpp, uint32_t image_h,
                                       uint32_t x, uint32_t y)
{
        const int columns = 2;

        return v3d_get_ublinear_pixel_offset(cpp, x, y, columns);
}
141
/**
 * get_pixel_offset callback for UBLINEAR with rows one UIF block wide.
 *
 * \p image_h is unused; it only exists to match the callback signature.
 */
static inline uint32_t
v3d_get_ublinear_1_column_pixel_offset(uint32_t cpp, uint32_t image_h,
                                       uint32_t x, uint32_t y)
{
        const int columns = 1;

        return v3d_get_ublinear_pixel_offset(cpp, x, y, columns);
}
148
/**
 * Byte offset of the pixel at (\p x, \p y) in a UIF layout.
 *
 * UIF is the general VC5 tiling layout shared across 3D, media, and scanout.
 * Pixels are stored in UIF blocks (2x2 utiles, 256 bytes each).  Blocks are
 * numbered within columns that are four blocks wide, and \p image_h (rounded
 * up to a whole number of blocks) determines how many block rows a column
 * holds.
 *
 * When \p do_xor is set, bit 4 of the block row is flipped in every
 * odd-numbered column.
 */
static inline uint32_t
v3d_get_uif_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y,
                         bool do_xor)
{
        const uint32_t utile_w = v3d_utile_width(cpp);
        const uint32_t utile_h = v3d_utile_height(cpp);

        /* UIF block dimensions (two utiles each way), expressed as shifts. */
        const uint32_t block_w_shift = ffs(utile_w * 2) - 1;
        const uint32_t block_h_shift = ffs(utile_h * 2) - 1;

        /* Block coordinates of the pixel. */
        const uint32_t block_x = x >> block_w_shift;
        uint32_t block_y = y >> block_h_shift;

        /* Pixel coordinates relative to the block's origin.  These must be
         * taken before block_y is possibly XORed below.
         */
        const uint32_t in_block_x = x - (block_x << block_w_shift);
        const uint32_t in_block_y = y - (block_y << block_h_shift);

        const uint32_t column = block_x / 4;

        if (do_xor && (column & 1))
                block_y ^= 0x10;

        const uint32_t blocks_high =
                align(image_h, 1 << block_h_shift) >> block_h_shift;
        const uint32_t block_index =
                column * ((blocks_high - 1) * 4) + block_x + block_y * 4;

        /* Docs have this in pixels, we do bytes here. */
        uint32_t offset = block_index * 256;

        /* Select the utile inside the block: bottom row adds two utiles,
         * right column adds one.
         */
        if (in_block_y >= utile_h)
                offset += 128;
        if (in_block_x >= utile_w)
                offset += 64;

        offset += v3d_get_utile_pixel_offset(cpp,
                                             in_block_x & (utile_w - 1),
                                             in_block_y & (utile_h - 1));

        return offset;
}
199
/** get_pixel_offset callback for the UIF layout with the XOR step enabled. */
static inline uint32_t
v3d_get_uif_xor_pixel_offset(uint32_t cpp, uint32_t image_h,
                             uint32_t x, uint32_t y)
{
        const bool do_xor = true;

        return v3d_get_uif_pixel_offset(cpp, image_h, x, y, do_xor);
}
206
/** get_pixel_offset callback for the UIF layout with the XOR step disabled. */
static inline uint32_t
v3d_get_uif_no_xor_pixel_offset(uint32_t cpp, uint32_t image_h,
                                uint32_t x, uint32_t y)
{
        const bool do_xor = false;

        return v3d_get_uif_pixel_offset(cpp, image_h, x, y, do_xor);
}
213
214 /* Loads/stores non-utile-aligned boxes by walking over the destination
215 * rectangle, computing the address on the GPU, and storing/loading a pixel at
216 * a time.
217 */
218 static inline void
219 v3d_move_pixels_unaligned(void *gpu, uint32_t gpu_stride,
220 void *cpu, uint32_t cpu_stride,
221 int cpp, uint32_t image_h,
222 const struct pipe_box *box,
223 uint32_t (*get_pixel_offset)(uint32_t cpp,
224 uint32_t image_h,
225 uint32_t x, uint32_t y),
226 bool is_load)
227 {
228 for (uint32_t y = 0; y < box->height; y++) {
229 void *cpu_row = cpu + y * cpu_stride;
230
231 for (int x = 0; x < box->width; x++) {
232 uint32_t pixel_offset = get_pixel_offset(cpp, image_h,
233 box->x + x,
234 box->y + y);
235
236 if (false) {
237 fprintf(stderr, "%3d,%3d -> %d\n",
238 box->x + x, box->y + y,
239 pixel_offset);
240 }
241
242 if (is_load) {
243 memcpy(cpu_row + x * cpp,
244 gpu + pixel_offset,
245 cpp);
246 } else {
247 memcpy(gpu + pixel_offset,
248 cpu_row + x * cpp,
249 cpp);
250 }
251 }
252 }
253 }
254
255 /* Breaks the image down into utiles and calls either the fast whole-utile
256 * load/store functions, or the unaligned fallback case.
257 */
258 static inline void
259 v3d_move_pixels_general_percpp(void *gpu, uint32_t gpu_stride,
260 void *cpu, uint32_t cpu_stride,
261 int cpp, uint32_t image_h,
262 const struct pipe_box *box,
263 uint32_t (*get_pixel_offset)(uint32_t cpp,
264 uint32_t image_h,
265 uint32_t x, uint32_t y),
266 bool is_load)
267 {
268 uint32_t utile_w = v3d_utile_width(cpp);
269 uint32_t utile_h = v3d_utile_height(cpp);
270 uint32_t utile_gpu_stride = utile_w * cpp;
271 uint32_t x1 = box->x;
272 uint32_t y1 = box->y;
273 uint32_t x2 = box->x + box->width;
274 uint32_t y2 = box->y + box->height;
275 uint32_t align_x1 = align(x1, utile_w);
276 uint32_t align_y1 = align(y1, utile_h);
277 uint32_t align_x2 = x2 & ~(utile_w - 1);
278 uint32_t align_y2 = y2 & ~(utile_h - 1);
279
280 /* Load/store all the whole utiles first. */
281 for (uint32_t y = align_y1; y < align_y2; y += utile_h) {
282 void *cpu_row = cpu + (y - box->y) * cpu_stride;
283
284 for (uint32_t x = align_x1; x < align_x2; x += utile_w) {
285 void *utile_gpu = (gpu +
286 get_pixel_offset(cpp, image_h, x, y));
287 void *utile_cpu = cpu_row + (x - box->x) * cpp;
288
289 if (is_load) {
290 v3d_load_utile(utile_cpu, cpu_stride,
291 utile_gpu, utile_gpu_stride);
292 } else {
293 v3d_store_utile(utile_gpu, utile_gpu_stride,
294 utile_cpu, cpu_stride);
295 }
296 }
297 }
298
299 /* If there were no aligned utiles in the middle, load/store the whole
300 * thing unaligned.
301 */
302 if (align_y2 <= align_y1 ||
303 align_x2 <= align_x1) {
304 v3d_move_pixels_unaligned(gpu, gpu_stride,
305 cpu, cpu_stride,
306 cpp, image_h,
307 box,
308 get_pixel_offset, is_load);
309 return;
310 }
311
312 /* Load/store the partial utiles. */
313 struct pipe_box partial_boxes[4] = {
314 /* Top */
315 {
316 .x = x1,
317 .width = x2 - x1,
318 .y = y1,
319 .height = align_y1 - y1,
320 },
321 /* Bottom */
322 {
323 .x = x1,
324 .width = x2 - x1,
325 .y = align_y2,
326 .height = y2 - align_y2,
327 },
328 /* Left */
329 {
330 .x = x1,
331 .width = align_x1 - x1,
332 .y = align_y1,
333 .height = align_y2 - align_y1,
334 },
335 /* Right */
336 {
337 .x = align_x2,
338 .width = x2 - align_x2,
339 .y = align_y1,
340 .height = align_y2 - align_y1,
341 },
342 };
343 for (int i = 0; i < ARRAY_SIZE(partial_boxes); i++) {
344 void *partial_cpu = (cpu +
345 (partial_boxes[i].y - y1) * cpu_stride +
346 (partial_boxes[i].x - x1) * cpp);
347
348 v3d_move_pixels_unaligned(gpu, gpu_stride,
349 partial_cpu, cpu_stride,
350 cpp, image_h,
351 &partial_boxes[i],
352 get_pixel_offset, is_load);
353 }
354 }
355
/* Dispatches to v3d_move_pixels_general_percpp() with \p cpp replaced by a
 * literal constant in each case, so every call site passes a compile-time
 * constant cpp to the inline helper.
 *
 * Only cpp values of 1, 2, 4, 8 and 16 are supported, matching
 * v3d_utile_width() / v3d_utile_height(); any other value is a caller bug
 * and hits unreachable() instead of silently moving nothing.
 */
static inline void
v3d_move_pixels_general(void *gpu, uint32_t gpu_stride,
                        void *cpu, uint32_t cpu_stride,
                        int cpp, uint32_t image_h,
                        const struct pipe_box *box,
                        uint32_t (*get_pixel_offset)(uint32_t cpp,
                                                     uint32_t image_h,
                                                     uint32_t x, uint32_t y),
                        bool is_load)
{
        switch (cpp) {
        case 1:
                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                               cpu, cpu_stride,
                                               1, image_h, box,
                                               get_pixel_offset,
                                               is_load);
                break;
        case 2:
                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                               cpu, cpu_stride,
                                               2, image_h, box,
                                               get_pixel_offset,
                                               is_load);
                break;
        case 4:
                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                               cpu, cpu_stride,
                                               4, image_h, box,
                                               get_pixel_offset,
                                               is_load);
                break;
        case 8:
                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                               cpu, cpu_stride,
                                               8, image_h, box,
                                               get_pixel_offset,
                                               is_load);
                break;
        case 16:
                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                               cpu, cpu_stride,
                                               16, image_h, box,
                                               get_pixel_offset,
                                               is_load);
                break;
        default:
                unreachable("unknown cpp");
        }
}
404
405 static inline void
406 v3d_move_tiled_image(void *gpu, uint32_t gpu_stride,
407 void *cpu, uint32_t cpu_stride,
408 enum v3d_tiling_mode tiling_format,
409 int cpp,
410 uint32_t image_h,
411 const struct pipe_box *box,
412 bool is_load)
413 {
414 switch (tiling_format) {
415 case VC5_TILING_UIF_XOR:
416 v3d_move_pixels_general(gpu, gpu_stride,
417 cpu, cpu_stride,
418 cpp, image_h, box,
419 v3d_get_uif_xor_pixel_offset,
420 is_load);
421 break;
422 case VC5_TILING_UIF_NO_XOR:
423 v3d_move_pixels_general(gpu, gpu_stride,
424 cpu, cpu_stride,
425 cpp, image_h, box,
426 v3d_get_uif_no_xor_pixel_offset,
427 is_load);
428 break;
429 case VC5_TILING_UBLINEAR_2_COLUMN:
430 v3d_move_pixels_general(gpu, gpu_stride,
431 cpu, cpu_stride,
432 cpp, image_h, box,
433 v3d_get_ublinear_2_column_pixel_offset,
434 is_load);
435 break;
436 case VC5_TILING_UBLINEAR_1_COLUMN:
437 v3d_move_pixels_general(gpu, gpu_stride,
438 cpu, cpu_stride,
439 cpp, image_h, box,
440 v3d_get_ublinear_1_column_pixel_offset,
441 is_load);
442 break;
443 case VC5_TILING_LINEARTILE:
444 v3d_move_pixels_general(gpu, gpu_stride,
445 cpu, cpu_stride,
446 cpp, image_h, box,
447 v3d_get_lt_pixel_offset,
448 is_load);
449 break;
450 default:
451 unreachable("Unsupported tiling format");
452 break;
453 }
454 }
455
456 /**
457 * Loads pixel data from the start (microtile-aligned) box in \p src to the
458 * start of \p dst according to the given tiling format.
459 */
460 void
461 v3d_load_tiled_image(void *dst, uint32_t dst_stride,
462 void *src, uint32_t src_stride,
463 enum v3d_tiling_mode tiling_format, int cpp,
464 uint32_t image_h,
465 const struct pipe_box *box)
466 {
467 v3d_move_tiled_image(src, src_stride,
468 dst, dst_stride,
469 tiling_format,
470 cpp,
471 image_h,
472 box,
473 true);
474 }
475
476 /**
477 * Stores pixel data from the start of \p src into a (microtile-aligned) box in
478 * \p dst according to the given tiling format.
479 */
480 void
481 v3d_store_tiled_image(void *dst, uint32_t dst_stride,
482 void *src, uint32_t src_stride,
483 enum v3d_tiling_mode tiling_format, int cpp,
484 uint32_t image_h,
485 const struct pipe_box *box)
486 {
487 v3d_move_tiled_image(dst, dst_stride,
488 src, src_stride,
489 tiling_format,
490 cpp,
491 image_h,
492 box,
493 false);
494 }