/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file vc4_tiling_lt.c
 *
 * Helper functions from vc4_tiling.c that will be compiled with or without
 * NEON assembly.
 *
 * If VC4_BUILD_NEON is set, then the functions will be suffixed with _neon.
 * They will only use NEON assembly if __ARM_ARCH is also set, to keep the x86
 * simulator build working.
 */

#include <string.h>

#include "pipe/p_state.h"
#include "vc4_tiling.h"

#ifdef VC4_BUILD_NEON
#define NEON_TAG(x) x ## _neon
#else
#define NEON_TAG(x) x ## _base
#endif

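/* For example, NEON_TAG(vc4_load_lt_image) expands to
 * vc4_load_lt_image_neon in the NEON build and to vc4_load_lt_image_base
 * otherwise, so both variants of each entry point can coexist in the driver.
 */
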
static inline uint32_t
align_down(uint32_t val, uint32_t align)
{
        return val & ~(align - 1);
}

/** Returns the stride in bytes of a 64-byte microtile. */
static uint32_t
vc4_utile_stride(int cpp)
{
        switch (cpp) {
        case 1:
                return 8;
        case 2:
        case 4:
        case 8:
                return 16;
        default:
                unreachable("bad cpp");
        }
}

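/* For reference, with 64-byte utiles these strides correspond to utile
 * dimensions of 8x8 pixels (cpp 1), 8x4 (cpp 2), 4x4 (cpp 4), and 2x4
 * (cpp 8), matching vc4_utile_width()/vc4_utile_height().
 */
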
static void
vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
{
        uint32_t gpu_stride = vc4_utile_stride(cpp);
#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %0, {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%1], %2\n"
                        "vst1.8 d1, [%1], %2\n"
                        "vst1.8 d2, [%1], %2\n"
                        "vst1.8 d3, [%1], %2\n"
                        "vst1.8 d4, [%1], %2\n"
                        "vst1.8 d5, [%1], %2\n"
                        "vst1.8 d6, [%1], %2\n"
                        "vst1.8 d7, [%1]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %0, {q0, q1, q2, q3};\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%1], %3\n"
                        "vst1.8 d1, [%2], %3\n"
                        "vst1.8 d2, [%1], %3\n"
                        "vst1.8 d3, [%2], %3\n"
                        "vst1.8 d4, [%1], %3\n"
                        "vst1.8 d5, [%2], %3\n"
                        "vst1.8 d6, [%1]\n"
                        "vst1.8 d7, [%2]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%1], %2\n"
                        "st1 {v0.D}[1], [%1], %2\n"
                        "st1 {v1.D}[0], [%1], %2\n"
                        "st1 {v1.D}[1], [%1], %2\n"
                        "st1 {v2.D}[0], [%1], %2\n"
                        "st1 {v2.D}[1], [%1], %2\n"
                        "st1 {v3.D}[0], [%1], %2\n"
                        "st1 {v3.D}[1], [%1]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (st1 can only store one d-register
                         * at a time).
                         */
                        "st1 {v0.D}[0], [%1], %3\n"
                        "st1 {v0.D}[1], [%2], %3\n"
                        "st1 {v1.D}[0], [%1], %3\n"
                        "st1 {v1.D}[1], [%2], %3\n"
                        "st1 {v2.D}[0], [%1], %3\n"
                        "st1 {v2.D}[1], [%2], %3\n"
                        "st1 {v3.D}[0], [%1]\n"
                        "st1 {v3.D}[1], [%2]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        }
#else
        /* Generic fallback: copy one utile row at a time. */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
#endif
}

static void
vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
{
        uint32_t gpu_stride = vc4_utile_stride(cpp);

#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%1], %2\n"
                        "vld1.8 d1, [%1], %2\n"
                        "vld1.8 d2, [%1], %2\n"
                        "vld1.8 d3, [%1], %2\n"
                        "vld1.8 d4, [%1], %2\n"
                        "vld1.8 d5, [%1], %2\n"
                        "vld1.8 d6, [%1], %2\n"
                        "vld1.8 d7, [%1]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %0, {q0, q1, q2, q3}\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register at a
                         * time).
                         */
                        "vld1.8 d0, [%1], %3\n"
                        "vld1.8 d1, [%2], %3\n"
                        "vld1.8 d2, [%1], %3\n"
                        "vld1.8 d3, [%2], %3\n"
                        "vld1.8 d4, [%1], %3\n"
                        "vld1.8 d5, [%2], %3\n"
                        "vld1.8 d6, [%1]\n"
                        "vld1.8 d7, [%2]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %0, {q0, q1, q2, q3}\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%1], %2\n"
                        "ld1 {v0.D}[1], [%1], %2\n"
                        "ld1 {v1.D}[0], [%1], %2\n"
                        "ld1 {v1.D}[1], [%1], %2\n"
                        "ld1 {v2.D}[0], [%1], %2\n"
                        "ld1 {v2.D}[1], [%1], %2\n"
                        "ld1 {v3.D}[0], [%1], %2\n"
                        "ld1 {v3.D}[1], [%1]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        } else {
                assert(gpu_stride == 16);
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (ld1 can only load one d-register at a
                         * time).
                         */
                        "ld1 {v0.D}[0], [%1], %3\n"
                        "ld1 {v0.D}[1], [%2], %3\n"
                        "ld1 {v1.D}[0], [%1], %3\n"
                        "ld1 {v1.D}[1], [%2], %3\n"
                        "ld1 {v2.D}[0], [%1], %3\n"
                        "ld1 {v2.D}[1], [%2], %3\n"
                        "ld1 {v3.D}[0], [%1]\n"
                        "ld1 {v3.D}[1], [%2]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
        }
#else
        /* Generic fallback: copy one utile row at a time. */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
#endif
}

/**
 * Returns the X value into the address bits for LT tiling.
 *
 * The LT tile load/stores rely on the X bits not intersecting with the Y
 * bits.  Because of this, we have to choose to put the utile index within the
 * LT tile into one of the two values, and we do so in swizzle_lt_x() to make
 * NPOT handling easier.
 */
static uint32_t
swizzle_lt_x(int x, int cpp)
{
        switch (cpp) {
        case 1:
                /* 8x8 inside of 4x4 */
                return ((x & 0x7) << (0 - 0) |
                        (x & ~0x7) << (6 - 3));
        case 2:
                /* 8x4 inside of 4x4 */
                return ((x & 0x7) << (1 - 0) |
                        (x & ~0x7) << (6 - 3));
        case 4:
                /* 4x4 inside of 4x4 */
                return ((x & 0x3) << (2 - 0) |
                        (x & ~0x3) << (6 - 2));
        case 8:
                /* 2x4 inside of 4x4 */
                return ((x & 0x1) << (3 - 0) |
                        (x & ~0x1) << (6 - 1));
        default:
                unreachable("bad cpp");
        }
}

/**
 * Returns the Y value into the address bits for LT tiling.
 *
 * The LT tile load/stores rely on the X bits not intersecting with the Y
 * bits.
 */
static uint32_t
swizzle_lt_y(int y, int cpp)
{
        switch (cpp) {
        case 1:
                /* 8x8 inside of 4x4 */
                return ((y & 0x7) << 3);
        case 2:
                /* 8x4 inside of 4x4 */
                return ((y & 0x3) << 4);
        case 4:
                /* 4x4 inside of 4x4 */
                return ((y & 0x3) << 4);
        case 8:
                /* 2x4 inside of 4x4 */
                return ((y & 0x3) << 4);
        default:
                unreachable("bad cpp");
        }
}

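/* Worked example (illustrative): for cpp == 4 a utile is 4x4 pixels and 64
 * bytes.  swizzle_lt_x(x, 4) puts the column within the utile (x & 0x3, 4
 * bytes per pixel) in address bits [3:2], and shifts the utile index up so
 * that each whole utile to the right advances the address by 64 bytes.
 * swizzle_lt_y(y, 4) puts the row within the utile in bits [5:4], i.e. steps
 * of the 16-byte utile stride, so the X and Y contributions never share bits
 * and can simply be added together.
 */
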
/**
 * Helper for loading or storing to an LT image, where the box is aligned
 * to utile boundaries.
 *
 * This just breaks the box down into calls to the fast
 * vc4_load_utile/vc4_store_utile helpers.
 */
static inline void
vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride,
                     void *cpu, uint32_t cpu_stride,
                     int cpp, const struct pipe_box *box, bool to_cpu)
{
        uint32_t utile_w = vc4_utile_width(cpp);
        uint32_t utile_h = vc4_utile_height(cpp);
        uint32_t xstart = box->x;
        uint32_t ystart = box->y;

        for (uint32_t y = 0; y < box->height; y += utile_h) {
                for (uint32_t x = 0; x < box->width; x += utile_w) {
                        /* Each utile is 64 bytes, so moving utile_w pixels
                         * in x moves 64 bytes in the tiled layout.
                         */
                        void *gpu_tile = gpu + ((ystart + y) * gpu_stride +
                                                (xstart + x) * 64 / utile_w);
                        if (to_cpu) {
                                vc4_load_utile(cpu + (cpu_stride * y +
                                                      x * cpp),
                                               gpu_tile,
                                               cpu_stride, cpp);
                        } else {
                                vc4_store_utile(gpu_tile,
                                                cpu + (cpu_stride * y +
                                                       x * cpp),
                                                cpu_stride, cpp);
                        }
                }
        }
}

/**
 * Helper for loading or storing to an LT image, where the box is not aligned
 * to utile boundaries.
 *
 * This walks through the raster-order data, copying to/from the corresponding
 * tiled pixel.  This means we don't get write-combining on stores, but the
 * loop is very few CPU instructions since the memcpy will be inlined.
 */
static inline void
vc4_lt_image_unaligned(void *gpu, uint32_t gpu_stride,
                       void *cpu, uint32_t cpu_stride,
                       int cpp, const struct pipe_box *box, bool to_cpu)
{
        /* These are the address bits for the start of the box, split out into
         * x/y so that they can be incremented separately in their loops.
         */
        uint32_t offs_x0 = swizzle_lt_x(box->x, cpp);
        uint32_t offs_y = swizzle_lt_y(box->y, cpp);
        /* The *_mask values are "what bits of the address are from x or y" */
        uint32_t x_mask = swizzle_lt_x(~0, cpp);
        uint32_t y_mask = swizzle_lt_y(~0, cpp);
        uint32_t incr_y = swizzle_lt_x(gpu_stride / cpp, cpp);

        assert(!(x_mask & y_mask));

        offs_x0 += incr_y * (box->y / vc4_utile_height(cpp));

        for (uint32_t y = 0; y < box->height; y++) {
                void *gpu_row = gpu + offs_y;

                uint32_t offs_x = offs_x0;

                for (uint32_t x = 0; x < box->width; x++) {
                        /* Use a memcpy here to move a pixel's worth of data.
                         * We're relying on this function to be inlined, so
                         * this will get expanded into the appropriate 1, 2,
                         * 4, or 8-byte move.
                         */
                        if (to_cpu) {
                                memcpy(cpu + x * cpp, gpu_row + offs_x, cpp);
                        } else {
                                memcpy(gpu_row + offs_x, cpu + x * cpp, cpp);
                        }

                        /* This math trick with x_mask increments offs_x by 1
                         * in x.
                         */
                        offs_x = (offs_x - x_mask) & x_mask;
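                        /* Why that works (explanatory note): x_mask has only
                         * the X address bits set, and offs_x never has bits
                         * outside of x_mask.  Subtracting x_mask is the same
                         * as adding (~x_mask + 1), which fills the non-X bit
                         * positions with ones so the +1 carry ripples across
                         * them to the next X bit; the final "& x_mask" clears
                         * those filler bits again.  The net effect is offs_x
                         * advanced by one pixel in x, carrying correctly from
                         * the within-utile X bits into the utile-index bits.
                         */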
                }

                offs_y = (offs_y - y_mask) & y_mask;
                /* When offs_y wraps (we hit the end of the utile), we
                 * increment offs_x0 by effectively the utile stride.
                 */
                if (!offs_y)
                        offs_x0 += incr_y;

                cpu += cpu_stride;
        }
}

/**
 * General LT image load/store helper.
 */
static inline void
vc4_lt_image_helper(void *gpu, uint32_t gpu_stride,
                    void *cpu, uint32_t cpu_stride,
                    int cpp, const struct pipe_box *box, bool to_cpu)
{
        if (box->x & (vc4_utile_width(cpp) - 1) ||
            box->y & (vc4_utile_height(cpp) - 1) ||
            box->width & (vc4_utile_width(cpp) - 1) ||
            box->height & (vc4_utile_height(cpp) - 1)) {
                vc4_lt_image_unaligned(gpu, gpu_stride,
                                       cpu, cpu_stride,
                                       cpp, box, to_cpu);
        } else {
                vc4_lt_image_aligned(gpu, gpu_stride,
                                     cpu, cpu_stride,
                                     cpp, box, to_cpu);
        }
}

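/* Dispatching on cpp below presumably lets the compiler see a constant cpp
 * in each vc4_lt_image_helper() call, so the per-pixel memcpy() and the
 * swizzle math can be fully inlined and specialized per format size.
 */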
static inline void
vc4_lt_image_cpp_helper(void *gpu, uint32_t gpu_stride,
                        void *cpu, uint32_t cpu_stride,
                        int cpp, const struct pipe_box *box, bool to_cpu)
{
        switch (cpp) {
        case 1:
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 1, box,
                                    to_cpu);
                break;
        case 2:
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 2, box,
                                    to_cpu);
                break;
        case 4:
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 4, box,
                                    to_cpu);
                break;
        case 8:
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 8, box,
                                    to_cpu);
                break;
        default:
                unreachable("bad cpp");
        }
}

void
NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
                            void *src, uint32_t src_stride,
                            int cpp, const struct pipe_box *box)
{
        vc4_lt_image_cpp_helper(src, src_stride, dst, dst_stride, cpp, box,
                                true);
}

void
NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
                             void *src, uint32_t src_stride,
                             int cpp, const struct pipe_box *box)
{
        vc4_lt_image_cpp_helper(dst, dst_stride, src, src_stride, cpp, box,
                                false);
}