/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file vc4_tiling_lt.c
 *
 * Helper functions from vc4_tiling.c that are compiled both with and without
 * NEON assembly.
 *
 * If V3D_BUILD_NEON is set, then the functions will be suffixed with _neon.
 * They will only use NEON assembly if __ARM_ARCH is also set, to keep the x86
 * sim build working.
 */

#include <assert.h>
#include <string.h>

#include "pipe/p_state.h"
#include "vc4_tiling.h"
#include "broadcom/common/v3d_cpu_tiling.h"

#ifdef V3D_BUILD_NEON
#define NEON_TAG(x) x ## _neon
#else
#define NEON_TAG(x) x ## _base
#endif

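/* For example, NEON_TAG(vc4_load_lt_image) expands to vc4_load_lt_image_neon
 * in the NEON build and vc4_load_lt_image_base otherwise, so both variants
 * can be compiled from this one source file.  (The runtime choice between
 * them is assumed to live in the wrappers in vc4_tiling.h, keyed on CPU NEON
 * support.)
 */
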
static inline uint32_t
align_down(uint32_t val, uint32_t align)
{
        return val & ~(align - 1);
}

/** Returns the stride in bytes of a 64-byte microtile. */
static uint32_t
vc4_utile_stride(int cpp)
{
        switch (cpp) {
        case 1:
                return 8;
        case 2:
        case 4:
        case 8:
                return 16;
        default:
                unreachable("bad cpp");
        }
}

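/* Worked example (illustrative, using the utile dimensions from the swizzle
 * comments below): at cpp=4 a utile is 4x4 pixels, so one row of it is
 * 4 pixels * 4 bytes = 16 bytes; at cpp=1 a utile is 8x8 pixels, so a row is
 * 8 * 1 = 8 bytes.  Either way the full utile is 64 bytes.
 */
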
/**
 * Returns the X value into the address bits for LT tiling.
 *
 * The LT tile load/stores rely on the X bits not intersecting with the Y
 * bits.  Because of this, we have to choose to put the utile index within the
 * LT tile into one of the two values, and we do so in swizzle_lt_x() to make
 * NPOT handling easier.
 */
static uint32_t
swizzle_lt_x(int x, int cpp)
{
        switch (cpp) {
        case 1:
                /* 8x8 inside of 4x4 */
                return ((x & 0x7) << (0 - 0) |
                        (x & ~0x7) << (6 - 3));
        case 2:
                /* 8x4 inside of 4x4 */
                return ((x & 0x7) << (1 - 0) |
                        (x & ~0x7) << (6 - 3));
        case 4:
                /* 4x4 inside of 4x4 */
                return ((x & 0x3) << (2 - 0) |
                        (x & ~0x3) << (6 - 2));
        case 8:
                /* 2x4 inside of 4x4 */
                return ((x & 0x1) << (3 - 0) |
                        (x & ~0x1) << (6 - 1));
        default:
                unreachable("bad cpp");
        }
}

/**
 * Returns the Y value into the address bits for LT tiling.
 *
 * The LT tile load/stores rely on the X bits not intersecting with the Y
 * bits.
 */
static uint32_t
swizzle_lt_y(int y, int cpp)
{
        switch (cpp) {
        case 1:
                /* 8x8 inside of 4x4 */
                return ((y & 0x7) << 3);
        case 2:
                /* 8x4 inside of 4x4 */
                return ((y & 0x3) << 4);
        case 4:
                /* 4x4 inside of 4x4 */
                return ((y & 0x3) << 4);
        case 8:
                /* 2x4 inside of 4x4 */
                return ((y & 0x3) << 4);
        default:
                unreachable("bad cpp");
        }
}

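/* Worked example of the two swizzles (illustrative): at cpp=4, a pixel's byte
 * address in the LT layout is swizzle_lt_x(x, 4) + swizzle_lt_y(y, 4).
 * swizzle_lt_x() puts the pixel column in bits [3:2] and the utile index in
 * bits [6+], while swizzle_lt_y() puts the row in bits [5:4], so the X and Y
 * contributions never share bits.  For x = 5, y = 2:
 *
 *   swizzle_lt_x(5, 4) = (5 & 0x3) << 2 | (5 & ~0x3) << 4 = 4 + 64 = 68
 *   swizzle_lt_y(2, 4) = (2 & 0x3) << 4 = 32
 *
 * i.e. 64 bytes to skip utile 0, plus column offset 4 and row offset 32
 * within utile 1.
 */
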
/**
 * Helper for loading or storing to an LT image, where the box is aligned
 * to utiles.
 *
 * This just breaks the box down into calls to the fast
 * v3d_load_utile()/v3d_store_utile() helpers.
 */
static inline void
vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride,
                     void *cpu, uint32_t cpu_stride,
                     int cpp, const struct pipe_box *box, bool to_cpu)
{
        uint32_t utile_w = vc4_utile_width(cpp);
        uint32_t utile_h = vc4_utile_height(cpp);
        uint32_t utile_stride = vc4_utile_stride(cpp);
        uint32_t xstart = box->x;
        uint32_t ystart = box->y;

        for (uint32_t y = 0; y < box->height; y += utile_h) {
                for (uint32_t x = 0; x < box->width; x += utile_w) {
                        void *gpu_tile = gpu + ((ystart + y) * gpu_stride +
                                                (xstart + x) * 64 / utile_w);
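                        /* Illustrative: each utile is 64 bytes, so stepping
                         * x by utile_w pixels advances the tiled address by
                         * 64 bytes, and stepping y by utile_h rows advances
                         * it by utile_h * gpu_stride, one full row of utiles.
                         */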
                        if (to_cpu) {
                                v3d_load_utile(cpu + (cpu_stride * y +
                                                      x * cpp),
                                               cpu_stride,
                                               gpu_tile,
                                               utile_stride);
                        } else {
                                v3d_store_utile(gpu_tile,
                                                utile_stride,
                                                cpu + (cpu_stride * y +
                                                       x * cpp),
                                                cpu_stride);
                        }
                }
        }
}

/**
 * Helper for loading or storing to an LT image, where the box is not aligned
 * to utiles.
 *
 * This walks through the raster-order data, copying to/from the corresponding
 * tiled pixel.  This means we don't get write-combining on stores, but the
 * loop is very few CPU instructions since the memcpy will be inlined.
 */
static inline void
vc4_lt_image_unaligned(void *gpu, uint32_t gpu_stride,
                       void *cpu, uint32_t cpu_stride,
                       int cpp, const struct pipe_box *box, bool to_cpu)
{
        /* These are the address bits for the start of the box, split out into
         * x/y so that they can be incremented separately in their loops.
         */
        uint32_t offs_x0 = swizzle_lt_x(box->x, cpp);
        uint32_t offs_y = swizzle_lt_y(box->y, cpp);
        /* The *_mask values are "what bits of the address are from x or y" */
        uint32_t x_mask = swizzle_lt_x(~0, cpp);
        uint32_t y_mask = swizzle_lt_y(~0, cpp);
        uint32_t incr_y = swizzle_lt_x(gpu_stride / cpp, cpp);

        assert(!(x_mask & y_mask));

        offs_x0 += incr_y * (box->y / vc4_utile_height(cpp));
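        /* Illustrative: incr_y is the x-swizzle of one full row of pixels,
         * i.e. one row of utiles.  At cpp=4 with gpu_stride = 256, it is
         * swizzle_lt_x(64, 4) = (64 & ~0x3) << 4 = 1024 =
         * vc4_utile_height(4) * gpu_stride bytes, so the line above skips
         * whole utile rows to reach the one containing box->y.
         */
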
        for (uint32_t y = 0; y < box->height; y++) {
                void *gpu_row = gpu + offs_y;

                uint32_t offs_x = offs_x0;

                for (uint32_t x = 0; x < box->width; x++) {
                        /* Use a memcpy here to move a pixel's worth of data.
                         * We're relying on this function to be inlined, so
                         * this will get expanded into the appropriate 1, 2,
                         * 4, or 8-byte move.
                         */
                        if (to_cpu) {
                                memcpy(cpu + x * cpp, gpu_row + offs_x, cpp);
                        } else {
                                memcpy(gpu_row + offs_x, cpu + x * cpp, cpp);
                        }

                        /* This math trick with x_mask increments offs_x by 1
                         * in x.
                         */
                        offs_x = (offs_x - x_mask) & x_mask;
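                        /* Illustrative: subtracting the mask adds 1 in the
                         * masked bit positions, with the carry skipping the
                         * Y bits.  At cpp=4, x_mask == 0xffffffcc, so
                         * offs_x == 0xc (x == 3, the last column of a utile)
                         * steps to (0xc - 0xffffffcc) & 0xffffffcc == 0x40,
                         * the first column of the next utile, 64 bytes in.
                         */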
                }

                offs_y = (offs_y - y_mask) & y_mask;
                /* When offs_y wraps (we hit the end of the utile), we
                 * increment offs_x0 by effectively the utile stride.
                 */
                if (!offs_y)
                        offs_x0 += incr_y;

                cpu += cpu_stride;
        }
}

/**
 * General LT image load/store helper.
 */
static inline void
vc4_lt_image_helper(void *gpu, uint32_t gpu_stride,
                    void *cpu, uint32_t cpu_stride,
                    int cpp, const struct pipe_box *box, bool to_cpu)
{
        if (box->x & (vc4_utile_width(cpp) - 1) ||
            box->y & (vc4_utile_height(cpp) - 1) ||
            box->width & (vc4_utile_width(cpp) - 1) ||
            box->height & (vc4_utile_height(cpp) - 1)) {
                vc4_lt_image_unaligned(gpu, gpu_stride,
                                       cpu, cpu_stride,
                                       cpp, box, to_cpu);
        } else {
                vc4_lt_image_aligned(gpu, gpu_stride,
                                     cpu, cpu_stride,
                                     cpp, box, to_cpu);
        }
}

/**
 * Dispatches to vc4_lt_image_helper() with a constant cpp, so that the
 * per-pixel memcpy in the unaligned path can inline to a fixed-size move.
 */
static inline void
vc4_lt_image_cpp_helper(void *gpu, uint32_t gpu_stride,
                        void *cpu, uint32_t cpu_stride,
                        int cpp, const struct pipe_box *box, bool to_cpu)
{
        switch (cpp) {
        case 1:
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 1, box,
                                    to_cpu);
                break;
        case 2:
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 2, box,
                                    to_cpu);
                break;
        case 4:
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 4, box,
                                    to_cpu);
                break;
        case 8:
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 8, box,
                                    to_cpu);
                break;
        default:
                unreachable("bad cpp");
        }
}

void
NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
                            void *src, uint32_t src_stride,
                            int cpp, const struct pipe_box *box)
{
        vc4_lt_image_cpp_helper(src, src_stride, dst, dst_stride, cpp, box,
                                true);
}

void
NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
                             void *src, uint32_t src_stride,
                             int cpp, const struct pipe_box *box)
{
        vc4_lt_image_cpp_helper(dst, dst_stride, src, src_stride, cpp, box,
                                false);
}

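/* Usage sketch (hypothetical caller, with made-up buffer names): copying a
 * 16x16 region of a cpp=4 LT texture out to a linear staging buffer might
 * look roughly like:
 *
 *   struct pipe_box box = { .x = 0, .y = 0, .width = 16, .height = 16 };
 *   vc4_load_lt_image(staging, 16 * 4, tiled, tiled_stride, 4, &box);
 *
 * assuming the dispatching vc4_load_lt_image() wrapper from vc4_tiling.h.
 */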