/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4.  The utile layout
 * stayed the same, though the way utiles get laid out has changed.
 */
31 v3d_load_utile(void *cpu
, uint32_t cpu_stride
,
32 void *gpu
, uint32_t gpu_stride
)
34 #if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
35 if (gpu_stride
== 8) {
37 /* Load from the GPU in one shot, no interleave, to
40 "vldm %[gpu], {q0, q1, q2, q3}\n"
41 /* Store each 8-byte line to cpu-side destination,
42 * incrementing it by the stride each time.
44 "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
45 "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
46 "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
47 "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
48 "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
49 "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
50 "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
51 "vst1.8 d7, [%[cpu]]\n"
55 [cpu_stride
] "r"(cpu_stride
)
56 : "q0", "q1", "q2", "q3");
58 } else if (gpu_stride
== 16) {
60 /* Load from the GPU in one shot, no interleave, to
63 "vldm %[gpu], {q0, q1, q2, q3};\n"
64 /* Store each 16-byte line in 2 parts to the cpu-side
65 * destination. (vld1 can only store one d-register
68 "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
69 "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
70 "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
71 "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
72 "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
73 "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
74 "vst1.8 d6, [%[cpu]]\n"
75 "vst1.8 d7, [%[cpu2]]\n"
80 [cpu_stride
] "r"(cpu_stride
)
81 : "q0", "q1", "q2", "q3");
84 #elif defined (PIPE_ARCH_AARCH64)
85 if (gpu_stride
== 8) {
87 /* Load from the GPU in one shot, no interleave, to
90 "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
91 /* Store each 8-byte line to cpu-side destination,
92 * incrementing it by the stride each time.
94 "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
95 "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
96 "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
97 "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
98 "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
99 "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
100 "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
101 "st1 {v3.D}[1], [%[cpu]]\n"
105 [cpu_stride
] "r"(cpu_stride
)
106 : "v0", "v1", "v2", "v3");
108 } else if (gpu_stride
== 16) {
110 /* Load from the GPU in one shot, no interleave, to
113 "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
114 /* Store each 16-byte line in 2 parts to the cpu-side
115 * destination. (vld1 can only store one d-register
118 "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
119 "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
120 "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
121 "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
122 "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
123 "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
124 "st1 {v3.D}[0], [%[cpu]]\n"
125 "st1 {v3.D}[1], [%[cpu2]]\n"
130 [cpu_stride
] "r"(cpu_stride
)
131 : "v0", "v1", "v2", "v3");
136 for (uint32_t gpu_offset
= 0; gpu_offset
< 64; gpu_offset
+= gpu_stride
) {
137 memcpy(cpu
, gpu
+ gpu_offset
, gpu_stride
);
143 v3d_store_utile(void *gpu
, uint32_t gpu_stride
,
144 void *cpu
, uint32_t cpu_stride
)
146 #if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
147 if (gpu_stride
== 8) {
149 /* Load each 8-byte line from cpu-side source,
150 * incrementing it by the stride each time.
152 "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
153 "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
154 "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
155 "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
156 "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
157 "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
158 "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
159 "vld1.8 d7, [%[cpu]]\n"
160 /* Load from the GPU in one shot, no interleave, to
163 "vstm %[gpu], {q0, q1, q2, q3}\n"
167 [cpu_stride
] "r"(cpu_stride
)
168 : "q0", "q1", "q2", "q3");
170 } else if (gpu_stride
== 16) {
172 /* Load each 16-byte line in 2 parts from the cpu-side
173 * destination. (vld1 can only store one d-register
176 "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
177 "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
178 "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
179 "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
180 "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
181 "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
182 "vld1.8 d6, [%[cpu]]\n"
183 "vld1.8 d7, [%[cpu2]]\n"
184 /* Store to the GPU in one shot, no interleave. */
185 "vstm %[gpu], {q0, q1, q2, q3}\n"
190 [cpu_stride
] "r"(cpu_stride
)
191 : "q0", "q1", "q2", "q3");
194 #elif defined (PIPE_ARCH_AARCH64)
195 if (gpu_stride
== 8) {
197 /* Load each 8-byte line from cpu-side source,
198 * incrementing it by the stride each time.
200 "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
201 "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
202 "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
203 "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
204 "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
205 "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
206 "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
207 "ld1 {v3.D}[1], [%[cpu]]\n"
208 /* Store to the GPU in one shot, no interleave. */
209 "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
213 [cpu_stride
] "r"(cpu_stride
)
214 : "v0", "v1", "v2", "v3");
216 } else if (gpu_stride
== 16) {
218 /* Load each 16-byte line in 2 parts from the cpu-side
219 * destination. (vld1 can only store one d-register
222 "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
223 "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
224 "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
225 "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
226 "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
227 "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
228 "ld1 {v3.D}[0], [%[cpu]]\n"
229 "ld1 {v3.D}[1], [%[cpu2]]\n"
230 /* Store to the GPU in one shot, no interleave. */
231 "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
236 [cpu_stride
] "r"(cpu_stride
)
237 : "v0", "v1", "v2", "v3");
242 for (uint32_t gpu_offset
= 0; gpu_offset
< 64; gpu_offset
+= gpu_stride
) {
243 memcpy(gpu
+ gpu_offset
, cpu
, gpu_stride
);