v3d: we always have at least one output segment
[mesa.git] / src / broadcom / common / v3d_cpu_tiling.h
1 /*
2 * Copyright © 2017 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file v3d_cpu_tiling.h
25 *
26 * Contains load/store functions common to both v3d and vc4. The utile layout
27 * stayed the same, though the way utiles get laid out has changed.
28 */
29
/**
 * Copies one 64-byte utile out of GPU memory into a CPU-side buffer.
 *
 * The utile is read from \p gpu as 64 contiguous bytes, interpreted as
 * (64 / gpu_stride) rows of gpu_stride bytes each.  Row i is written to
 * cpu + i * cpu_stride.
 *
 * \param cpu         Destination of the first utile row.
 * \param cpu_stride  Distance in bytes between consecutive destination rows.
 * \param gpu         Start of the 64-byte source utile.
 * \param gpu_stride  Bytes per utile row.  Strides of 8 and 16 use the NEON
 *                    paths when those are compiled in; any other value goes
 *                    through the memcpy loop.  Must be non-zero (the loop
 *                    advances by this amount), and is presumably expected to
 *                    divide 64 evenly -- otherwise the last row copy reads
 *                    past the 64-byte utile.
 */
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *gpu and writes *cpu
                         * through pointers the compiler cannot see into.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        /* The post-index register of st1 is a 64-bit X register, so widen
         * the 32-bit stride explicitly rather than relying on the upper half
         * of whatever register the compiler picks being zero.
         */
        const uint64_t stride64 = cpu_stride;

        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(stride64)
                        /* "memory": the asm reads *gpu and writes *cpu
                         * through pointers the compiler cannot see into.
                         */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination, one 64-bit lane at a time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(stride64)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable fallback: copy the utile one row at a time.  Byte
         * pointers are used because arithmetic on void * is a GNU extension.
         */
        const uint8_t *src = gpu;
        uint8_t *dst = cpu;

        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(dst, src + gpu_offset, gpu_stride);
                dst += cpu_stride;
        }
}
139
/**
 * Copies one 64-byte utile from a CPU-side buffer into GPU memory.
 *
 * The utile is written to \p gpu as 64 contiguous bytes, interpreted as
 * (64 / gpu_stride) rows of gpu_stride bytes each.  Row i is read from
 * cpu + i * cpu_stride.
 *
 * \param gpu         Start of the 64-byte destination utile.
 * \param gpu_stride  Bytes per utile row.  Strides of 8 and 16 use the NEON
 *                    paths when those are compiled in; any other value goes
 *                    through the memcpy loop.  Must be non-zero (the loop
 *                    advances by this amount), and is presumably expected to
 *                    divide 64 evenly -- otherwise the last row copy writes
 *                    past the 64-byte utile.
 * \param cpu         Source of the first utile row.
 * \param cpu_stride  Distance in bytes between consecutive source rows.
 */
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave, from
                         * d0-d7.
                         */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *cpu and writes *gpu
                         * through pointers the compiler cannot see into.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        /* The post-index register of ld1 is a 64-bit X register, so widen
         * the 32-bit stride explicitly rather than relying on the upper half
         * of whatever register the compiler picks being zero.
         */
        const uint64_t stride64 = cpu_stride;

        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(stride64)
                        /* "memory": the asm reads *cpu and writes *gpu
                         * through pointers the compiler cannot see into.
                         */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source, one 64-bit lane at a time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(stride64)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable fallback: copy the utile one row at a time.  Byte
         * pointers are used because arithmetic on void * is a GNU extension.
         */
        uint8_t *dst = gpu;
        const uint8_t *src = cpu;

        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(dst + gpu_offset, src, gpu_stride);
                src += cpu_stride;
        }
}