/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4.  The layout
 * within a utile stayed the same, though the way utiles are laid out across
 * the surface has changed.
 */

#include <stdint.h>
#include <string.h>

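/* Illustrative sketch (a hypothetical helper, not part of the v3d/vc4 code):
 * a utile is always 64 bytes, and the gpu_stride passed to the functions
 * below is the byte width of one utile row.  This assumes the vc4-style
 * utile dimensions of 8x8 pixels at 8bpp, 8x4 at 16bpp and 4x4 at 32bpp;
 * the real drivers derive the row width from their own utile width/height
 * helpers.
 */
static inline uint32_t
v3d_example_utile_row_bytes(uint32_t cpp)
{
        /* 8-byte rows for 8bpp utiles, 16-byte rows for everything wider. */
        return cpp == 1 ? 8 : 16;
}
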
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %0, {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%1], %2\n"
                        "vst1.8 d1, [%1], %2\n"
                        "vst1.8 d2, [%1], %2\n"
                        "vst1.8 d3, [%1], %2\n"
                        "vst1.8 d4, [%1], %2\n"
                        "vst1.8 d5, [%1], %2\n"
                        "vst1.8 d6, [%1], %2\n"
                        "vst1.8 d7, [%1]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %0, {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%1], %3\n"
                        "vst1.8 d1, [%2], %3\n"
                        "vst1.8 d2, [%1], %3\n"
                        "vst1.8 d3, [%2], %3\n"
                        "vst1.8 d4, [%1], %3\n"
                        "vst1.8 d5, [%2], %3\n"
                        "vst1.8 d6, [%1]\n"
                        "vst1.8 d7, [%2]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%1], %2\n"
                        "st1 {v0.D}[1], [%1], %2\n"
                        "st1 {v1.D}[0], [%1], %2\n"
                        "st1 {v1.D}[1], [%1], %2\n"
                        "st1 {v2.D}[0], [%1], %2\n"
                        "st1 {v2.D}[1], [%1], %2\n"
                        "st1 {v3.D}[0], [%1], %2\n"
                        "st1 {v3.D}[1], [%1]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (st1 can only store one 64-bit lane
                         * at a time).
                         */
                        "st1 {v0.D}[0], [%1], %3\n"
                        "st1 {v0.D}[1], [%2], %3\n"
                        "st1 {v1.D}[0], [%1], %3\n"
                        "st1 {v1.D}[1], [%2], %3\n"
                        "st1 {v2.D}[0], [%1], %3\n"
                        "st1 {v2.D}[1], [%2], %3\n"
                        "st1 {v3.D}[0], [%1]\n"
                        "st1 {v3.D}[1], [%2]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}
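
/* Usage sketch (a hypothetical caller, not part of the v3d/vc4 code):
 * detiling a single 64-byte utile of an 8bpp surface into a linear staging
 * buffer.  The names, and the assumption that tiled_utile already points at
 * the first byte of the utile, are made up for illustration; real callers
 * compute that offset from the surface's tiling layout.
 */
static inline void
example_load_8bpp_utile(void *linear, uint32_t linear_stride,
                        void *tiled_utile)
{
        /* An 8bpp utile is 8x8 pixels, so each utile row is 8 bytes. */
        v3d_load_utile(linear, linear_stride, tiled_utile, 8);
}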

static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%1], %2\n"
                        "vld1.8 d1, [%1], %2\n"
                        "vld1.8 d2, [%1], %2\n"
                        "vld1.8 d3, [%1], %2\n"
                        "vld1.8 d4, [%1], %2\n"
                        "vld1.8 d5, [%1], %2\n"
                        "vld1.8 d6, [%1], %2\n"
                        "vld1.8 d7, [%1]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %0, {q0, q1, q2, q3}\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%1], %3\n"
                        "vld1.8 d1, [%2], %3\n"
                        "vld1.8 d2, [%1], %3\n"
                        "vld1.8 d3, [%2], %3\n"
                        "vld1.8 d4, [%1], %3\n"
                        "vld1.8 d5, [%2], %3\n"
                        "vld1.8 d6, [%1]\n"
                        "vld1.8 d7, [%2]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %0, {q0, q1, q2, q3}\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%1], %2\n"
                        "ld1 {v0.D}[1], [%1], %2\n"
                        "ld1 {v1.D}[0], [%1], %2\n"
                        "ld1 {v1.D}[1], [%1], %2\n"
                        "ld1 {v2.D}[0], [%1], %2\n"
                        "ld1 {v2.D}[1], [%1], %2\n"
                        "ld1 {v3.D}[0], [%1], %2\n"
                        "ld1 {v3.D}[1], [%1]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (ld1 can only load one 64-bit lane
                         * at a time).
                         */
                        "ld1 {v0.D}[0], [%1], %3\n"
                        "ld1 {v0.D}[1], [%2], %3\n"
                        "ld1 {v1.D}[0], [%1], %3\n"
                        "ld1 {v1.D}[1], [%2], %3\n"
                        "ld1 {v2.D}[0], [%1], %3\n"
                        "ld1 {v2.D}[1], [%2], %3\n"
                        "ld1 {v3.D}[0], [%1]\n"
                        "ld1 {v3.D}[1], [%2]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
                        :
                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}
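
/* Usage sketch (a hypothetical caller, not part of the v3d/vc4 code): tiling
 * one 4x4-pixel utile of a 32bpp surface back into the GPU buffer.  As in the
 * sketch above, tiled_utile is assumed to already point at the destination
 * utile; the per-driver tiling code decides where each utile lives within a
 * surface.
 */
static inline void
example_store_32bpp_utile(void *tiled_utile,
                          void *linear, uint32_t linear_stride)
{
        /* A 32bpp utile is 4x4 pixels, so each utile row is 16 bytes. */
        v3d_store_utile(tiled_utile, 16, linear, linear_stride);
}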