panfrost: Make sure the shader descriptor is in sync with the GL state
[mesa.git] / src/gallium/drivers/panfrost/pan_instancing.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include "pan_bo.h"
#include "pan_context.h"
/* See mali_job for notes on how this works. But basically, for small vertex
 * counts, we have a lookup table, and for large vertex counts, we look at the
 * high bits as a heuristic. This has to match exactly how the hardware
 * calculates this (which is why the algorithm is so weird) or else instancing
 * will break. */
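
/* For example (illustrative): a padded count of 12 is encoded as odd = 3,
 * shift = 2 (3 << 2). Only small odd factors appear in practice (1 through 9,
 * per the tables below), which is why e.g. a draw of 17 vertices pads up to
 * 18 = 9 << 1 rather than using 17 directly. */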

/* Given an odd number (of the form 2k + 1), compute k */
#define ODD(odd) ((odd - 1) >> 1)

/* Given the shift/odd pair, recover the original padded integer */

unsigned
pan_expand_shift_odd(struct pan_shift_odd o)
{
        unsigned odd = 2*o.odd + 1;
        unsigned shift = 1 << o.shift;
        return odd * shift;
}
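
/* A quick sanity check (illustrative only, not part of the driver):
 *
 *    struct pan_shift_odd ex = { .shift = 2, .odd = 1 };
 *    assert(pan_expand_shift_odd(ex) == 12);
 *
 * since the stored odd = 1 expands to 2*1 + 1 = 3, and 3 << 2 = 12. */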

static inline struct pan_shift_odd
pan_factored(unsigned pot, unsigned odd)
{
        struct pan_shift_odd out;

        assert(util_is_power_of_two_or_zero(pot));
        assert(odd & 1);

        /* Odd is of the form (2k + 1) = (k << 1) + 1 = (k << 1) | 1.
         *
         * So (odd >> 1) = ((k << 1) | 1) >> 1 = ((k << 1) >> 1) | (1 >> 1)
         *               = k | 0 = k */

        out.odd = (odd >> 1);

        /* POT is of the form (1 << shift) */
        out.shift = __builtin_ctz(pot);

        return out;
}
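
/* For instance (illustrative), pan_factored(4, 3) yields
 * { .shift = 2, .odd = 1 }, which pan_expand_shift_odd() maps back to
 * 3 << 2 = 12, so the two helpers round-trip. */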


/* For small vertex counts, we use a lookup table. The second argument of
 * panfrost_padded_vertex_count (below) is whether the primitive takes a
 * power-of-two argument, which determines how rounding works. True for POINTS
 * and LINES, false for TRIANGLES. Presumably true for QUADS but you'd be crazy
 * to try instanced quads on ES class hardware <3 */

static struct {
        unsigned pot;
        unsigned odd;
} small_lut[] = {
        {  0, 1 },
        {  1, 1 },
        {  2, 1 },
        {  1, 3 },
        {  4, 1 },
        {  1, 5 },
        {  2, 3 },
        {  1, 7 },
        {  8, 1 },
        {  1, 9 },
        {  2, 5 },
        {  4, 3 }, /* 11 */
        {  4, 3 },
        {  2, 7 }, /* 13 */
        {  2, 7 },
        { 16, 1 }, /* 15 */
        { 16, 1 },
        {  2, 9 },
        {  4, 5 }, /* 18 (pads to 20) */
        {  4, 5 }
};
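
/* The table is indexed directly by the vertex count, so e.g. (illustrative
 * reading) small_lut[11] = { 4, 3 } means a count of 11 pads up to
 * 4 * 3 = 12, the next value of the form odd << shift (odd at most 9) the
 * hardware can represent. */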

static struct pan_shift_odd
panfrost_small_padded_vertex_count(unsigned idx)
{
        return pan_factored(
                       small_lut[idx].pot,
                       small_lut[idx].odd);
}

static struct pan_shift_odd
panfrost_large_padded_vertex_count(uint32_t vertex_count)
{
        struct pan_shift_odd out = { 0 };

        /* First, we have to find the highest set bit */
        unsigned highest = 32 - __builtin_clz(vertex_count);

        /* Using that, we extract the top 4 bits */
        unsigned n = highest - 4;
        unsigned nibble = (vertex_count >> n) & 0xF;

        /* Great, we have the nibble. Now we can just try possibilities. Note
         * that we don't care about the bottommost bit in most cases, and we
         * know the top bit must be 1 */

        unsigned middle_two = (nibble >> 1) & 0x3;

        switch (middle_two) {
        case 0b00:
                if (!(nibble & 1))
                        return pan_factored(1 << n, 9);
                else
                        return pan_factored(1 << (n + 1), 5);
        case 0b01:
                return pan_factored(1 << (n + 2), 3);
        case 0b10:
                return pan_factored(1 << (n + 1), 7);
        case 0b11:
                return pan_factored(1 << (n + 4), 1);
        default:
                unreachable("Invalid two bits");
        }

        return out;
}
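
/* Worked example (illustrative): vertex_count = 161 = 0b10100001 has
 * highest = 8, so n = 4 and nibble = 0b1010. middle_two is then 0b01,
 * selecting pan_factored(1 << 6, 3): a padded count of 3 * 64 = 192, the
 * next representable value at or above 161. Note the nibble = 9 case must
 * pad up to 10 << n (not 9 << n), since the low n bits may be nonzero. */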

struct pan_shift_odd
panfrost_padded_vertex_count(
        unsigned vertex_count,
        bool pot)
{
        assert(vertex_count > 0);

        if (vertex_count < 20) {
                /* Add an off-by-one if it won't align naturally (quirk of the hardware) */
                //if (!pot)
                //        vertex_count++;

                return panfrost_small_padded_vertex_count(vertex_count);
        } else
                return panfrost_large_padded_vertex_count(vertex_count);
}
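
/* Typical use (illustrative sketch, mirroring the draw path):
 *
 *    struct pan_shift_odd so = panfrost_padded_vertex_count(count, pot);
 *    unsigned padded = pan_expand_shift_odd(so);
 *
 * e.g. count = 17 yields { .shift = 1, .odd = 4 }, i.e. padded = 18. */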

/* The much, much more irritating case -- instancing is enabled. See
 * panfrost_job.h for notes on how this works */

static unsigned
panfrost_vertex_instanced(
        struct panfrost_batch *batch,
        struct panfrost_resource *rsrc,
        unsigned divisor,
        union mali_attr *attrs,
        mali_ptr addr,
        unsigned vertex_count,
        unsigned instance_count)
{
        /* First, grab the padded vertex count */

        struct pan_shift_odd o = {
                .shift = batch->ctx->payloads[PIPE_SHADER_FRAGMENT].instance_shift,
                .odd = batch->ctx->payloads[PIPE_SHADER_FRAGMENT].instance_odd,
        };

        unsigned padded_count = batch->ctx->padded_count;

        /* Depending on whether there is an instance divisor, the packing
         * varies. When there is a divisor, the hardware-level divisor is
         * actually the product of the instance divisor and the padded count */

        unsigned hw_divisor = padded_count * divisor;
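
        /* For example (illustrative): a padded count of 16 with an instance
         * divisor of 2 gives hw_divisor = 32, a power of two, which can take
         * the cheap shift-only path below. */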

        if (divisor == 0) {
                /* Per-vertex attributes use the MODULO mode. First, compute
                 * the modulus */

                attrs->elements |= MALI_ATTR_MODULO;
                attrs->shift = o.shift;
                attrs->extra_flags = o.odd;

                return 1;
        } else if (util_is_power_of_two_or_zero(hw_divisor)) {
                /* If there is a divisor but the hardware divisor works out to
                 * a power of two (not terribly exceptional), we can use an
                 * easy path (just shifting) */

                attrs->elements |= MALI_ATTR_POT_DIVIDE;
                attrs->shift = __builtin_ctz(hw_divisor);

                return 1;
        } else {
                /* We have a NPOT divisor. Here's the fun one (multiplying by
                 * the inverse and shifting) */

                /* floor(log2(d)) */
                unsigned shift = util_logbase2(hw_divisor);

                /* m = ceil(2^(32 + shift) / d) */
                uint64_t shift_hi = 32 + shift;
                uint64_t t = 1ll << shift_hi;
                double t_f = t;
                double hw_divisor_d = hw_divisor;
                double m_f = ceil(t_f / hw_divisor_d);
                unsigned m = m_f;

                /* Default case */
                uint32_t magic_divisor = m, extra_flags = 0;

                /* e = 2^(32 + shift) % d */
                uint64_t e = t % hw_divisor;

                /* Apply the round-down algorithm? e <= 2^shift? XXX: The blob
                 * seems to use a different condition */
                if (e <= (1ll << shift)) {
                        magic_divisor = m - 1;
                        extra_flags = 1;
                }

                /* Top flag implicitly set */
                assert(magic_divisor & (1u << 31));
                magic_divisor &= ~(1u << 31);
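
                /* Worked example (illustrative): hw_divisor = 6 gives
                 * shift = 2, so t = 2^34 and m = ceil(2^34 / 6) = 0xAAAAAAAB.
                 * Then e = 2^34 % 6 = 4 <= 2^2 = 4, so the round-down variant
                 * applies: magic_divisor becomes 0xAAAAAAAA with
                 * extra_flags = 1, and stripping the implicit top bit leaves
                 * 0x2AAAAAAA to be programmed. */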

                /* Upload to two different slots */

                attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE;
                attrs[0].shift = shift;
                attrs[0].extra_flags = extra_flags;

                attrs[1].unk = 0x20;
                attrs[1].magic_divisor = magic_divisor;
                attrs[1].zero = 0;
                attrs[1].divisor = divisor;

                return 2;
        }
}

void
panfrost_emit_vertex_data(struct panfrost_batch *batch)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_vertex_state *so = ctx->vertex;

        /* Staged mali_attr, and index into them. i =/= k, depending on the
         * vertex buffer mask and instancing. Twice as much room is allocated,
         * for a worst case of NPOT_DIVIDEs, which each take up an extra slot */
        union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
        unsigned k = 0;

        unsigned vertex_count = ctx->vertex_count;
        unsigned instanced_count = ctx->instance_count;

        for (unsigned i = 0; i < so->num_elements; ++i) {
                /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
                 * means duplicating some vertex buffers (who cares? aside from
                 * maybe some caching implications but I somehow doubt that
                 * matters) */

                struct pipe_vertex_element *elem = &so->pipe[i];
                unsigned vbi = elem->vertex_buffer_index;

                /* The exception to the 1:1 mapping is that we can have
                 * multiple entries (NPOT divisors), so we fix up anyway */

                so->hw[i].index = k;

                if (!(ctx->vb_mask & (1 << vbi))) continue;

                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
                struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource);

                if (!rsrc) continue;

                /* Align to 64 bytes by masking off the lower bits. This
                 * will be adjusted back when we fix up the src_offset in
                 * mali_attr_meta */

                mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
                mali_ptr addr = raw_addr & ~63;
                unsigned chopped_addr = raw_addr - addr;
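
                /* Illustrative example: a raw_addr ending in 0x1234 is split
                 * into addr ending in 0x1200 plus chopped_addr = 0x34; the
                 * 0x34 is re-applied later via src_offset in the matching
                 * mali_attr_meta. */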

                /* Add a dependency of the batch on the vertex buffer */
                panfrost_batch_add_bo(batch, rsrc->bo,
                                      PAN_BO_ACCESS_SHARED |
                                      PAN_BO_ACCESS_READ |
                                      PAN_BO_ACCESS_VERTEX_TILER);

                /* Set common fields */
                attrs[k].elements = addr;
                attrs[k].stride = buf->stride;

                /* Since we advanced the base pointer, we shrink the buffer
                 * size accordingly */
                attrs[k].size = rsrc->base.width0 - buf->buffer_offset;

                /* We need to add back the extra size we masked off (for
                 * correctness) so the data doesn't get clamped away */
                attrs[k].size += chopped_addr;

                /* For the non-instanced case, make sure we initialize */
                attrs[k].shift = attrs[k].extra_flags = 0;

                /* Instancing uses a dramatically different code path than
                 * linear, so dispatch to the actual emission now that the
                 * common code is finished */

                unsigned divisor = elem->instance_divisor;

                if (divisor && instanced_count == 1) {
                        /* Silly corner case where there's a divisor but only
                         * one instance, so there's no legitimate instancing.
                         * We want *every* attribute to be the same, so set
                         * stride to zero so we don't go anywhere. */

                        attrs[k].size = attrs[k].stride + chopped_addr;
                        attrs[k].stride = 0;
                        attrs[k++].elements |= MALI_ATTR_LINEAR;
                } else if (instanced_count <= 1) {
                        /* Normal, non-instanced attributes */
                        attrs[k++].elements |= MALI_ATTR_LINEAR;
                } else {
                        k += panfrost_vertex_instanced(
                                     batch, rsrc, divisor, &attrs[k], addr,
                                     vertex_count, instanced_count);
                }
        }

        /* Upload whatever we emitted and go */

        ctx->payloads[PIPE_SHADER_VERTEX].postfix.attributes =
                panfrost_upload_transient(batch, attrs,
                                          k * sizeof(union mali_attr));
}