/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "pan_context.h"

/* See mali_job for notes on how this works. But basically, for small vertex
 * counts, we have a lookup table, and for large vertex counts, we look at the
 * high bits as a heuristic. This has to match exactly how the hardware
 * calculates this (which is why the algorithm is so weird) or else instancing
 * will break in subtle, irritating ways */

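/* The padded count is encoded as a (shift, odd) pair: for example, a padded
 * count of 18 = 2 * 9 is stored as shift = 1 and odd = ODD(9) = 4, while
 * 1024 = 1 << 10 is stored as shift = 10 and odd = ODD(1) = 0 */
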
/* Given an odd number (of the form 2k + 1), compute k */
#define ODD(odd) ((odd - 1) >> 1)
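/* e.g. ODD(7) == 3, since 7 == 2*3 + 1 */
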
/* Given the shift/odd pair, recover the original padded integer */

unsigned
pan_expand_shift_odd(struct pan_shift_odd o)
{
        unsigned odd = 2*o.odd + 1;
        unsigned shift = 1 << o.shift;

        return odd * shift;
}

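/* Round-trip sketch: pan_factored(8, 3) below gives { .shift = 3, .odd = 1 },
 * and pan_expand_shift_odd recovers (2*1 + 1) * (1 << 3) = 24 */
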
static inline struct pan_shift_odd
pan_factored(unsigned pot, unsigned odd)
{
        struct pan_shift_odd out;

        assert(util_is_power_of_two_or_zero(pot));
        assert(odd & 1);

        /* Odd is of the form (2k + 1) = (k << 1) + 1 = (k << 1) | 1.
         *
         * So (odd >> 1) = ((k << 1) | 1) >> 1 = ((k << 1) >> 1) | (1 >> 1)
         *               = k | 0
         *               = k */

        out.odd = (odd >> 1);

        /* POT is the form (1 << shift) */
        out.shift = __builtin_ctz(pot);

        return out;
}

/* For small vertices. Second argument is whether the primitive takes a
 * power-of-two argument, which determines how rounding works. True for POINTS
 * and LINES, false for TRIANGLES. Presumably true for QUADS but you'd be crazy
 * to try instanced quads on ES class hardware <3 */

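/* Lookup table for the small cases. Entry i factors the padded count for i
 * vertices into its pot/odd parts, consistent with the expansion rule above.
 * These values are a plausible reconstruction, assuming odd counts above 9
 * round up to the next even count; they must match the hardware exactly */

static struct {
        unsigned pot;
        unsigned odd;
} small_lut[] = {
        {  0, 1 }, /* index 0 unused: vertex_count > 0 */
        {  1, 1 },
        {  2, 1 },
        {  1, 3 },
        {  4, 1 },
        {  1, 5 },
        {  2, 3 },
        {  1, 7 },
        {  8, 1 },
        {  1, 9 },
        {  2, 5 },
        {  4, 3 }, /* 11 pads to 12 */
        {  4, 3 },
        {  2, 7 }, /* 13 pads to 14 */
        {  2, 7 },
        { 16, 1 }, /* 15 pads to 16 */
        { 16, 1 },
        {  2, 9 }, /* 17 pads to 18 */
        {  2, 9 },
        {  4, 5 }  /* 19 pads to 20 */
};
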
static struct pan_shift_odd
panfrost_small_padded_vertex_count(unsigned idx)
{
        return pan_factored(small_lut[idx].pot, small_lut[idx].odd);
}

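/* With the table above, small_lut[11] = { 4, 3 } encodes to
 * { .shift = 2, .odd = 1 }, which pan_expand_shift_odd expands back to
 * (2*1 + 1) * (1 << 2) = 12; that is, 11 vertices pad to 12 */
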
static struct pan_shift_odd
panfrost_large_padded_vertex_count(uint32_t vertex_count)
{
        struct pan_shift_odd out = { 0 };

        /* First, we have to find the highest set bit */
        unsigned highest = 32 - __builtin_clz(vertex_count);

        /* Using that, we extract the highest 4 bits */
        unsigned n = highest - 4;
        unsigned nibble = (vertex_count >> n) & 0xF;

        /* Great, we have the nibble. Now we can just try possibilities. Note
         * that we don't care about the bottommost bit in most cases, and we
         * know the top bit must be 1 */

        unsigned middle_two = (nibble >> 1) & 0x3;

        switch (middle_two) {
        case 0b00:
                if (!(nibble & 1))
                        return pan_factored(1 << n, 9);
                else
                        return pan_factored(1 << (n + 1), 5);
        case 0b01:
                return pan_factored(1 << (n + 2), 3);
        case 0b10:
                return pan_factored(1 << (n + 1), 7);
        case 0b11:
                return pan_factored(1 << (n + 4), 1);
        default:
                unreachable("Invalid two bits");
        }

        return out;
}

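/* Worked example: vertex_count = 1000 has highest = 32 - clz(1000) = 10,
 * so n = 6 and nibble = (1000 >> 6) & 0xF = 0b1111. middle_two = 0b11,
 * giving pan_factored(1 << 10, 1): 1000 vertices pad to 1024 */
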
struct pan_shift_odd
panfrost_padded_vertex_count(
                unsigned vertex_count,
                bool pot)
{
        assert(vertex_count > 0);

        if (vertex_count < 20) {
                /* Add an off-by-one if it won't align naturally (quirk of the hardware) */
                return panfrost_small_padded_vertex_count(vertex_count);
        } else {
                return panfrost_large_padded_vertex_count(vertex_count);
        }
}

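/* Usage sketch, with pot = false as for TRIANGLES:
 * pan_expand_shift_odd(panfrost_padded_vertex_count(1000, false)) == 1024,
 * via the large-count heuristic above */
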
/* The much, much more irritating case -- instancing is enabled. See
 * panfrost_job.h for notes on how this works */

static unsigned
panfrost_vertex_instanced(
                struct panfrost_batch *batch,
                struct panfrost_resource *rsrc,
                unsigned divisor,
                union mali_attr *attrs,
                mali_ptr addr,
                unsigned vertex_count,
                unsigned instance_count)
{
        /* First, grab the padded vertex count */

        struct pan_shift_odd o = {
                .shift = batch->ctx->payloads[PIPE_SHADER_FRAGMENT].instance_shift,
                .odd = batch->ctx->payloads[PIPE_SHADER_FRAGMENT].instance_odd,
        };

        unsigned padded_count = batch->ctx->padded_count;

        /* Depending on whether there is an instance divisor or not, packing
         * varies. When there is a divisor, the hardware-level divisor is
         * actually the product of the instance divisor and the padded count */

        unsigned hw_divisor = padded_count * divisor;

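        /* e.g. padded_count = 32 with divisor = 3 gives hw_divisor = 96: the
         * attribute advances to its next element once every 96 padded vertex
         * IDs, i.e. once every 3 instances */
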
        if (divisor == 0) {
                /* Per-vertex attributes use the MODULO mode. First, compute
                 * the modulus */

                attrs->elements |= MALI_ATTR_MODULO;
                attrs->shift = o.shift;
                attrs->extra_flags = o.odd;

                return 1;
        } else if (util_is_power_of_two_or_zero(hw_divisor)) {
                /* If there is a divisor but the hardware divisor works out to
                 * a power of two (not terribly exceptional), we can use an
                 * easy path (just shifting) */

                attrs->elements |= MALI_ATTR_POT_DIVIDE;
                attrs->shift = __builtin_ctz(hw_divisor);

                return 1;
        } else {
                /* We have a NPOT divisor. Here's the fun one (multiplying by
                 * the inverse and shifting) */

                unsigned shift = util_logbase2(hw_divisor);

                /* m = ceil(2^(32 + shift) / d) */
                uint64_t shift_hi = 32 + shift;
                uint64_t t = 1ll << shift_hi;
                double t_f = t;
                double hw_divisor_d = hw_divisor;
                double m_f = ceil(t_f / hw_divisor_d);
                unsigned m = m_f;

                uint32_t magic_divisor = m, extra_flags = 0;

                /* e = 2^(shift + 32) % d */
                uint64_t e = t % hw_divisor;

                /* Apply the round-down algorithm when e <= 2^shift. XXX: The
                 * blob seems to use a different condition */
                if (e <= (1ll << shift)) {
                        magic_divisor = m - 1;
                        extra_flags = 1;
                }

                /* Top flag implicitly set */
                assert(magic_divisor & (1u << 31));
                magic_divisor &= ~(1u << 31);

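                /* Worked example, hw_divisor = 96: shift = 6, t = 2^38, and
                 * m = ceil(2^38 / 96) = 0xAAAAAAAB. e = 2^38 % 96 = 64, which
                 * is <= 2^6, so the round-down path fires: magic_divisor =
                 * 0xAAAAAAAA with extra_flags = 1. The top bit is set as
                 * asserted, leaving 0x2AAAAAAA to upload */
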
                /* Upload to two different slots */

                attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE;
                attrs[0].shift = shift;
                attrs[0].extra_flags = extra_flags;

                attrs[1].magic_divisor = magic_divisor;
                attrs[1].divisor = divisor;

                return 2;
        }
}

void
panfrost_emit_vertex_data(struct panfrost_batch *batch)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_vertex_state *so = ctx->vertex;

        /* Staged mali_attr, and index into them. i =/= k, depending on the
         * vertex buffer mask and instancing. Twice as much room is allocated,
         * for a worst case of NPOT_DIVIDEs which take up an extra slot */
        union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
        unsigned k = 0;

        unsigned vertex_count = ctx->vertex_count;
        unsigned instanced_count = ctx->instance_count;

        for (unsigned i = 0; i < so->num_elements; ++i) {
                /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
                 * means duplicating some vertex buffers (who cares? aside from
                 * maybe some caching implications but I somehow doubt that
                 * matters) */

                struct pipe_vertex_element *elem = &so->pipe[i];
                unsigned vbi = elem->vertex_buffer_index;

                /* The exception to 1:1 mapping is that we can have multiple
                 * entries (NPOT divisors), so we fixup anyways */

                if (!(ctx->vb_mask & (1 << vbi))) continue;

                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
                struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource);

                /* Align to 64 bytes by masking off the lower bits. This
                 * will be adjusted back when we fixup the src_offset in
                 * mali_attr_meta */

                mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
                mali_ptr addr = raw_addr & ~63;
                unsigned chopped_addr = raw_addr - addr;

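                /* e.g. raw_addr = 0x10000054 yields addr = 0x10000040 and
                 * chopped_addr = 0x14; those 20 bytes are restored via the
                 * src_offset fixup mentioned above */
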
                /* Add a dependency of the batch on the vertex buffer */
                panfrost_batch_add_bo(batch, rsrc->bo,
                                      PAN_BO_ACCESS_SHARED |
                                      PAN_BO_ACCESS_READ |
                                      PAN_BO_ACCESS_VERTEX_TILER);

                /* Set common fields */
                attrs[k].elements = addr;
                attrs[k].stride = buf->stride;

                /* Since we advanced the base pointer, we shrink the buffer
                 * size */
                attrs[k].size = rsrc->base.width0 - buf->buffer_offset;

                /* We need to add the extra size we masked off (for
                 * correctness) so the data doesn't get clamped away */
                attrs[k].size += chopped_addr;

                /* For non-instancing make sure we initialize */
                attrs[k].shift = attrs[k].extra_flags = 0;

                /* Instancing uses a dramatically different code path than
                 * linear, so dispatch for the actual emission now that the
                 * common code is finished */

                unsigned divisor = elem->instance_divisor;

                if (divisor && instanced_count == 1) {
                        /* Silly corner case where there's a divisor(=1) but
                         * there's no legitimate instancing. So we want *every*
                         * attribute to be the same. So set stride to zero so
                         * we don't go anywhere. */

                        attrs[k].size = attrs[k].stride + chopped_addr;
                        attrs[k].stride = 0;
                        attrs[k++].elements |= MALI_ATTR_LINEAR;
                } else if (instanced_count <= 1) {
                        /* Normal, non-instanced attributes */
                        attrs[k++].elements |= MALI_ATTR_LINEAR;
                } else {
                        k += panfrost_vertex_instanced(
                                     batch, rsrc, divisor, &attrs[k], addr,
                                     vertex_count, instanced_count);
                }
        }

        /* Upload whatever we emitted and go */

        ctx->payloads[PIPE_SHADER_VERTEX].postfix.attributes =
                panfrost_upload_transient(batch, attrs, k * sizeof(union mali_attr));
}