/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
25 #include "util/u_math.h"
26 #include "panfrost-job.h"
27 #include "pan_encoder.h"
29 /* This file handles attribute descriptors (mali_attr_meta). The
30 * bulk of the complexity is from instancing. See mali_job for
31 * notes on how this works. But basically, for small vertex
32 * counts, we have a lookup table, and for large vertex counts,
33 * we look at the high bits as a heuristic. This has to match
34 * exactly how the hardware calculates this (which is why the
35 * algorithm is so weird) or else instancing will break. */
37 /* Given an odd number (of the form 2k + 1), compute k */
38 #define ODD(odd) ((odd - 1) >> 1)
/* Compute the hardware-padded vertex count for small counts (< 20) via the
 * lookup rule the hardware uses: the odd values 11, 13, 15 and 19 round up
 * to the next even number; every other small count is used unchanged. */
static unsigned
panfrost_small_padded_vertex_count(unsigned idx)
{
        if (idx == 11 || idx == 13 || idx == 15 || idx == 19)
                return idx + 1;
        else
                return idx;
}
/* Compute the hardware-padded vertex count for large counts (>= 20). The
 * hardware looks only at the top 4 bits of the count and rounds up to a
 * value of the form 2^n * {3, 5, 7, 9} or a power of two, depending on
 * those bits. This must match the hardware bit-for-bit or instancing
 * breaks, hence the unusual shape of the algorithm. */
static unsigned
panfrost_large_padded_vertex_count(uint32_t vertex_count)
{
        /* First, we have to find the highest set one */
        unsigned highest = 32 - __builtin_clz(vertex_count);

        /* Using that, we mask out the highest 4-bits */
        unsigned n = highest - 4;
        unsigned nibble = (vertex_count >> n) & 0xF;

        /* Great, we have the nibble. Now we can just try possibilities. Note
         * that we don't care about the bottom most bit in most cases, and we
         * know the top bit must be 1 */

        unsigned middle_two = (nibble >> 1) & 0x3;

        switch (middle_two) {
        case 0: /* nibble 100x */
                if (!(nibble & 1))
                        return (1 << n) * 9;
                else
                        return (1 << (n + 1)) * 5;
        case 1: /* nibble 101x */
                return (1 << (n + 2)) * 3;
        case 2: /* nibble 110x */
                return (1 << (n + 1)) * 7;
        case 3: /* nibble 111x */
                return (1 << (n + 4));
        default:
                return 0; /* unreachable */
        }
}
/* Compute the count the hardware pads a vertex count up to for instancing:
 * a lookup table for small counts, and the high-bits heuristic for large
 * ones (see the file comment). */
unsigned
panfrost_padded_vertex_count(unsigned vertex_count)
{
        if (vertex_count < 20)
                return panfrost_small_padded_vertex_count(vertex_count);
        else
                return panfrost_large_padded_vertex_count(vertex_count);
}
91 /* The much, much more irritating case -- instancing is enabled. See
92 * panfrost_job.h for notes on how this works */
/* Compute the "magic divisor" the hardware uses to divide by a non-power-of-two
 * divisor via multiply-by-inverse-and-shift. Returns the 31-bit magic constant
 * (top bit stripped, since the hardware implies it); writes the shift amount to
 * *o_shift and sets *extra_flags to 1 when the round-down variant applies.
 * hw_divisor must be a NPOT value > 1 (so the top bit of the magic is set). */
unsigned
panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags)
{
        /* We have a NPOT divisor. Here's the fun one (multipling by
         * the inverse and shifting) */

        /* shift = floor(log2(d)) */
        unsigned shift = util_logbase2(hw_divisor);

        /* m = ceil(2^(32 + shift) / d) */
        uint64_t shift_hi = 32 + shift;
        uint64_t t = 1ll << shift_hi;
        double t_f = t;
        double hw_divisor_d = hw_divisor;
        double m_f = ceil(t_f / hw_divisor_d);
        unsigned m = m_f;

        uint32_t magic_divisor = m;

        /* e = 2^(shift + 32) % d */
        uint64_t e = t % hw_divisor;

        /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob
         * seems to use a different condition */
        if (e <= (1ll << shift)) {
                magic_divisor = m - 1;
                *extra_flags = 1;
        }

        /* Top flag implicitly set */
        assert(magic_divisor & (1u << 31));
        magic_divisor &= ~(1u << 31);

        *o_shift = shift;

        return magic_divisor;
}
133 panfrost_vertex_instanced(
134 unsigned padded_count
,
135 unsigned instance_shift
, unsigned instance_odd
,
137 union mali_attr
*attrs
)
139 /* Depending if there is an instance divisor or not, packing varies.
140 * When there is a divisor, the hardware-level divisor is actually the
141 * product of the instance divisor and the padded count */
143 unsigned hw_divisor
= padded_count
* divisor
;
146 /* Per-vertex attributes use the MODULO mode. First, compute
149 attrs
->elements
|= MALI_ATTR_MODULO
;
150 attrs
->shift
= instance_shift
;
151 attrs
->extra_flags
= instance_odd
;
154 } else if (util_is_power_of_two_or_zero(hw_divisor
)) {
155 /* If there is a divisor but the hardware divisor works out to
156 * a power of two (not terribly exceptional), we can use an
157 * easy path (just shifting) */
159 attrs
->elements
|= MALI_ATTR_POT_DIVIDE
;
160 attrs
->shift
= __builtin_ctz(hw_divisor
);
164 unsigned shift
= 0, extra_flags
= 0;
166 attrs
[1].magic_divisor
=
167 panfrost_compute_magic_divisor(hw_divisor
, &shift
, &extra_flags
);
169 /* Upload to two different slots */
171 attrs
[0].elements
|= MALI_ATTR_NPOT_DIVIDE
;
172 attrs
[0].shift
= shift
;
173 attrs
[0].extra_flags
= extra_flags
;
177 attrs
[1].divisor
= divisor
;
183 /* Records for gl_VertexID and gl_InstanceID use a slightly special encoding,
184 * but the idea is the same */
188 unsigned padded_count
,
189 union mali_attr
*attr
)
191 /* We factor the padded count as shift/odd and that's it */
193 attr
->elements
= MALI_ATTR_VERTEXID
;
194 attr
->shift
= __builtin_ctz(padded_count
);
195 attr
->extra_flags
= padded_count
>> (attr
->shift
+ 1);
196 attr
->stride
= attr
->size
= 0;
200 panfrost_instance_id(
201 unsigned padded_count
,
202 union mali_attr
*attr
)
204 attr
->elements
= MALI_ATTR_INSTANCEID
;
206 attr
->extra_flags
= 0;
209 /* POT records have just a shift directly with an off-by-one for
210 * unclear reasons. NPOT records have a magic divisor smushed into the
211 * stride field (which is unused for these special records) */
213 if (util_is_power_of_two_or_zero(padded_count
)) {
214 attr
->shift
= __builtin_ctz(padded_count
) - 1;
216 unsigned shift
= 0, flags
= 0;
218 attr
->stride
= panfrost_compute_magic_divisor(padded_count
, &shift
, &flags
);
220 attr
->extra_flags
= flags
;