panfrost: Inline panfrost_vertex_instanced
[mesa.git] / src / panfrost / lib / pan_attributes.c
1 /*
2 * Copyright (C) 2019 Collabora, Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 */
24
25 #include "util/u_math.h"
26 #include "midgard_pack.h"
27 #include "pan_encoder.h"
28
29 /* This file handles attribute descriptors. The
30 * bulk of the complexity is from instancing. See mali_job for
31 * notes on how this works. But basically, for small vertex
32 * counts, we have a lookup table, and for large vertex counts,
33 * we look at the high bits as a heuristic. This has to match
34 * exactly how the hardware calculates this (which is why the
35 * algorithm is so weird) or else instancing will break. */
36
37 /* Given an odd number (of the form 2k + 1), compute k */
38 #define ODD(odd) ((odd - 1) >> 1)
39
/* Pads a small (< 20) vertex count to the value the hardware expects.
 * Only a handful of counts (11, 13, 15, 19) get bumped up by one; every
 * other small count is already valid as-is. Must match the hardware's
 * table exactly or instancing breaks. */

static unsigned
panfrost_small_padded_vertex_count(unsigned idx)
{
        switch (idx) {
        case 11:
        case 13:
        case 15:
        case 19:
                return idx + 1;
        default:
                return idx;
        }
}
48
/* Pads a large (>= 20) vertex count the way the hardware does: look at
 * the top four bits of the count and round up to the nearest value of
 * one of the shapes 9*2^s, 5*2^(s+1), 3*2^(s+2), 7*2^(s+1), or 2^(s+4).
 * vertex_count must be nonzero — callers guarantee >= 20 — since
 * __builtin_clz(0) is undefined. Must match the hardware's calculation
 * exactly or instancing breaks. */

static unsigned
panfrost_large_padded_vertex_count(uint32_t vertex_count)
{
        /* Bit position just past the most significant set bit */
        unsigned msb = 32 - __builtin_clz(vertex_count);

        /* Grab the top nibble; its leading bit is 1 by construction */
        unsigned s = msb - 4;
        unsigned top = (vertex_count >> s) & 0xF;

        /* Dispatch on the two bits below the leading one. The lowest
         * nibble bit only matters in the 0b00 case. */
        unsigned mid = (top >> 1) & 0x3;

        if (mid == 0)
                return (top & 1) ? (1 << (s + 1)) * 5 : (1 << s) * 9;
        else if (mid == 1)
                return (1 << (s + 2)) * 3;
        else if (mid == 2)
                return (1 << (s + 1)) * 7;
        else
                return 1 << (s + 4);
}
81
/* Pads a vertex count so instanced attribute addressing matches the
 * hardware: small counts use the exact table, large counts use the
 * top-nibble heuristic. */

unsigned
panfrost_padded_vertex_count(unsigned vertex_count)
{
        return (vertex_count < 20)
                ? panfrost_small_padded_vertex_count(vertex_count)
                : panfrost_large_padded_vertex_count(vertex_count);
}
90
91 /* The much, much more irritating case -- instancing is enabled. See
92 * panfrost_job.h for notes on how this works */
93
unsigned
panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags)
{
        /* For a NPOT divisor d, the hardware divides by multiplying by a
         * precomputed "magic" reciprocal and shifting:
         *
         *    shift = floor(log2(d))
         *    m     = ceil(2^(32 + shift) / d)
         *
         * hw_divisor must be NPOT (callers special-case power-of-two
         * divisors with a plain shift — for a POT, m would be exactly 2^32
         * and not fit in 32 bits).
         *
         * Returns the magic divisor with the implicit top bit stripped,
         * writes the shift to *o_shift, and sets *extra_flags to 1 when the
         * round-down form (m - 1) is selected. *extra_flags is left
         * untouched otherwise, so callers should zero-initialize it. */

        /* floor(log2(d)) */
        unsigned shift = util_logbase2(hw_divisor);

        /* m = ceil(2^(32 + shift) / d), computed exactly in integers.
         * Doing this as ceil((double)t / d) is subtly wrong: the quotient
         * is ~2^32, where adjacent doubles are 2^-20 apart, so a true
         * quotient a hair above an integer can round down to it before the
         * ceil, leaving m off by one. shift <= 31 so t <= 2^63 and the
         * uint64_t arithmetic cannot overflow. */
        uint64_t t = 1ull << (32 + shift);

        /* e = 2^(shift + 32) % d */
        uint64_t e = t % hw_divisor;

        uint32_t m = (uint32_t)(t / hw_divisor + (e != 0));

        /* Default case */
        uint32_t magic_divisor = m;

        /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob
         * seems to use a different condition */
        if (e <= (1ull << shift)) {
                magic_divisor = m - 1;
                *extra_flags = 1;
        }

        /* The top bit of the magic value is always set for an in-range NPOT
         * divisor, so the hardware encodes it implicitly; strip it */
        assert(magic_divisor & (1u << 31));
        magic_divisor &= ~(1u << 31);
        *o_shift = shift;

        return magic_divisor;
}
131
132 /* Records for gl_VertexID and gl_InstanceID use a slightly special encoding,
133 * but the idea is the same */
134
135 void
136 panfrost_vertex_id(
137 unsigned padded_count,
138 union mali_attr *attr)
139 {
140 /* We factor the padded count as shift/odd and that's it */
141
142 attr->elements = MALI_ATTR_VERTEXID;
143 attr->shift = __builtin_ctz(padded_count);
144 attr->extra_flags = padded_count >> (attr->shift + 1);
145 attr->stride = attr->size = 0;
146 }
147
148 void
149 panfrost_instance_id(
150 unsigned padded_count,
151 union mali_attr *attr)
152 {
153 attr->elements = MALI_ATTR_INSTANCEID;
154 attr->stride = 0;
155 attr->extra_flags = 0;
156 attr->size = 0;
157
158 /* POT records have just a shift directly with an off-by-one for
159 * unclear reasons. NPOT records have a magic divisor smushed into the
160 * stride field (which is unused for these special records) */
161
162 if (util_is_power_of_two_or_zero(padded_count)) {
163 attr->shift = __builtin_ctz(padded_count) - 1;
164 } else {
165 unsigned shift = 0, flags = 0;
166
167 attr->stride = panfrost_compute_magic_divisor(padded_count, &shift, &flags);
168 attr->shift = shift;
169 attr->extra_flags = flags;
170 }
171 }
172