/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
25 #include "util/u_math.h"
26 #include "panfrost-job.h"
27 #include "pan_encoder.h"
29 /* This file handles attribute descriptors (mali_attr_meta). The
30 * bulk of the complexity is from instancing. See mali_job for
31 * notes on how this works. But basically, for small vertex
32 * counts, we have a lookup table, and for large vertex counts,
33 * we look at the high bits as a heuristic. This has to match
34 * exactly how the hardware calculates this (which is why the
35 * algorithm is so weird) or else instancing will break. */
37 /* Given an odd number (of the form 2k + 1), compute k */
38 #define ODD(odd) ((odd - 1) >> 1)
/* Compute the hardware-padded vertex count for small counts (< 20) via the
 * lookup rule the hardware uses: the odd values 11, 13, 15 and 19 round up
 * to the next even number; every other small count is used unchanged. */
static unsigned
panfrost_small_padded_vertex_count(unsigned idx)
{
        if (idx == 11 || idx == 13 || idx == 15 || idx == 19)
                return idx + 1;
        else
                return idx;
}
/* Compute the hardware-padded vertex count for large counts (>= 20). The
 * hardware looks only at the top 4 bits of the count and rounds up to a
 * value of the form 2^n * {3, 5, 7, 9} or a power of two, depending on
 * those bits. This must match the hardware bit-for-bit or instancing
 * breaks, hence the unusual shape of the algorithm. */
static unsigned
panfrost_large_padded_vertex_count(uint32_t vertex_count)
{
        /* First, we have to find the highest set one */
        unsigned highest = 32 - __builtin_clz(vertex_count);

        /* Using that, we mask out the highest 4-bits */
        unsigned n = highest - 4;
        unsigned nibble = (vertex_count >> n) & 0xF;

        /* Great, we have the nibble. Now we can just try possibilities. Note
         * that we don't care about the bottom most bit in most cases, and we
         * know the top bit must be 1 */

        unsigned middle_two = (nibble >> 1) & 0x3;

        switch (middle_two) {
        case 0: /* nibble 100x */
                if (!(nibble & 1))
                        return (1 << n) * 9;
                else
                        return (1 << (n + 1)) * 5;
        case 1: /* nibble 101x */
                return (1 << (n + 2)) * 3;
        case 2: /* nibble 110x */
                return (1 << (n + 1)) * 7;
        case 3: /* nibble 111x */
                return (1 << (n + 4));
        default:
                return 0; /* unreachable */
        }
}
/* Compute the count the hardware pads a vertex count up to for instancing:
 * a lookup table for small counts, and the high-bits heuristic for large
 * ones (see the file comment). */
unsigned
panfrost_padded_vertex_count(unsigned vertex_count)
{
        if (vertex_count < 20)
                return panfrost_small_padded_vertex_count(vertex_count);
        else
                return panfrost_large_padded_vertex_count(vertex_count);
}
91 /* The much, much more irritating case -- instancing is enabled. See
92 * panfrost_job.h for notes on how this works */
/* Compute the "magic divisor" the hardware uses to divide by a non-power-of-two
 * divisor via multiply-by-inverse-and-shift. Returns the 31-bit magic constant
 * (top bit stripped, since the hardware implies it); writes the shift amount to
 * *o_shift and sets *extra_flags to 1 when the round-down variant applies.
 * hw_divisor must be a NPOT value > 1 (so the top bit of the magic is set). */
unsigned
panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags)
{
        /* We have a NPOT divisor. Here's the fun one (multipling by
         * the inverse and shifting) */

        /* shift = floor(log2(d)) */
        unsigned shift = util_logbase2(hw_divisor);

        /* m = ceil(2^(32 + shift) / d) */
        uint64_t shift_hi = 32 + shift;
        uint64_t t = 1ll << shift_hi;
        double t_f = t;
        double hw_divisor_d = hw_divisor;
        double m_f = ceil(t_f / hw_divisor_d);
        unsigned m = m_f;

        uint32_t magic_divisor = m;

        /* e = 2^(shift + 32) % d */
        uint64_t e = t % hw_divisor;

        /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob
         * seems to use a different condition */
        if (e <= (1ll << shift)) {
                magic_divisor = m - 1;
                *extra_flags = 1;
        }

        /* Top flag implicitly set */
        assert(magic_divisor & (1u << 31));
        magic_divisor &= ~(1u << 31);

        *o_shift = shift;

        return magic_divisor;
}
133 panfrost_vertex_instanced(
134 unsigned padded_count
,
135 unsigned instance_shift
, unsigned instance_odd
,
137 union mali_attr
*attrs
)
139 /* Depending if there is an instance divisor or not, packing varies.
140 * When there is a divisor, the hardware-level divisor is actually the
141 * product of the instance divisor and the padded count */
143 unsigned hw_divisor
= padded_count
* divisor
;
146 /* Per-vertex attributes use the MODULO mode. First, compute
149 attrs
->elements
|= MALI_ATTR_MODULO
;
150 attrs
->shift
= instance_shift
;
151 attrs
->extra_flags
= instance_odd
;
154 } else if (util_is_power_of_two_or_zero(hw_divisor
)) {
155 /* If there is a divisor but the hardware divisor works out to
156 * a power of two (not terribly exceptional), we can use an
157 * easy path (just shifting) */
159 attrs
->elements
|= MALI_ATTR_POT_DIVIDE
;
160 attrs
->shift
= __builtin_ctz(hw_divisor
);
164 unsigned shift
= 0, extra_flags
= 0;
166 attrs
[1].magic_divisor
=
167 panfrost_compute_magic_divisor(hw_divisor
, &shift
, &extra_flags
);
169 /* Upload to two different slots */
171 attrs
[0].elements
|= MALI_ATTR_NPOT_DIVIDE
;
172 attrs
[0].shift
= shift
;
173 attrs
[0].extra_flags
= extra_flags
;
177 attrs
[1].divisor
= divisor
;
183 /* Records for gl_VertexID and gl_InstanceID use a slightly special encoding,
184 * but the idea is the same */
188 unsigned padded_count
,
189 union mali_attr
*attr
)
191 /* We factor the padded count as shift/odd and that's it */
193 attr
->elements
= MALI_ATTR_VERTEXID
;
194 attr
->shift
= __builtin_ctz(padded_count
);
195 attr
->extra_flags
= padded_count
>> (attr
->shift
+ 1);
196 attr
->stride
= attr
->size
= 0;
200 panfrost_instance_id(
201 unsigned padded_count
,
202 union mali_attr
*attr
)
204 attr
->elements
= MALI_ATTR_INSTANCEID
;
206 attr
->extra_flags
= 0;
209 /* POT records have just a shift directly with an off-by-one for
210 * unclear reasons. NPOT records have a magic divisor smushed into the
211 * stride field (which is unused for these special records) */
213 if (util_is_power_of_two_or_zero(padded_count
)) {
214 attr
->shift
= __builtin_ctz(padded_count
) - 1;
216 unsigned shift
= 0, flags
= 0;
218 attr
->stride
= panfrost_compute_magic_divisor(padded_count
, &shift
, &flags
);
220 attr
->extra_flags
= flags
;