panfrost: Make sure the shader descriptor is in sync with the GL state
[mesa.git] / src/gallium/drivers/panfrost/pan_instancing.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include "pan_bo.h"
#include "pan_context.h"
/* See mali_job for notes on how this works. But basically, for small vertex
 * counts, we have a lookup table, and for large vertex counts, we look at the
 * high bits as a heuristic. This has to match exactly how the hardware
 * calculates this (which is why the algorithm is so weird) or else instancing
 * will break. */
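
/* For example (illustrative): a padded count of 12 is encoded as odd = 3,
 * shift = 2 (3 << 2). Only small odd factors appear in practice (1 through 9,
 * per the tables below), which is why e.g. a draw of 17 vertices pads up to
 * 18 = 9 << 1 rather than using 17 directly. */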

/* Given an odd number (of the form 2k + 1), compute k */
#define ODD(odd) ((odd - 1) >> 1)

/* Given the shift/odd pair, recover the original padded integer */

unsigned
pan_expand_shift_odd(struct pan_shift_odd o)
{
        unsigned odd = 2*o.odd + 1;
        unsigned shift = 1 << o.shift;
        return odd * shift;
}
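
/* A quick sanity check (illustrative only, not part of the driver):
 *
 *    struct pan_shift_odd ex = { .shift = 2, .odd = 1 };
 *    assert(pan_expand_shift_odd(ex) == 12);
 *
 * since the stored odd = 1 expands to 2*1 + 1 = 3, and 3 << 2 = 12. */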

static inline struct pan_shift_odd
pan_factored(unsigned pot, unsigned odd)
{
        struct pan_shift_odd out;

        assert(util_is_power_of_two_or_zero(pot));
        assert(odd & 1);

        /* Odd is of the form (2k + 1) = (k << 1) + 1 = (k << 1) | 1.
         *
         * So (odd >> 1) = ((k << 1) | 1) >> 1 = ((k << 1) >> 1) | (1 >> 1)
         *               = k | 0 = k */

        out.odd = (odd >> 1);

        /* POT is of the form (1 << shift) */
        out.shift = __builtin_ctz(pot);

        return out;
}
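
/* For instance (illustrative), pan_factored(4, 3) yields
 * { .shift = 2, .odd = 1 }, which pan_expand_shift_odd() maps back to
 * 3 << 2 = 12, so the two helpers round-trip. */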


/* For small vertex counts, we use a lookup table. The second argument of
 * panfrost_padded_vertex_count (below) is whether the primitive takes a
 * power-of-two argument, which determines how rounding works. True for POINTS
 * and LINES, false for TRIANGLES. Presumably true for QUADS but you'd be crazy
 * to try instanced quads on ES class hardware <3 */

static struct {
        unsigned pot;
        unsigned odd;
} small_lut[] = {
        {  0, 1 },
        {  1, 1 },
        {  2, 1 },
        {  1, 3 },
        {  4, 1 },
        {  1, 5 },
        {  2, 3 },
        {  1, 7 },
        {  8, 1 },
        {  1, 9 },
        {  2, 5 },
        {  4, 3 }, /* 11 */
        {  4, 3 },
        {  2, 7 }, /* 13 */
        {  2, 7 },
        { 16, 1 }, /* 15 */
        { 16, 1 },
        {  2, 9 },
        {  4, 5 }, /* 18 (pads to 20) */
        {  4, 5 }
};
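
/* The table is indexed directly by the vertex count, so e.g. (illustrative
 * reading) small_lut[11] = { 4, 3 } means a count of 11 pads up to
 * 4 * 3 = 12, the next value of the form odd << shift (odd at most 9) the
 * hardware can represent. */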

static struct pan_shift_odd
panfrost_small_padded_vertex_count(unsigned idx)
{
        return pan_factored(
                       small_lut[idx].pot,
                       small_lut[idx].odd);
}

static struct pan_shift_odd
panfrost_large_padded_vertex_count(uint32_t vertex_count)
{
        struct pan_shift_odd out = { 0 };

        /* First, we have to find the highest set bit */
        unsigned highest = 32 - __builtin_clz(vertex_count);

        /* Using that, we extract the top 4 bits */
        unsigned n = highest - 4;
        unsigned nibble = (vertex_count >> n) & 0xF;

        /* Great, we have the nibble. Now we can just try possibilities. Note
         * that we don't care about the bottommost bit in most cases, and we
         * know the top bit must be 1 */

        unsigned middle_two = (nibble >> 1) & 0x3;

        switch (middle_two) {
        case 0b00:
                if (!(nibble & 1))
                        return pan_factored(1 << n, 9);
                else
                        return pan_factored(1 << (n + 1), 5);
        case 0b01:
                return pan_factored(1 << (n + 2), 3);
        case 0b10:
                return pan_factored(1 << (n + 1), 7);
        case 0b11:
                return pan_factored(1 << (n + 4), 1);
        default:
                unreachable("Invalid two bits");
        }

        return out;
}
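
/* Worked example (illustrative): vertex_count = 161 = 0b10100001 has
 * highest = 8, so n = 4 and nibble = 0b1010. middle_two is then 0b01,
 * selecting pan_factored(1 << 6, 3): a padded count of 3 * 64 = 192, the
 * next representable value at or above 161. Note the nibble = 9 case must
 * pad up to 10 << n (not 9 << n), since the low n bits may be nonzero. */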

struct pan_shift_odd
panfrost_padded_vertex_count(
        unsigned vertex_count,
        bool pot)
{
        assert(vertex_count > 0);

        if (vertex_count < 20) {
                /* Add an off-by-one if it won't align naturally (quirk of the hardware) */
                //if (!pot)
                //        vertex_count++;

                return panfrost_small_padded_vertex_count(vertex_count);
        } else
                return panfrost_large_padded_vertex_count(vertex_count);
}
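
/* Typical use (illustrative sketch, mirroring the draw path):
 *
 *    struct pan_shift_odd so = panfrost_padded_vertex_count(count, pot);
 *    unsigned padded = pan_expand_shift_odd(so);
 *
 * e.g. count = 17 yields { .shift = 1, .odd = 4 }, i.e. padded = 18. */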

/* The much, much more irritating case -- instancing is enabled. See
 * panfrost_job.h for notes on how this works */

static unsigned
panfrost_vertex_instanced(
        struct panfrost_batch *batch,
        struct panfrost_resource *rsrc,
        unsigned divisor,
        union mali_attr *attrs,
        mali_ptr addr,
        unsigned vertex_count,
        unsigned instance_count)
{
        /* First, grab the padded vertex count */

        struct pan_shift_odd o = {
                .shift = batch->ctx->payloads[PIPE_SHADER_FRAGMENT].instance_shift,
                .odd = batch->ctx->payloads[PIPE_SHADER_FRAGMENT].instance_odd,
        };

        unsigned padded_count = batch->ctx->padded_count;

        /* Depending on whether there is an instance divisor, the packing
         * varies. When there is a divisor, the hardware-level divisor is
         * actually the product of the instance divisor and the padded count */

        unsigned hw_divisor = padded_count * divisor;
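
        /* For example (illustrative): a padded count of 16 with an instance
         * divisor of 2 gives hw_divisor = 32, a power of two, which can take
         * the cheap shift-only path below. */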

        if (divisor == 0) {
                /* Per-vertex attributes use the MODULO mode. First, compute
                 * the modulus */

                attrs->elements |= MALI_ATTR_MODULO;
                attrs->shift = o.shift;
                attrs->extra_flags = o.odd;

                return 1;
        } else if (util_is_power_of_two_or_zero(hw_divisor)) {
                /* If there is a divisor but the hardware divisor works out to
                 * a power of two (not terribly exceptional), we can use an
                 * easy path (just shifting) */

                attrs->elements |= MALI_ATTR_POT_DIVIDE;
                attrs->shift = __builtin_ctz(hw_divisor);

                return 1;
        } else {
                /* We have a NPOT divisor. Here's the fun one (multiplying by
                 * the inverse and shifting) */

                /* floor(log2(d)) */
                unsigned shift = util_logbase2(hw_divisor);

                /* m = ceil(2^(32 + shift) / d) */
                uint64_t shift_hi = 32 + shift;
                uint64_t t = 1ll << shift_hi;
                double t_f = t;
                double hw_divisor_d = hw_divisor;
                double m_f = ceil(t_f / hw_divisor_d);
                unsigned m = m_f;

                /* Default case */
                uint32_t magic_divisor = m, extra_flags = 0;

                /* e = 2^(32 + shift) % d */
                uint64_t e = t % hw_divisor;

                /* Apply the round-down algorithm? e <= 2^shift? XXX: The blob
                 * seems to use a different condition */
                if (e <= (1ll << shift)) {
                        magic_divisor = m - 1;
                        extra_flags = 1;
                }

                /* Top flag implicitly set */
                assert(magic_divisor & (1u << 31));
                magic_divisor &= ~(1u << 31);
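
                /* Worked example (illustrative): hw_divisor = 6 gives
                 * shift = 2, so t = 2^34 and m = ceil(2^34 / 6) = 0xAAAAAAAB.
                 * Then e = 2^34 % 6 = 4 <= 2^2 = 4, so the round-down variant
                 * applies: magic_divisor becomes 0xAAAAAAAA with
                 * extra_flags = 1, and stripping the implicit top bit leaves
                 * 0x2AAAAAAA to be programmed. */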

                /* Upload to two different slots */

                attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE;
                attrs[0].shift = shift;
                attrs[0].extra_flags = extra_flags;

                attrs[1].unk = 0x20;
                attrs[1].magic_divisor = magic_divisor;
                attrs[1].zero = 0;
                attrs[1].divisor = divisor;

                return 2;
        }
}

void
panfrost_emit_vertex_data(struct panfrost_batch *batch)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_vertex_state *so = ctx->vertex;

        /* Staged mali_attr, and index into them. i =/= k, depending on the
         * vertex buffer mask and instancing. Twice as much room is allocated,
         * for a worst case of NPOT_DIVIDEs, which each take up an extra slot */
        union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
        unsigned k = 0;

        unsigned vertex_count = ctx->vertex_count;
        unsigned instanced_count = ctx->instance_count;

        for (unsigned i = 0; i < so->num_elements; ++i) {
                /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
                 * means duplicating some vertex buffers (who cares? aside from
                 * maybe some caching implications but I somehow doubt that
                 * matters) */

                struct pipe_vertex_element *elem = &so->pipe[i];
                unsigned vbi = elem->vertex_buffer_index;

                /* The exception to the 1:1 mapping is that we can have
                 * multiple entries (NPOT divisors), so we fix up anyway */

                so->hw[i].index = k;

                if (!(ctx->vb_mask & (1 << vbi))) continue;

                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
                struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource);

                if (!rsrc) continue;

                /* Align to 64 bytes by masking off the lower bits. This
                 * will be adjusted back when we fix up the src_offset in
                 * mali_attr_meta */

                mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
                mali_ptr addr = raw_addr & ~63;
                unsigned chopped_addr = raw_addr - addr;
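
                /* Illustrative example: a raw_addr ending in 0x1234 is split
                 * into addr ending in 0x1200 plus chopped_addr = 0x34; the
                 * 0x34 is re-applied later via src_offset in the matching
                 * mali_attr_meta. */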

                /* Add a dependency of the batch on the vertex buffer */
                panfrost_batch_add_bo(batch, rsrc->bo,
                                      PAN_BO_ACCESS_SHARED |
                                      PAN_BO_ACCESS_READ |
                                      PAN_BO_ACCESS_VERTEX_TILER);

                /* Set common fields */
                attrs[k].elements = addr;
                attrs[k].stride = buf->stride;

                /* Since we advanced the base pointer, we shrink the buffer
                 * size accordingly */
                attrs[k].size = rsrc->base.width0 - buf->buffer_offset;

                /* We need to add back the extra size we masked off (for
                 * correctness) so the data doesn't get clamped away */
                attrs[k].size += chopped_addr;

                /* For the non-instanced case, make sure we initialize */
                attrs[k].shift = attrs[k].extra_flags = 0;

                /* Instancing uses a dramatically different code path than
                 * linear, so dispatch to the actual emission now that the
                 * common code is finished */

                unsigned divisor = elem->instance_divisor;

                if (divisor && instanced_count == 1) {
                        /* Silly corner case where there's a divisor but only
                         * one instance, so there's no legitimate instancing.
                         * We want *every* attribute to be the same, so set
                         * stride to zero so we don't go anywhere. */

                        attrs[k].size = attrs[k].stride + chopped_addr;
                        attrs[k].stride = 0;
                        attrs[k++].elements |= MALI_ATTR_LINEAR;
                } else if (instanced_count <= 1) {
                        /* Normal, non-instanced attributes */
                        attrs[k++].elements |= MALI_ATTR_LINEAR;
                } else {
                        k += panfrost_vertex_instanced(
                                     batch, rsrc, divisor, &attrs[k], addr,
                                     vertex_count, instanced_count);
                }
        }

        /* Upload whatever we emitted and go */

        ctx->payloads[PIPE_SHADER_VERTEX].postfix.attributes =
                panfrost_upload_transient(batch, attrs,
                                          k * sizeof(union mali_attr));
}