[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28 #include "util/u_helpers.h"
29
30 #include "panfrost-quirks.h"
31
32 #include "pan_pool.h"
33 #include "pan_bo.h"
34 #include "pan_cmdstream.h"
35 #include "pan_context.h"
36 #include "pan_job.h"
37
38 /* If a BO is accessed for a particular shader stage, will it be in the primary
39 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
40 * fragment will be primary, e.g. compute jobs will be considered
41 * "vertex/tiler" by analogy */
42
43 static inline uint32_t
44 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
45 {
46 assert(stage == PIPE_SHADER_FRAGMENT ||
47 stage == PIPE_SHADER_VERTEX ||
48 stage == PIPE_SHADER_COMPUTE);
49
50 return stage == PIPE_SHADER_FRAGMENT ?
51 PAN_BO_ACCESS_FRAGMENT :
52 PAN_BO_ACCESS_VERTEX_TILER;
53 }
54
55 mali_ptr
56 panfrost_vt_emit_shared_memory(struct panfrost_batch *batch)
57 {
58 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
59
60 struct mali_shared_memory shared = {
61 .shared_workgroup_count = ~0,
62 };
63
64 if (batch->stack_size) {
65 struct panfrost_bo *stack =
66 panfrost_batch_get_scratchpad(batch, batch->stack_size,
67 dev->thread_tls_alloc,
68 dev->core_count);
69
70 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
71 shared.scratchpad = stack->gpu;
72 }
73
74 return panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
75 }
76
77 void
78 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
79 struct mali_vertex_tiler_prefix *prefix,
80 union midgard_primitive_size *primitive_size)
81 {
82 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
83
84 if (!panfrost_writes_point_size(ctx)) {
85 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
86 rasterizer->base.point_size :
87 rasterizer->base.line_width;
88
89 primitive_size->constant = val;
90 }
91 }
92
93 unsigned
94 panfrost_translate_index_size(unsigned size)
95 {
96 switch (size) {
97 case 1:
98 return MALI_DRAW_INDEXED_UINT8;
99
100 case 2:
101 return MALI_DRAW_INDEXED_UINT16;
102
103 case 4:
104 return MALI_DRAW_INDEXED_UINT32;
105
106 default:
107 unreachable("Invalid index size");
108 }
109 }
110
 111 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
 112  * good for the duration of the draw (transient), though it could last longer.
 113  * Also gets the bounds on the index buffer for the range accessed by the draw. We do
114 * these operations together because there are natural optimizations which
115 * require them to be together. */
116
117 mali_ptr
118 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
119 const struct pipe_draw_info *info,
120 unsigned *min_index, unsigned *max_index)
121 {
122 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
123 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
124 off_t offset = info->start * info->index_size;
125 bool needs_indices = true;
126 mali_ptr out = 0;
127
128 if (info->max_index != ~0u) {
129 *min_index = info->min_index;
130 *max_index = info->max_index;
131 needs_indices = false;
132 }
133
134 if (!info->has_user_indices) {
135 /* Only resources can be directly mapped */
136 panfrost_batch_add_bo(batch, rsrc->bo,
137 PAN_BO_ACCESS_SHARED |
138 PAN_BO_ACCESS_READ |
139 PAN_BO_ACCESS_VERTEX_TILER);
140 out = rsrc->bo->gpu + offset;
141
142 /* Check the cache */
143 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
144 info->start,
145 info->count,
146 min_index,
147 max_index);
148 } else {
149 /* Otherwise, we need to upload to transient memory */
150 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
151 struct panfrost_transfer T =
152 panfrost_pool_alloc_aligned(&batch->pool,
153 info->count * info->index_size,
154 info->index_size);
155
156 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
157 out = T.gpu;
158 }
159
160 if (needs_indices) {
161 /* Fallback */
162 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
163
164 if (!info->has_user_indices)
165 panfrost_minmax_cache_add(rsrc->index_cache,
166 info->start, info->count,
167 *min_index, *max_index);
168 }
169
170 return out;
171 }
172
173 static unsigned
174 translate_tex_wrap(enum pipe_tex_wrap w)
175 {
176 switch (w) {
177 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
178 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
179 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
180 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
181 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
182 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
183 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
184 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
185 default: unreachable("Invalid wrap");
186 }
187 }
188
 189 /* The hardware compares in the wrong order, so we have to flip before
190 * encoding. Yes, really. */
191
192 static enum mali_func
193 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
194 {
195 if (!cso->compare_mode)
196 return MALI_FUNC_NEVER;
197
198 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
199 return panfrost_flip_compare_func(f);
200 }
201
202 static enum mali_mipmap_mode
203 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
204 {
205 switch (f) {
206 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
207 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
208 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
209 default: unreachable("Invalid");
210 }
211 }
212
213 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
214 struct mali_midgard_sampler_packed *hw)
215 {
216 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
217 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
218 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
219 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
220 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
221 cfg.normalized_coordinates = cso->normalized_coords;
222
223 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
224
225 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
226
227 /* If necessary, we disable mipmapping in the sampler descriptor by
228 * clamping the LOD as tight as possible (from 0 to epsilon,
229 * essentially -- remember these are fixed point numbers, so
230 * epsilon=1/256) */
231
232 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
233 cfg.minimum_lod + 1 :
234 FIXED_16(cso->max_lod, false);
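                /* For example, assuming the 1/256 fixed-point step noted above:
                 * with min_lod = 2.0, minimum_lod is 512 and maximum_lod becomes
                 * 513, pinning the LOD range to roughly [2.0, 2.0 + 1/256] and
                 * effectively disabling mipmapping as described. */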
235
236 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
237 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
238 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
239
240 cfg.compare_function = panfrost_sampler_compare_func(cso);
241 cfg.seamless_cube_map = cso->seamless_cube_map;
242
243 cfg.border_color_r = cso->border_color.f[0];
244 cfg.border_color_g = cso->border_color.f[1];
245 cfg.border_color_b = cso->border_color.f[2];
246 cfg.border_color_a = cso->border_color.f[3];
247 }
248 }
249
250 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
251 struct mali_bifrost_sampler_packed *hw)
252 {
253 pan_pack(hw, BIFROST_SAMPLER, cfg) {
254 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
255 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
256 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
257 cfg.normalized_coordinates = cso->normalized_coords;
258
259 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
260 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
261 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
262
263 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
264 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
265 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
266
267 cfg.compare_function = panfrost_sampler_compare_func(cso);
268 cfg.seamless_cube_map = cso->seamless_cube_map;
269 }
270 }
271
272 static bool
273 panfrost_fs_required(
274 struct panfrost_shader_state *fs,
275 struct panfrost_blend_final *blend,
276 unsigned rt_count)
277 {
278 /* If we generally have side effects */
279 if (fs->fs_sidefx)
280 return true;
281
282 /* If colour is written we need to execute */
283 for (unsigned i = 0; i < rt_count; ++i) {
284 if (!blend[i].no_colour)
285 return true;
286 }
287
288 /* If depth is written and not implied we need to execute.
289 * TODO: Predicate on Z/S writes being enabled */
290 return (fs->writes_depth || fs->writes_stencil);
291 }
292
293 static void
294 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
295 struct panfrost_blend_final *blend)
296 {
297 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
298 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
299 unsigned rt_count = batch->key.nr_cbufs;
300
301 struct bifrost_blend_rt *brts = rts;
302
303 /* Disable blending for depth-only */
304
305 if (rt_count == 0) {
306 if (dev->quirks & IS_BIFROST) {
307 memset(brts, 0, sizeof(*brts));
308 brts[0].unk2 = 0x3;
309 } else {
310 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
311 cfg.equation = 0xf0122122; /* Replace */
312 }
313 }
314 }
315
316 for (unsigned i = 0; i < rt_count; ++i) {
317 struct mali_blend_flags_packed flags = {};
318
319 pan_pack(&flags, BLEND_FLAGS, cfg) {
320 if (blend[i].no_colour) {
321 cfg.enable = false;
322 break;
323 }
324
325 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
326
327 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
328 cfg.load_destination = blend[i].load_dest;
329 cfg.dither_disable = !batch->ctx->blend->base.dither;
330
331 if (!(dev->quirks & IS_BIFROST))
332 cfg.midgard_blend_shader = blend[i].is_shader;
333 }
334
335 if (dev->quirks & IS_BIFROST) {
336 memset(brts + i, 0, sizeof(brts[i]));
337 brts[i].flags = flags.opaque[0];
338
339 if (blend[i].is_shader) {
 340 /* The blend shader's address needs to have
 341  * the same top 32 bits as the fragment shader's.
342 * TODO: Ensure that's always the case.
343 */
344 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
345 (fs->bo->gpu & (0xffffffffull << 32)));
346 brts[i].shader = blend[i].shader.gpu;
347 brts[i].unk2 = 0x0;
348 } else {
349 enum pipe_format format = batch->key.cbufs[i]->format;
350 const struct util_format_description *format_desc;
351 format_desc = util_format_description(format);
352
353 brts[i].equation = blend[i].equation.equation;
354
355 /* TODO: this is a bit more complicated */
356 brts[i].constant = blend[i].equation.constant;
357
358 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
359
360 /* 0x19 disables blending and forces REPLACE
361 * mode (equivalent to rgb_mode = alpha_mode =
362 * x122, colour mask = 0xF). 0x1a allows
363 * blending. */
364 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
365
366 brts[i].shader_type = fs->blend_types[i];
367 }
368 } else {
369 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
370 cfg.flags = flags;
371
372 if (blend[i].is_shader) {
373 cfg.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
374 } else {
375 cfg.equation = blend[i].equation.equation.opaque[0];
376 cfg.constant = blend[i].equation.constant;
377 }
378 }
379
380 rts += MALI_MIDGARD_BLEND_LENGTH;
381 }
382 }
383 }
384
385 static void
386 panfrost_emit_frag_shader(struct panfrost_context *ctx,
387 struct mali_state_packed *fragmeta,
388 struct panfrost_blend_final *blend)
389 {
390 const struct panfrost_device *dev = pan_device(ctx->base.screen);
391 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
392 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
393 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
394 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
395 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
396
397 /* Built up here */
398 struct mali_shader_packed shader = fs->shader;
399 struct mali_preload_packed preload = fs->preload;
400 uint32_t properties;
401 struct mali_multisample_misc_packed multisample_misc;
402 struct mali_stencil_mask_misc_packed stencil_mask_misc;
403 union midgard_blend sfbd_blend = { 0 };
404
405 if (!panfrost_fs_required(fs, blend, rt_count)) {
406 if (dev->quirks & IS_BIFROST) {
407 pan_pack(&shader, SHADER, cfg) {}
408
409 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
410 cfg.unknown = 0x950020; /* XXX */
411 cfg.early_z_enable = true;
412 }
413
414 preload.opaque[0] = 0;
415 } else {
416 pan_pack(&shader, SHADER, cfg) {
417 cfg.shader = 0x1;
418 }
419
420 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
421 cfg.work_register_count = 1;
422 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
423 cfg.early_z_enable = true;
424 }
425 }
426 } else if (dev->quirks & IS_BIFROST) {
427 bool no_blend = true;
428
429 for (unsigned i = 0; i < rt_count; ++i)
430 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
431
432 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
433 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
434 }
435
436 /* Combine with prepacked properties */
437 properties |= fs->properties.opaque[0];
438 } else {
439 /* Reasons to disable early-Z from a shader perspective */
440 bool late_z = fs->can_discard || fs->writes_global ||
441 fs->writes_depth || fs->writes_stencil;
442
443 /* If either depth or stencil is enabled, discard matters */
444 bool zs_enabled =
445 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
446 zsa->base.stencil[0].enabled;
447
448 bool has_blend_shader = false;
449
450 for (unsigned c = 0; c < rt_count; ++c)
451 has_blend_shader |= blend[c].is_shader;
452
453 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
454 /* TODO: Reduce this limit? */
455 if (has_blend_shader)
456 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
457 else
458 cfg.work_register_count = fs->work_reg_count;
459
460 cfg.early_z_enable = !(late_z || alpha_to_coverage);
461 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
462 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
463 }
464
465 properties |= fs->properties.opaque[0];
466 }
467
468 pan_pack(&multisample_misc, MULTISAMPLE_MISC, cfg) {
469 bool msaa = rast->multisample;
470 cfg.multisample_enable = msaa;
471 cfg.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF;
472
473 /* EXT_shader_framebuffer_fetch requires per-sample */
474 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
475 cfg.evaluate_per_sample = msaa && per_sample;
476
477 if (dev->quirks & MIDGARD_SFBD) {
478 cfg.sfbd_load_destination = blend[0].load_dest;
479 cfg.sfbd_blend_shader = blend[0].is_shader;
480 }
481
482 cfg.depth_function = zsa->base.depth.enabled ?
483 panfrost_translate_compare_func(zsa->base.depth.func) :
484 MALI_FUNC_ALWAYS;
485
486 cfg.depth_write_mask = zsa->base.depth.writemask;
487 cfg.near_discard = rast->depth_clip_near;
488 cfg.far_discard = rast->depth_clip_far;
489 cfg.unknown_2 = true;
490 }
491
492 pan_pack(&stencil_mask_misc, STENCIL_MASK_MISC, cfg) {
493 cfg.stencil_mask_front = zsa->stencil_mask_front;
494 cfg.stencil_mask_back = zsa->stencil_mask_back;
495 cfg.stencil_enable = zsa->base.stencil[0].enabled;
496 cfg.alpha_to_coverage = alpha_to_coverage;
497
498 if (dev->quirks & MIDGARD_SFBD) {
499 cfg.sfbd_write_enable = !blend[0].no_colour;
500 cfg.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
501 cfg.sfbd_dither_disable = !ctx->blend->base.dither;
502 }
503
504 cfg.unknown_1 = 0x7;
505 cfg.depth_range_1 = cfg.depth_range_2 = rast->offset_tri;
506 cfg.single_sampled_lines = !rast->multisample;
507 }
508
509 if (dev->quirks & MIDGARD_SFBD) {
510 if (blend[0].is_shader) {
511 sfbd_blend.shader = blend[0].shader.gpu |
512 blend[0].shader.first_tag;
513 } else {
514 sfbd_blend.equation = blend[0].equation.equation;
515 sfbd_blend.constant = blend[0].equation.constant;
516 }
517 } else if (!(dev->quirks & IS_BIFROST)) {
518 /* Bug where MRT-capable hw apparently reads the last blend
519 * shader from here instead of the usual location? */
520
521 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
522 if (!blend[rt].is_shader)
523 continue;
524
525 sfbd_blend.shader = blend[rt].shader.gpu |
526 blend[rt].shader.first_tag;
527 break;
528 }
529 }
530
531 pan_pack(fragmeta, STATE_OPAQUE, cfg) {
532 cfg.shader = fs->shader;
533 cfg.properties = properties;
534 cfg.depth_units = rast->offset_units * 2.0f;
535 cfg.depth_factor = rast->offset_scale;
536 cfg.multisample_misc = multisample_misc;
537 cfg.stencil_mask_misc = stencil_mask_misc;
538
539 cfg.stencil_front = zsa->stencil_front;
540 cfg.stencil_back = zsa->stencil_back;
541
542 /* Bottom bits for stencil ref, exactly one word */
543 bool back_enab = zsa->base.stencil[1].enabled;
544 cfg.stencil_front.opaque[0] |= ctx->stencil_ref.ref_value[0];
545 cfg.stencil_back.opaque[0] |= ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
546
547 if (dev->quirks & IS_BIFROST)
548 cfg.preload = preload;
549 else
550 memcpy(&cfg.sfbd_blend, &sfbd_blend, sizeof(sfbd_blend));
551 }
552 }
553
554 mali_ptr
555 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
556 {
557 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
558
559 panfrost_batch_add_bo(batch, ss->bo,
560 PAN_BO_ACCESS_PRIVATE |
561 PAN_BO_ACCESS_READ |
562 PAN_BO_ACCESS_VERTEX_TILER);
563
564 panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
565 PAN_BO_ACCESS_PRIVATE |
566 PAN_BO_ACCESS_READ |
567 PAN_BO_ACCESS_VERTEX_TILER);
568
569 return pan_resource(ss->upload.rsrc)->bo->gpu + ss->upload.offset;
570 }
571
572 mali_ptr
573 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
574 {
575 struct panfrost_context *ctx = batch->ctx;
576 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
577
578 /* Add the shader BO to the batch. */
579 panfrost_batch_add_bo(batch, ss->bo,
580 PAN_BO_ACCESS_PRIVATE |
581 PAN_BO_ACCESS_READ |
582 PAN_BO_ACCESS_FRAGMENT);
583
584 struct panfrost_device *dev = pan_device(ctx->base.screen);
585 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
586 struct panfrost_transfer xfer;
587 unsigned rt_size;
588
589 if (dev->quirks & MIDGARD_SFBD)
590 rt_size = 0;
591 else if (dev->quirks & IS_BIFROST)
592 rt_size = sizeof(struct bifrost_blend_rt);
593 else
594 rt_size = sizeof(struct midgard_blend_rt);
595
596 unsigned desc_size = MALI_STATE_LENGTH + rt_size * rt_count;
597 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, MALI_STATE_LENGTH);
598
599 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
600
601 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
602 blend[c] = panfrost_get_blend_for_context(ctx, c);
603
604 panfrost_emit_frag_shader(ctx, (struct mali_state_packed *) xfer.cpu, blend);
605
606 if (!(dev->quirks & MIDGARD_SFBD))
607 panfrost_emit_blend(batch, xfer.cpu + MALI_STATE_LENGTH, blend);
608 else
609 batch->draws |= PIPE_CLEAR_COLOR0;
610
611 return xfer.gpu;
612 }
613
614 mali_ptr
615 panfrost_emit_viewport(struct panfrost_batch *batch)
616 {
617 struct panfrost_context *ctx = batch->ctx;
618 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
619 const struct pipe_scissor_state *ss = &ctx->scissor;
620 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
621 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
622
623 /* Derive min/max from translate/scale. Note since |x| >= 0 by
624 * definition, we have that -|x| <= |x| hence translate - |scale| <=
625 * translate + |scale|, so the ordering is correct here. */
626 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
627 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
628 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
629 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
630 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
631 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
632
 633 /* Scissor to the intersection of the viewport and the scissor, clamped
634 * to the framebuffer */
635
636 unsigned minx = MIN2(fb->width, vp_minx);
637 unsigned maxx = MIN2(fb->width, vp_maxx);
638 unsigned miny = MIN2(fb->height, vp_miny);
639 unsigned maxy = MIN2(fb->height, vp_maxy);
640
641 if (ss && rast->scissor) {
642 minx = MAX2(ss->minx, minx);
643 miny = MAX2(ss->miny, miny);
644 maxx = MIN2(ss->maxx, maxx);
645 maxy = MIN2(ss->maxy, maxy);
646 }
647
648 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
649
650 pan_pack(T.cpu, VIEWPORT, cfg) {
651 cfg.scissor_minimum_x = minx;
652 cfg.scissor_minimum_y = miny;
653 cfg.scissor_maximum_x = maxx - 1;
654 cfg.scissor_maximum_y = maxy - 1;
655
656 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
657 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
658 }
659
660 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
661 return T.gpu;
662 }
663
664 static mali_ptr
665 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
666 enum pipe_shader_type st,
667 struct panfrost_constant_buffer *buf,
668 unsigned index)
669 {
670 struct pipe_constant_buffer *cb = &buf->cb[index];
671 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
672
673 if (rsrc) {
674 panfrost_batch_add_bo(batch, rsrc->bo,
675 PAN_BO_ACCESS_SHARED |
676 PAN_BO_ACCESS_READ |
677 panfrost_bo_access_for_stage(st));
678
 679 /* Alignment guaranteed by
680 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
681 return rsrc->bo->gpu + cb->buffer_offset;
682 } else if (cb->user_buffer) {
683 return panfrost_pool_upload_aligned(&batch->pool,
684 cb->user_buffer +
685 cb->buffer_offset,
686 cb->buffer_size, 16);
687 } else {
688 unreachable("No constant buffer");
689 }
690 }
691
692 struct sysval_uniform {
693 union {
694 float f[4];
695 int32_t i[4];
696 uint32_t u[4];
697 uint64_t du[2];
698 };
699 };
700
701 static void
702 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
703 struct sysval_uniform *uniform)
704 {
705 struct panfrost_context *ctx = batch->ctx;
706 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
707
708 uniform->f[0] = vp->scale[0];
709 uniform->f[1] = vp->scale[1];
710 uniform->f[2] = vp->scale[2];
711 }
712
713 static void
714 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
715 struct sysval_uniform *uniform)
716 {
717 struct panfrost_context *ctx = batch->ctx;
718 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
719
720 uniform->f[0] = vp->translate[0];
721 uniform->f[1] = vp->translate[1];
722 uniform->f[2] = vp->translate[2];
723 }
724
725 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
726 enum pipe_shader_type st,
727 unsigned int sysvalid,
728 struct sysval_uniform *uniform)
729 {
730 struct panfrost_context *ctx = batch->ctx;
731 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
732 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
733 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
734 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
735
736 assert(dim);
737 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
738
739 if (dim > 1)
740 uniform->i[1] = u_minify(tex->texture->height0,
741 tex->u.tex.first_level);
742
743 if (dim > 2)
744 uniform->i[2] = u_minify(tex->texture->depth0,
745 tex->u.tex.first_level);
746
747 if (is_array)
748 uniform->i[dim] = tex->texture->array_size;
749 }
750
751 static void
752 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
753 enum pipe_shader_type st,
754 unsigned ssbo_id,
755 struct sysval_uniform *uniform)
756 {
757 struct panfrost_context *ctx = batch->ctx;
758
759 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
760 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
761
762 /* Compute address */
763 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
764
765 panfrost_batch_add_bo(batch, bo,
766 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
767 panfrost_bo_access_for_stage(st));
768
769 /* Upload address and size as sysval */
770 uniform->du[0] = bo->gpu + sb.buffer_offset;
771 uniform->u[2] = sb.buffer_size;
772 }
773
774 static void
775 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
776 enum pipe_shader_type st,
777 unsigned samp_idx,
778 struct sysval_uniform *uniform)
779 {
780 struct panfrost_context *ctx = batch->ctx;
781 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
782
783 uniform->f[0] = sampl->min_lod;
784 uniform->f[1] = sampl->max_lod;
785 uniform->f[2] = sampl->lod_bias;
786
787 /* Even without any errata, Midgard represents "no mipmapping" as
788 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
789 * panfrost_create_sampler_state which also explains our choice of
790 * epsilon value (again to keep behaviour consistent) */
791
792 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
793 uniform->f[1] = uniform->f[0] + (1.0/256.0);
794 }
795
796 static void
797 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
798 struct sysval_uniform *uniform)
799 {
800 struct panfrost_context *ctx = batch->ctx;
801
802 uniform->u[0] = ctx->compute_grid->grid[0];
803 uniform->u[1] = ctx->compute_grid->grid[1];
804 uniform->u[2] = ctx->compute_grid->grid[2];
805 }
806
807 static void
808 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
809 struct panfrost_shader_state *ss,
810 enum pipe_shader_type st)
811 {
812 struct sysval_uniform *uniforms = (void *)buf;
813
814 for (unsigned i = 0; i < ss->sysval_count; ++i) {
815 int sysval = ss->sysval[i];
816
817 switch (PAN_SYSVAL_TYPE(sysval)) {
818 case PAN_SYSVAL_VIEWPORT_SCALE:
819 panfrost_upload_viewport_scale_sysval(batch,
820 &uniforms[i]);
821 break;
822 case PAN_SYSVAL_VIEWPORT_OFFSET:
823 panfrost_upload_viewport_offset_sysval(batch,
824 &uniforms[i]);
825 break;
826 case PAN_SYSVAL_TEXTURE_SIZE:
827 panfrost_upload_txs_sysval(batch, st,
828 PAN_SYSVAL_ID(sysval),
829 &uniforms[i]);
830 break;
831 case PAN_SYSVAL_SSBO:
832 panfrost_upload_ssbo_sysval(batch, st,
833 PAN_SYSVAL_ID(sysval),
834 &uniforms[i]);
835 break;
836 case PAN_SYSVAL_NUM_WORK_GROUPS:
837 panfrost_upload_num_work_groups_sysval(batch,
838 &uniforms[i]);
839 break;
840 case PAN_SYSVAL_SAMPLER:
841 panfrost_upload_sampler_sysval(batch, st,
842 PAN_SYSVAL_ID(sysval),
843 &uniforms[i]);
844 break;
845 default:
846 assert(0);
847 }
848 }
849 }
850
851 static const void *
852 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
853 unsigned index)
854 {
855 struct pipe_constant_buffer *cb = &buf->cb[index];
856 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
857
858 if (rsrc)
859 return rsrc->bo->cpu;
860 else if (cb->user_buffer)
861 return cb->user_buffer;
862 else
863 unreachable("No constant buffer");
864 }
865
866 mali_ptr
867 panfrost_emit_const_buf(struct panfrost_batch *batch,
868 enum pipe_shader_type stage,
869 mali_ptr *push_constants)
870 {
871 struct panfrost_context *ctx = batch->ctx;
872 struct panfrost_shader_variants *all = ctx->shader[stage];
873
874 if (!all)
875 return 0;
876
877 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
878
879 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
880
881 /* Uniforms are implicitly UBO #0 */
882 bool has_uniforms = buf->enabled_mask & (1 << 0);
883
884 /* Allocate room for the sysval and the uniforms */
885 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
886 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
887 size_t size = sys_size + uniform_size;
888 struct panfrost_transfer transfer =
889 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
890
891 /* Upload sysvals requested by the shader */
892 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
893
894 /* Upload uniforms */
895 if (has_uniforms && uniform_size) {
896 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
897 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
898 }
899
900 /* Next up, attach UBOs. UBO #0 is the uniforms we just
 901  * uploaded, so it's always included. The count covers up to the highest
 902  * enabled UBO -- gaps are included and emitted as null descriptors. */
903
904 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
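        /* For example, with enabled_mask = 0x9 (UBO 0 and UBO 3 enabled):
         * 32 - __builtin_clz(0x9 | 1) = 32 - 28 = 4, so descriptors are
         * emitted for UBOs 0..3 and the gaps (1 and 2) are written as null
         * entries in the loop below. */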
905
906 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
907 struct panfrost_transfer ubos =
908 panfrost_pool_alloc_aligned(&batch->pool, sz,
909 MALI_UNIFORM_BUFFER_LENGTH);
910
911 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
912
913 /* Upload uniforms as a UBO */
914
915 if (size) {
916 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
917 cfg.entries = DIV_ROUND_UP(size, 16);
918 cfg.pointer = transfer.gpu;
919 }
920 } else {
921 *ubo_ptr = 0;
922 }
923
924 /* The rest are honest-to-goodness UBOs */
925
926 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
927 size_t usz = buf->cb[ubo].buffer_size;
928 bool enabled = buf->enabled_mask & (1 << ubo);
929 bool empty = usz == 0;
930
931 if (!enabled || empty) {
932 ubo_ptr[ubo] = 0;
933 continue;
934 }
935
936 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
937 cfg.entries = DIV_ROUND_UP(usz, 16);
938 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
939 stage, buf, ubo);
940 }
941 }
942
943 *push_constants = transfer.gpu;
944
945 buf->dirty_mask = 0;
946 return ubos.gpu;
947 }
948
949 mali_ptr
950 panfrost_emit_shared_memory(struct panfrost_batch *batch,
951 const struct pipe_grid_info *info)
952 {
953 struct panfrost_context *ctx = batch->ctx;
954 struct panfrost_device *dev = pan_device(ctx->base.screen);
955 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
956 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
957 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
958 128));
959
960 unsigned log2_instances =
961 util_logbase2_ceil(info->grid[0]) +
962 util_logbase2_ceil(info->grid[1]) +
963 util_logbase2_ceil(info->grid[2]);
964
965 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
966 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
967 shared_size,
968 1);
969
970 struct mali_shared_memory shared = {
971 .shared_memory = bo->gpu,
972 .shared_workgroup_count = log2_instances,
973 .shared_shift = util_logbase2(single_size) + 1
974 };
975
976 return panfrost_pool_upload_aligned(&batch->pool, &shared,
977 sizeof(shared), 64);
978 }
979
980 static mali_ptr
981 panfrost_get_tex_desc(struct panfrost_batch *batch,
982 enum pipe_shader_type st,
983 struct panfrost_sampler_view *view)
984 {
985 if (!view)
986 return (mali_ptr) 0;
987
988 struct pipe_sampler_view *pview = &view->base;
989 struct panfrost_resource *rsrc = pan_resource(pview->texture);
990
991 /* Add the BO to the job so it's retained until the job is done. */
992
993 panfrost_batch_add_bo(batch, rsrc->bo,
994 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
995 panfrost_bo_access_for_stage(st));
996
997 panfrost_batch_add_bo(batch, view->bo,
998 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
999 panfrost_bo_access_for_stage(st));
1000
1001 return view->bo->gpu;
1002 }
1003
1004 static void
1005 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1006 struct pipe_context *pctx)
1007 {
1008 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1009 if (view->texture_bo != rsrc->bo->gpu ||
1010 view->modifier != rsrc->modifier) {
1011 panfrost_bo_unreference(view->bo);
1012 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1013 }
1014 }
1015
1016 mali_ptr
1017 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1018 enum pipe_shader_type stage)
1019 {
1020 struct panfrost_context *ctx = batch->ctx;
1021 struct panfrost_device *device = pan_device(ctx->base.screen);
1022
1023 if (!ctx->sampler_view_count[stage])
1024 return 0;
1025
1026 if (device->quirks & IS_BIFROST) {
1027 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1028 MALI_BIFROST_TEXTURE_LENGTH *
1029 ctx->sampler_view_count[stage],
1030 MALI_BIFROST_TEXTURE_LENGTH);
1031
1032 struct mali_bifrost_texture_packed *out =
1033 (struct mali_bifrost_texture_packed *) T.cpu;
1034
1035 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1036 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1037 struct pipe_sampler_view *pview = &view->base;
1038 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1039
1040 panfrost_update_sampler_view(view, &ctx->base);
1041 out[i] = view->bifrost_descriptor;
1042
1043 /* Add the BOs to the job so they are retained until the job is done. */
1044
1045 panfrost_batch_add_bo(batch, rsrc->bo,
1046 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1047 panfrost_bo_access_for_stage(stage));
1048
1049 panfrost_batch_add_bo(batch, view->bo,
1050 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1051 panfrost_bo_access_for_stage(stage));
1052 }
1053
1054 return T.gpu;
1055 } else {
1056 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1057
1058 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1059 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1060
1061 panfrost_update_sampler_view(view, &ctx->base);
1062
1063 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1064 }
1065
1066 return panfrost_pool_upload_aligned(&batch->pool, trampolines,
1067 sizeof(uint64_t) *
1068 ctx->sampler_view_count[stage],
1069 sizeof(uint64_t));
1070 }
1071 }
1072
1073 mali_ptr
1074 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1075 enum pipe_shader_type stage)
1076 {
1077 struct panfrost_context *ctx = batch->ctx;
1078
1079 if (!ctx->sampler_count[stage])
1080 return 0;
1081
1082 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1083 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1084
1085 size_t sz = desc_size * ctx->sampler_count[stage];
1086 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1087 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1088
1089 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1090 out[i] = ctx->samplers[stage][i]->hw;
1091
1092 return T.gpu;
1093 }
1094
1095 mali_ptr
1096 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1097 mali_ptr *buffers)
1098 {
1099 struct panfrost_context *ctx = batch->ctx;
1100 struct panfrost_vertex_state *so = ctx->vertex;
1101 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1102
1103 /* Worst case: everything is NPOT, which is only possible if instancing
 1104  * is enabled. Otherwise a single record is guaranteed */
1105 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1106 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1107 (ctx->instance_count > 1 ? 2 : 1),
1108 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1109
1110 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1111 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1112 MALI_ATTRIBUTE_LENGTH);
1113
1114 struct mali_attribute_buffer_packed *bufs =
1115 (struct mali_attribute_buffer_packed *) S.cpu;
1116
1117 struct mali_attribute_packed *out =
1118 (struct mali_attribute_packed *) T.cpu;
1119
1120 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1121 unsigned k = 0;
1122
1123 for (unsigned i = 0; i < so->num_elements; ++i) {
1124 /* We map buffers 1:1 with the attributes, which
1125 * means duplicating some vertex buffers (who cares? aside from
1126 * maybe some caching implications but I somehow doubt that
1127 * matters) */
1128
1129 struct pipe_vertex_element *elem = &so->pipe[i];
1130 unsigned vbi = elem->vertex_buffer_index;
1131 attrib_to_buffer[i] = k;
1132
1133 if (!(ctx->vb_mask & (1 << vbi)))
1134 continue;
1135
1136 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1137 struct panfrost_resource *rsrc;
1138
1139 rsrc = pan_resource(buf->buffer.resource);
1140 if (!rsrc)
1141 continue;
1142
1143 /* Add a dependency of the batch on the vertex buffer */
1144 panfrost_batch_add_bo(batch, rsrc->bo,
1145 PAN_BO_ACCESS_SHARED |
1146 PAN_BO_ACCESS_READ |
1147 PAN_BO_ACCESS_VERTEX_TILER);
1148
1149 /* Mask off lower bits, see offset fixup below */
1150 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1151 mali_ptr addr = raw_addr & ~63;
1152
1153 /* Since we advanced the base pointer, we shrink the buffer
1154 * size, but add the offset we subtracted */
1155 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1156 - buf->buffer_offset;
1157
1158 /* When there is a divisor, the hardware-level divisor is
1159 * the product of the instance divisor and the padded count */
1160 unsigned divisor = elem->instance_divisor;
1161 unsigned hw_divisor = ctx->padded_count * divisor;
1162 unsigned stride = buf->stride;
1163
 1164 /* If there's a divisor (of any value) but no instancing, we want every
1165 * attribute to be the same */
1166
1167 if (divisor && ctx->instance_count == 1)
1168 stride = 0;
1169
1170 if (!divisor || ctx->instance_count <= 1) {
1171 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1172 if (ctx->instance_count > 1) {
1173 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1174 cfg.divisor = ctx->padded_count;
1175 }
1176
1177 cfg.pointer = addr;
1178 cfg.stride = stride;
1179 cfg.size = size;
1180 }
1181 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1182 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1183 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1184 cfg.pointer = addr;
1185 cfg.stride = stride;
1186 cfg.size = size;
1187 cfg.divisor_r = __builtin_ctz(hw_divisor);
1188 }
1189
1190 } else {
1191 unsigned shift = 0, extra_flags = 0;
1192
1193 unsigned magic_divisor =
1194 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1195
1196 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1197 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1198 cfg.pointer = addr;
1199 cfg.stride = stride;
1200 cfg.size = size;
1201
1202 cfg.divisor_r = shift;
1203 cfg.divisor_e = extra_flags;
1204 }
1205
1206 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1207 cfg.divisor_numerator = magic_divisor;
1208 cfg.divisor = divisor;
1209 }
1210
1211 ++k;
1212 }
1213
1214 ++k;
1215 }
1216
1217 /* Add special gl_VertexID/gl_InstanceID buffers */
1218
1219 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1220 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1221
1222 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1223 cfg.buffer_index = k++;
1224 cfg.format = so->formats[PAN_VERTEX_ID];
1225 }
1226
1227 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1228
1229 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1230 cfg.buffer_index = k++;
1231 cfg.format = so->formats[PAN_INSTANCE_ID];
1232 }
1233 }
1234
1235 /* Attribute addresses require 64-byte alignment, so let:
1236 *
1237 * base' = base & ~63 = base - (base & 63)
1238 * offset' = offset + (base & 63)
1239 *
1240 * Since base' + offset' = base + offset, these are equivalent
 1241  * addressing modes, and now base' is 64-byte aligned.
1242 */
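        /* For example, with base = 0x10034 and offset = 8: base & 63 = 0x34,
         * so base' = 0x10000 and offset' = 8 + 0x34 = 0x3C. The sum is
         * unchanged (0x1003C) but base' is now 64-byte aligned. */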
1243
1244 for (unsigned i = 0; i < so->num_elements; ++i) {
1245 unsigned vbi = so->pipe[i].vertex_buffer_index;
1246 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1247
1248 /* Adjust by the masked off bits of the offset. Make sure we
1249 * read src_offset from so->hw (which is not GPU visible)
1250 * rather than target (which is) due to caching effects */
1251
1252 unsigned src_offset = so->pipe[i].src_offset;
1253
1254 /* BOs aligned to 4k so guaranteed aligned to 64 */
1255 src_offset += (buf->buffer_offset & 63);
1256
 1257 /* Also, somewhat obscurely, per-instance data needs to be
1258 * offset in response to a delayed start in an indexed draw */
1259
1260 if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
1261 src_offset -= buf->stride * ctx->offset_start;
1262
1263 pan_pack(out + i, ATTRIBUTE, cfg) {
1264 cfg.buffer_index = attrib_to_buffer[i];
1265 cfg.format = so->formats[i];
1266 cfg.offset = src_offset;
1267 }
1268 }
1269
1270 *buffers = S.gpu;
1271 return T.gpu;
1272 }
1273
1274 static mali_ptr
1275 panfrost_emit_varyings(struct panfrost_batch *batch,
1276 struct mali_attribute_buffer_packed *slot,
1277 unsigned stride, unsigned count)
1278 {
1279 unsigned size = stride * count;
1280 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1281
1282 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1283 cfg.stride = stride;
1284 cfg.size = size;
1285 cfg.pointer = ptr;
1286 }
1287
1288 return ptr;
1289 }
1290
1291 static unsigned
1292 panfrost_streamout_offset(unsigned stride, unsigned offset,
1293 struct pipe_stream_output_target *target)
1294 {
1295 return (target->buffer_offset + (offset * stride * 4)) & 63;
1296 }
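/* For example, with buffer_offset = 0x104, a stride of 4 words (16 bytes) and
 * an offset of 2 vertices: the byte offset is 0x104 + 2 * 16 = 0x124, so this
 * returns 0x124 & 63 = 0x24. panfrost_emit_streamout() below masks the address
 * down to the preceding 64-byte boundary (BOs are at least 64-byte aligned)
 * and grows the size by those same 0x24 bytes. */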
1297
1298 static void
1299 panfrost_emit_streamout(struct panfrost_batch *batch,
1300 struct mali_attribute_buffer_packed *slot,
1301 unsigned stride_words, unsigned offset, unsigned count,
1302 struct pipe_stream_output_target *target)
1303 {
1304 unsigned stride = stride_words * 4;
1305 unsigned max_size = target->buffer_size;
1306 unsigned expected_size = stride * count;
1307
1308 /* Grab the BO and bind it to the batch */
1309 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1310
1311 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1312 * the perspective of the TILER and FRAGMENT.
1313 */
1314 panfrost_batch_add_bo(batch, bo,
1315 PAN_BO_ACCESS_SHARED |
1316 PAN_BO_ACCESS_RW |
1317 PAN_BO_ACCESS_VERTEX_TILER |
1318 PAN_BO_ACCESS_FRAGMENT);
1319
1320 /* We will have an offset applied to get alignment */
1321 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1322
1323 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1324 cfg.pointer = (addr & ~63);
1325 cfg.stride = stride;
1326 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1327 }
1328 }
1329
1330 /* Helpers for manipulating stream out information so we can pack varyings
1331 * accordingly. Compute the src_offset for a given captured varying */
1332
1333 static struct pipe_stream_output *
1334 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1335 {
1336 for (unsigned i = 0; i < info->num_outputs; ++i) {
1337 if (info->output[i].register_index == loc)
1338 return &info->output[i];
1339 }
1340
1341 unreachable("Varying not captured");
1342 }
1343
1344 static unsigned
1345 pan_varying_size(enum mali_format fmt)
1346 {
1347 unsigned type = MALI_EXTRACT_TYPE(fmt);
1348 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1349 unsigned bits = MALI_EXTRACT_BITS(fmt);
1350 unsigned bpc = 0;
1351
1352 if (bits == MALI_CHANNEL_FLOAT) {
1353 /* No doubles */
1354 bool fp16 = (type == MALI_FORMAT_SINT);
1355 assert(fp16 || (type == MALI_FORMAT_UNORM));
1356
1357 bpc = fp16 ? 2 : 4;
1358 } else {
1359 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1360
1361 /* See the enums */
1362 bits = 1 << bits;
1363 assert(bits >= 8);
1364 bpc = bits / 8;
1365 }
1366
1367 return bpc * chan;
1368 }
1369
1370 /* Indices for named (non-XFB) varyings that are present. These are packed
1371 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1372 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
 1373  * PAN_VARY_*). This has the nice property that you can look up the buffer index
1374 *
1375 * idx = popcount(P & ((1 << S) - 1))
1376 *
 1377  * That is, look at all of the varyings that come earlier and count them; that
 1378  * count is the new index. Likewise, the total number of special
1379 * buffers required is simply popcount(P)
1380 */
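/* For example, if P contains GENERAL, POSITION and PSIZ (P = 0b111), the
 * buffer index of PSIZ (S = 2) is popcount(0b111 & 0b011) = 2, and
 * popcount(P) = 3 special buffers are required in total. */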
1381
1382 enum pan_special_varying {
1383 PAN_VARY_GENERAL = 0,
1384 PAN_VARY_POSITION = 1,
1385 PAN_VARY_PSIZ = 2,
1386 PAN_VARY_PNTCOORD = 3,
1387 PAN_VARY_FACE = 4,
1388 PAN_VARY_FRAGCOORD = 5,
1389
1390 /* Keep last */
1391 PAN_VARY_MAX,
1392 };
1393
 1394 /* Given a varying, figure out which index it corresponds to */
1395
1396 static inline unsigned
1397 pan_varying_index(unsigned present, enum pan_special_varying v)
1398 {
1399 unsigned mask = (1 << v) - 1;
1400 return util_bitcount(present & mask);
1401 }
1402
1403 /* Get the base offset for XFB buffers, which by convention come after
1404 * everything else. Wrapper function for semantic reasons; by construction this
1405 * is just popcount. */
1406
1407 static inline unsigned
1408 pan_xfb_base(unsigned present)
1409 {
1410 return util_bitcount(present);
1411 }
1412
1413 /* Computes the present mask for varyings so we can start emitting varying records */
1414
1415 static inline unsigned
1416 pan_varying_present(
1417 struct panfrost_shader_state *vs,
1418 struct panfrost_shader_state *fs,
1419 unsigned quirks,
1420 uint16_t point_coord_mask)
1421 {
1422 /* At the moment we always emit general and position buffers. Not
1423 * strictly necessary but usually harmless */
1424
1425 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1426
1427 /* Enable special buffers by the shader info */
1428
1429 if (vs->writes_point_size)
1430 present |= (1 << PAN_VARY_PSIZ);
1431
1432 if (fs->reads_point_coord)
1433 present |= (1 << PAN_VARY_PNTCOORD);
1434
1435 if (fs->reads_face)
1436 present |= (1 << PAN_VARY_FACE);
1437
1438 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1439 present |= (1 << PAN_VARY_FRAGCOORD);
1440
1441 /* Also, if we have a point sprite, we need a point coord buffer */
1442
1443 for (unsigned i = 0; i < fs->varying_count; i++) {
1444 gl_varying_slot loc = fs->varyings_loc[i];
1445
1446 if (util_varying_is_point_coord(loc, point_coord_mask))
1447 present |= (1 << PAN_VARY_PNTCOORD);
1448 }
1449
1450 return present;
1451 }
1452
1453 /* Emitters for varying records */
1454
1455 static void
1456 pan_emit_vary(struct mali_attribute_packed *out,
1457 unsigned present, enum pan_special_varying buf,
1458 unsigned quirks, enum mali_format format,
1459 unsigned offset)
1460 {
1461 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1462 unsigned swizzle = quirks & HAS_SWIZZLES ?
1463 panfrost_get_default_swizzle(nr_channels) :
1464 panfrost_bifrost_swizzle(nr_channels);
1465
1466 pan_pack(out, ATTRIBUTE, cfg) {
1467 cfg.buffer_index = pan_varying_index(present, buf);
1468 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1469 cfg.format = (format << 12) | swizzle;
1470 cfg.offset = offset;
1471 }
1472 }
1473
1474 /* General varying that is unused */
1475
1476 static void
1477 pan_emit_vary_only(struct mali_attribute_packed *out,
1478 unsigned present, unsigned quirks)
1479 {
1480 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1481 }
1482
1483 /* Special records */
1484
1485 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1486 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1487 [PAN_VARY_PSIZ] = MALI_R16F,
1488 [PAN_VARY_PNTCOORD] = MALI_R16F,
1489 [PAN_VARY_FACE] = MALI_R32I,
1490 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1491 };
1492
1493 static void
1494 pan_emit_vary_special(struct mali_attribute_packed *out,
1495 unsigned present, enum pan_special_varying buf,
1496 unsigned quirks)
1497 {
1498 assert(buf < PAN_VARY_MAX);
1499 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1500 }
1501
1502 static enum mali_format
1503 pan_xfb_format(enum mali_format format, unsigned nr)
1504 {
1505 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1506 return MALI_R32F | MALI_NR_CHANNELS(nr);
1507 else
1508 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1509 }
1510
1511 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1512 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1513 * value. */
1514
1515 static void
1516 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1517 unsigned present,
1518 unsigned max_xfb,
1519 unsigned *streamout_offsets,
1520 unsigned quirks,
1521 enum mali_format format,
1522 struct pipe_stream_output o)
1523 {
1524 unsigned swizzle = quirks & HAS_SWIZZLES ?
1525 panfrost_get_default_swizzle(o.num_components) :
1526 panfrost_bifrost_swizzle(o.num_components);
1527
1528 pan_pack(out, ATTRIBUTE, cfg) {
1529 /* XFB buffers come after everything else */
1530 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1531 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1532
1533 /* Override number of channels and precision to highp */
1534 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1535
1536 /* Apply given offsets together */
1537 cfg.offset = (o.dst_offset * 4) /* dwords */
1538 + streamout_offsets[o.output_buffer];
1539 }
1540 }
1541
1542 /* Determine if we should capture a varying for XFB. This requires actually
 1543  * having a buffer for it. If we don't capture it, we'll fall back to a general
1544 * varying path (linked or unlinked, possibly discarding the write) */
1545
1546 static bool
1547 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1548 unsigned loc, unsigned max_xfb)
1549 {
1550 if (!(xfb->so_mask & (1ll << loc)))
1551 return false;
1552
1553 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1554 return o->output_buffer < max_xfb;
1555 }
1556
1557 static void
1558 pan_emit_general_varying(struct mali_attribute_packed *out,
1559 struct panfrost_shader_state *other,
1560 struct panfrost_shader_state *xfb,
1561 gl_varying_slot loc,
1562 enum mali_format format,
1563 unsigned present,
1564 unsigned quirks,
1565 unsigned *gen_offsets,
1566 enum mali_format *gen_formats,
1567 unsigned *gen_stride,
1568 unsigned idx,
1569 bool should_alloc)
1570 {
1571 /* Check if we're linked */
1572 signed other_idx = -1;
1573
1574 for (unsigned j = 0; j < other->varying_count; ++j) {
1575 if (other->varyings_loc[j] == loc) {
1576 other_idx = j;
1577 break;
1578 }
1579 }
1580
1581 if (other_idx < 0) {
1582 pan_emit_vary_only(out, present, quirks);
1583 return;
1584 }
1585
1586 unsigned offset = gen_offsets[other_idx];
1587
1588 if (should_alloc) {
1589 /* We're linked, so allocate a space via a watermark allocation */
1590 enum mali_format alt = other->varyings[other_idx];
1591
1592 /* Do interpolation at minimum precision */
1593 unsigned size_main = pan_varying_size(format);
1594 unsigned size_alt = pan_varying_size(alt);
1595 unsigned size = MIN2(size_main, size_alt);
1596
1597 /* If a varying is marked for XFB but not actually captured, we
1598 * should match the format to the format that would otherwise
1599 * be used for XFB, since dEQP checks for invariance here. It's
1600 * unclear if this is required by the spec. */
1601
1602 if (xfb->so_mask & (1ull << loc)) {
1603 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1604 format = pan_xfb_format(format, o->num_components);
1605 size = pan_varying_size(format);
1606 } else if (size == size_alt) {
1607 format = alt;
1608 }
1609
1610 gen_offsets[idx] = *gen_stride;
1611 gen_formats[other_idx] = format;
1612 offset = *gen_stride;
1613 *gen_stride += size;
1614 }
1615
1616 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1617 }
1618
1619 /* Higher-level wrapper around all of the above, classifying a varying into one
1620 * of the above types */
1621
1622 static void
1623 panfrost_emit_varying(
1624 struct mali_attribute_packed *out,
1625 struct panfrost_shader_state *stage,
1626 struct panfrost_shader_state *other,
1627 struct panfrost_shader_state *xfb,
1628 unsigned present,
1629 uint16_t point_sprite_mask,
1630 unsigned max_xfb,
1631 unsigned *streamout_offsets,
1632 unsigned quirks,
1633 unsigned *gen_offsets,
1634 enum mali_format *gen_formats,
1635 unsigned *gen_stride,
1636 unsigned idx,
1637 bool should_alloc,
1638 bool is_fragment)
1639 {
1640 gl_varying_slot loc = stage->varyings_loc[idx];
1641 enum mali_format format = stage->varyings[idx];
1642
1643 /* Override format to match linkage */
1644 if (!should_alloc && gen_formats[idx])
1645 format = gen_formats[idx];
1646
1647 if (util_varying_is_point_coord(loc, point_sprite_mask)) {
1648 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1649 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1650 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1651 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1652 } else if (loc == VARYING_SLOT_POS) {
1653 if (is_fragment)
1654 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1655 else
1656 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1657 } else if (loc == VARYING_SLOT_PSIZ) {
1658 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1659 } else if (loc == VARYING_SLOT_PNTC) {
1660 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1661 } else if (loc == VARYING_SLOT_FACE) {
1662 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1663 } else {
1664 pan_emit_general_varying(out, other, xfb, loc, format, present,
1665 quirks, gen_offsets, gen_formats, gen_stride,
1666 idx, should_alloc);
1667 }
1668 }
1669
1670 static void
1671 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1672 unsigned present,
1673 enum pan_special_varying v,
1674 unsigned special)
1675 {
1676 if (present & (1 << v)) {
1677 unsigned idx = pan_varying_index(present, v);
1678
1679 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1680 cfg.special = special;
1681 cfg.type = 0;
1682 }
1683 }
1684 }
1685
1686 void
1687 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1688 unsigned vertex_count,
1689 mali_ptr *vs_attribs,
1690 mali_ptr *fs_attribs,
1691 mali_ptr *buffers,
1692 mali_ptr *position,
1693 mali_ptr *psiz)
1694 {
1695 /* Load the shaders */
1696 struct panfrost_context *ctx = batch->ctx;
1697 struct panfrost_device *dev = pan_device(ctx->base.screen);
1698 struct panfrost_shader_state *vs, *fs;
1699 size_t vs_size, fs_size;
1700
1701 /* Allocate the varying descriptor */
1702
1703 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1704 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1705 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1706 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1707
1708 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1709 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1710
1711 struct pipe_stream_output_info *so = &vs->stream_output;
1712 uint16_t point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;
1713 unsigned present = pan_varying_present(vs, fs, dev->quirks, point_coord_mask);
1714
1715 /* Check if this varying is linked by us. This is the case for
1716 * general-purpose, non-captured varyings. If it is, link it. If it's
1717 * not, use the provided stream out information to determine the
1718 * offset, since it was already linked for us. */
1719
1720 unsigned gen_offsets[32];
1721 enum mali_format gen_formats[32];
1722 memset(gen_offsets, 0, sizeof(gen_offsets));
1723 memset(gen_formats, 0, sizeof(gen_formats));
1724
1725 unsigned gen_stride = 0;
1726 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1727 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1728
1729 unsigned streamout_offsets[32];
1730
1731 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1732 streamout_offsets[i] = panfrost_streamout_offset(
1733 so->stride[i],
1734 ctx->streamout.offsets[i],
1735 ctx->streamout.targets[i]);
1736 }
1737
1738 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1739 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1740
1741 for (unsigned i = 0; i < vs->varying_count; i++) {
1742 panfrost_emit_varying(ovs + i, vs, fs, vs, present, 0,
1743 ctx->streamout.num_targets, streamout_offsets,
1744 dev->quirks,
1745 gen_offsets, gen_formats, &gen_stride, i, true, false);
1746 }
1747
1748 for (unsigned i = 0; i < fs->varying_count; i++) {
1749 panfrost_emit_varying(ofs + i, fs, vs, vs, present, point_coord_mask,
1750 ctx->streamout.num_targets, streamout_offsets,
1751 dev->quirks,
1752 gen_offsets, gen_formats, &gen_stride, i, false, true);
1753 }
1754
1755 unsigned xfb_base = pan_xfb_base(present);
1756 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1757 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1758 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1759 struct mali_attribute_buffer_packed *varyings =
1760 (struct mali_attribute_buffer_packed *) T.cpu;
1761
1762 /* Emit the stream out buffers */
1763
1764 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1765 ctx->vertex_count);
1766
1767 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1768 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1769 so->stride[i],
1770 ctx->streamout.offsets[i],
1771 out_count,
1772 ctx->streamout.targets[i]);
1773 }
1774
1775 panfrost_emit_varyings(batch,
1776 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1777 gen_stride, vertex_count);
1778
1779 /* fp32 vec4 gl_Position */
1780 *position = panfrost_emit_varyings(batch,
1781 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1782 sizeof(float) * 4, vertex_count);
1783
1784 if (present & (1 << PAN_VARY_PSIZ)) {
1785 *psiz = panfrost_emit_varyings(batch,
1786 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1787 2, vertex_count);
1788 }
1789
1790 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1791 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1792 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
1793
1794 *buffers = T.gpu;
1795 *vs_attribs = trans.gpu;
1796 *fs_attribs = trans.gpu + vs_size;
1797 }
1798
1799 void
1800 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1801 struct mali_vertex_tiler_prefix *vertex_prefix,
1802 struct mali_draw_packed *vertex_draw,
1803 struct mali_vertex_tiler_prefix *tiler_prefix,
1804 struct mali_draw_packed *tiler_draw,
1805 union midgard_primitive_size *primitive_size)
1806 {
1807 struct panfrost_context *ctx = batch->ctx;
1808 struct panfrost_device *device = pan_device(ctx->base.screen);
1809 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
1810 struct bifrost_payload_vertex bifrost_vertex = {0,};
1811 struct bifrost_payload_tiler bifrost_tiler = {0,};
1812 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1813 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1814 void *vp, *tp;
1815 size_t vp_size, tp_size;
1816
1817 if (device->quirks & IS_BIFROST) {
1818 bifrost_vertex.prefix = *vertex_prefix;
1819 memcpy(&bifrost_vertex.postfix, vertex_draw, MALI_DRAW_LENGTH);
1820 vp = &bifrost_vertex;
1821 vp_size = sizeof(bifrost_vertex);
1822
1823 bifrost_tiler.prefix = *tiler_prefix;
1824 bifrost_tiler.tiler.primitive_size = *primitive_size;
1825 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1826 memcpy(&bifrost_tiler.postfix, tiler_draw, MALI_DRAW_LENGTH);
1827 tp = &bifrost_tiler;
1828 tp_size = sizeof(bifrost_tiler);
1829 } else {
1830 midgard_vertex.prefix = *vertex_prefix;
1831 memcpy(&midgard_vertex.postfix, vertex_draw, MALI_DRAW_LENGTH);
1832 vp = &midgard_vertex;
1833 vp_size = sizeof(midgard_vertex);
1834
1835 midgard_tiler.prefix = *tiler_prefix;
1836 memcpy(&midgard_tiler.postfix, tiler_draw, MALI_DRAW_LENGTH);
1837 midgard_tiler.primitive_size = *primitive_size;
1838 tp = &midgard_tiler;
1839 tp_size = sizeof(midgard_tiler);
1840 }
1841
1842 if (wallpapering) {
1843 /* Inject in reverse order, with "predicted" job indices.
1844 * THIS IS A HACK XXX */
1845 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
1846 batch->scoreboard.job_index + 2, tp, tp_size, true);
1847 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1848 vp, vp_size, true);
1849 return;
1850 }
1851
 1852 /* If rasterizer discard is enabled, only submit the vertex job */
1853
1854 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1855 vp, vp_size, false);
1856
1857 if (ctx->rasterizer->base.rasterizer_discard)
1858 return;
1859
1860 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
1861 false);
1862 }
1863
1864 /* TODO: stop hardcoding this */
1865 mali_ptr
1866 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1867 {
1868 uint16_t locations[] = {
1869 128, 128,
1870 0, 256,
1871 0, 256,
1872 0, 256,
1873 0, 256,
1874 0, 256,
1875 0, 256,
1876 0, 256,
1877 0, 256,
1878 0, 256,
1879 0, 256,
1880 0, 256,
1881 0, 256,
1882 0, 256,
1883 0, 256,
1884 0, 256,
1885 0, 256,
1886 0, 256,
1887 0, 256,
1888 0, 256,
1889 0, 256,
1890 0, 256,
1891 0, 256,
1892 0, 256,
1893 0, 256,
1894 0, 256,
1895 0, 256,
1896 0, 256,
1897 0, 256,
1898 0, 256,
1899 0, 256,
1900 0, 256,
1901 128, 128,
1902 0, 0,
1903 0, 0,
1904 0, 0,
1905 0, 0,
1906 0, 0,
1907 0, 0,
1908 0, 0,
1909 0, 0,
1910 0, 0,
1911 0, 0,
1912 0, 0,
1913 0, 0,
1914 0, 0,
1915 0, 0,
1916 0, 0,
1917 };
1918
1919 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
1920 }