src/gallium/drivers/panfrost/pan_cmdstream.c

   1 /*
   2  * Copyright (C) 2018 Alyssa Rosenzweig
   3  * Copyright (C) 2020 Collabora Ltd.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "util/macros.h"
  26 #include "util/u_prim.h"
  27 #include "util/u_vbuf.h"
  28 #include "util/u_helpers.h"
  29
  30 #include "panfrost-quirks.h"
  31
  32 #include "pan_pool.h"
  33 #include "pan_bo.h"
  34 #include "pan_cmdstream.h"
  35 #include "pan_context.h"
  36 #include "pan_job.h"
  37
  38 /* If a BO is accessed for a particular shader stage, will it be in the primary
  39  * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
  40  * fragment will be primary, e.g. compute jobs will be considered
  41  * "vertex/tiler" by analogy */
  42
  43 static inline uint32_t
  44 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
  45 {
  46         assert(stage == PIPE_SHADER_FRAGMENT ||
  47                stage == PIPE_SHADER_VERTEX ||
  48                stage == PIPE_SHADER_COMPUTE);
  49
  50         return stage == PIPE_SHADER_FRAGMENT ?
  51                PAN_BO_ACCESS_FRAGMENT :
  52                PAN_BO_ACCESS_VERTEX_TILER;
  53 }
  54
  55 mali_ptr
  56 panfrost_vt_emit_shared_memory(struct panfrost_batch *batch)
  57 {
  58         struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
  59
  60         struct mali_shared_memory shared = {
  61                 .shared_workgroup_count = ~0,
  62         };
  63
  64         if (batch->stack_size) {
  65                 struct panfrost_bo *stack =
  66                         panfrost_batch_get_scratchpad(batch, batch->stack_size,
  67                                         dev->thread_tls_alloc,
  68                                         dev->core_count);
  69
  70                 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
  71                 shared.scratchpad = stack->gpu;
  72         }
  73
  74         return panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
  75 }
  76
  77 void
  78 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
  79                                   bool points,
  80                                   union midgard_primitive_size *primitive_size)
  81 {
  82         struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
  83
  84         if (!panfrost_writes_point_size(ctx)) {
  85                 float val = points ?
  86                               rasterizer->base.point_size :
  87                               rasterizer->base.line_width;
  88
  89                 primitive_size->constant = val;
  90         }
  91 }
  92
  93 /* Gets a GPU address for the associated index buffer. Only gauranteed to be
  94  * good for the duration of the draw (transient), could last longer. Also get
  95  * the bounds on the index buffer for the range accessed by the draw. We do
  96  * these operations together because there are natural optimizations which
  97  * require them to be together. */
  98
  99 mali_ptr
 100 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
 101                                   const struct pipe_draw_info *info,
 102                                   unsigned *min_index, unsigned *max_index)
 103 {
 104         struct panfrost_resource *rsrc = pan_resource(info->index.resource);
 105         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
 106         off_t offset = info->start * info->index_size;
 107         bool needs_indices = true;
 108         mali_ptr out = 0;
 109
 110         if (info->max_index != ~0u) {
 111                 *min_index = info->min_index;
 112                 *max_index = info->max_index;
 113                 needs_indices = false;
 114         }
 115
 116         if (!info->has_user_indices) {
 117                 /* Only resources can be directly mapped */
 118                 panfrost_batch_add_bo(batch, rsrc->bo,
 119                                       PAN_BO_ACCESS_SHARED |
 120                                       PAN_BO_ACCESS_READ |
 121                                       PAN_BO_ACCESS_VERTEX_TILER);
 122                 out = rsrc->bo->gpu + offset;
 123
 124                 /* Check the cache */
 125                 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
 126                                                            info->start,
 127                                                            info->count,
 128                                                            min_index,
 129                                                            max_index);
 130         } else {
 131                 /* Otherwise, we need to upload to transient memory */
 132                 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
 133                 struct panfrost_transfer T =
 134                         panfrost_pool_alloc_aligned(&batch->pool,
 135                                 info->count * info->index_size,
 136                                 info->index_size);
 137
 138                 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
 139                 out = T.gpu;
 140         }
 141
 142         if (needs_indices) {
 143                 /* Fallback */
 144                 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
 145
 146                 if (!info->has_user_indices)
 147                         panfrost_minmax_cache_add(rsrc->index_cache,
 148                                                   info->start, info->count,
 149                                                   *min_index, *max_index);
 150         }
 151
 152         return out;
 153 }
 154
 155 static unsigned
 156 translate_tex_wrap(enum pipe_tex_wrap w)
 157 {
 158         switch (w) {
 159         case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
 160         case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
 161         case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
 162         case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
 163         case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
 164         case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
 165         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
 166         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
 167         default: unreachable("Invalid wrap");
 168         }
 169 }
 170
 171 /* The hardware compares in the wrong order order, so we have to flip before
 172  * encoding. Yes, really. */
 173
 174 static enum mali_func
 175 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
 176 {
 177         if (!cso->compare_mode)
 178                 return MALI_FUNC_NEVER;
 179
 180         enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
 181         return panfrost_flip_compare_func(f);
 182 }
 183
 184 static enum mali_mipmap_mode
 185 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
 186 {
 187         switch (f) {
 188         case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
 189         case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
 190         case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
 191         default: unreachable("Invalid");
 192         }
 193 }
 194
 195 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
 196                                 struct mali_midgard_sampler_packed *hw)
 197 {
 198         pan_pack(hw, MIDGARD_SAMPLER, cfg) {
 199                 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
 200                 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
 201                 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
 202                         MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
 203                 cfg.normalized_coordinates = cso->normalized_coords;
 204
 205                 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
 206
 207                 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
 208
 209                 /* If necessary, we disable mipmapping in the sampler descriptor by
 210                  * clamping the LOD as tight as possible (from 0 to epsilon,
 211                  * essentially -- remember these are fixed point numbers, so
 212                  * epsilon=1/256) */
 213
 214                 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
 215                         cfg.minimum_lod + 1 :
 216                         FIXED_16(cso->max_lod, false);
 217
 218                 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
 219                 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
 220                 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
 221
 222                 cfg.compare_function = panfrost_sampler_compare_func(cso);
 223                 cfg.seamless_cube_map = cso->seamless_cube_map;
 224
 225                 cfg.border_color_r = cso->border_color.f[0];
 226                 cfg.border_color_g = cso->border_color.f[1];
 227                 cfg.border_color_b = cso->border_color.f[2];
 228                 cfg.border_color_a = cso->border_color.f[3];
 229         }
 230 }
 231
 232 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
 233                                         struct mali_bifrost_sampler_packed *hw)
 234 {
 235         pan_pack(hw, BIFROST_SAMPLER, cfg) {
 236                 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
 237                 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
 238                 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
 239                 cfg.normalized_coordinates = cso->normalized_coords;
 240
 241                 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
 242                 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
 243                 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
 244
 245                 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
 246                 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
 247                 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
 248
 249                 cfg.compare_function = panfrost_sampler_compare_func(cso);
 250                 cfg.seamless_cube_map = cso->seamless_cube_map;
 251         }
 252 }
 253
 254 static bool
 255 panfrost_fs_required(
 256                 struct panfrost_shader_state *fs,
 257                 struct panfrost_blend_final *blend,
 258                 unsigned rt_count)
 259 {
 260         /* If we generally have side effects */
 261         if (fs->fs_sidefx)
 262                 return true;
 263
 264         /* If colour is written we need to execute */
 265         for (unsigned i = 0; i < rt_count; ++i) {
 266                 if (!blend[i].no_colour)
 267                         return true;
 268         }
 269
 270         /* If depth is written and not implied we need to execute.
 271          * TODO: Predicate on Z/S writes being enabled */
 272         return (fs->writes_depth || fs->writes_stencil);
 273 }
 274
 275 static void
 276 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
 277                 struct panfrost_blend_final *blend)
 278 {
 279         const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
 280         struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
 281         unsigned rt_count = batch->key.nr_cbufs;
 282
 283         struct bifrost_blend_rt *brts = rts;
 284
 285         /* Disable blending for depth-only */
 286
 287         if (rt_count == 0) {
 288                 if (dev->quirks & IS_BIFROST) {
 289                         memset(brts, 0, sizeof(*brts));
 290                         brts[0].unk2 = 0x3;
 291                 } else {
 292                         pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
 293                                 cfg.equation = 0xf0122122; /* Replace */
 294                         }
 295                 }
 296         }
 297
 298         for (unsigned i = 0; i < rt_count; ++i) {
 299                 struct mali_blend_flags_packed flags = {0};
 300
 301                 pan_pack(&flags, BLEND_FLAGS, cfg) {
 302                         if (blend[i].no_colour) {
 303                                 cfg.enable = false;
 304                                 break;
 305                         }
 306
 307                         batch->draws |= (PIPE_CLEAR_COLOR0 << i);
 308
 309                         cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
 310                         cfg.load_destination = blend[i].load_dest;
 311                         cfg.dither_disable = !batch->ctx->blend->base.dither;
 312
 313                         if (!(dev->quirks & IS_BIFROST))
 314                                 cfg.midgard_blend_shader = blend[i].is_shader;
 315                 }
 316
 317                 if (dev->quirks & IS_BIFROST) {
 318                         memset(brts + i, 0, sizeof(brts[i]));
 319                         brts[i].flags = flags.opaque[0];
 320
 321                         if (blend[i].is_shader) {
 322                                 /* The blend shader's address needs to be at
 323                                  * the same top 32 bit as the fragment shader.
 324                                  * TODO: Ensure that's always the case.
 325                                  */
 326                                 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
 327                                        (fs->bo->gpu & (0xffffffffull << 32)));
 328                                 brts[i].shader = blend[i].shader.gpu;
 329                                 brts[i].unk2 = 0x0;
 330                         } else {
 331                                 enum pipe_format format = batch->key.cbufs[i]->format;
 332                                 const struct util_format_description *format_desc;
 333                                 format_desc = util_format_description(format);
 334
 335                                 brts[i].equation = blend[i].equation.equation;
 336
 337                                 /* TODO: this is a bit more complicated */
 338                                 brts[i].constant = blend[i].equation.constant;
 339
 340                                 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
 341
 342                                 /* 0x19 disables blending and forces REPLACE
 343                                  * mode (equivalent to rgb_mode = alpha_mode =
 344                                  * x122, colour mask = 0xF). 0x1a allows
 345                                  * blending. */
 346                                 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
 347
 348                                 brts[i].shader_type = fs->blend_types[i];
 349                         }
 350                 } else {
 351                         pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
 352                                 cfg.flags = flags;
 353
 354                                 if (blend[i].is_shader) {
 355                                         cfg.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
 356                                 } else {
 357                                         cfg.equation = blend[i].equation.equation.opaque[0];
 358                                         cfg.constant = blend[i].equation.constant;
 359                                 }
 360                         }
 361
 362                         rts += MALI_MIDGARD_BLEND_LENGTH;
 363                 }
 364         }
 365 }
 366
 367 static void
 368 panfrost_emit_frag_shader(struct panfrost_context *ctx,
 369                                struct mali_state_packed *fragmeta,
 370                                struct panfrost_blend_final *blend)
 371 {
 372         const struct panfrost_device *dev = pan_device(ctx->base.screen);
 373         struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
 374         struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
 375         const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
 376         unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
 377         bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
 378
 379         /* Built up here */
 380         struct mali_shader_packed shader = fs->shader;
 381         struct mali_preload_packed preload = fs->preload;
 382         uint32_t properties;
 383         struct mali_multisample_misc_packed multisample_misc;
 384         struct mali_stencil_mask_misc_packed stencil_mask_misc;
 385         union midgard_blend sfbd_blend = { 0 };
 386
 387         if (!panfrost_fs_required(fs, blend, rt_count)) {
 388                 if (dev->quirks & IS_BIFROST) {
 389                         pan_pack(&shader, SHADER, cfg) {}
 390
 391                         pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
 392                                 cfg.unknown = 0x950020; /* XXX */
 393                                 cfg.early_z_enable = true;
 394                         }
 395
 396                         preload.opaque[0] = 0;
 397                 } else {
 398                         pan_pack(&shader, SHADER, cfg) {
 399                                 cfg.shader = 0x1;
 400                         }
 401
 402                         pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
 403                                 cfg.work_register_count = 1;
 404                                 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
 405                                 cfg.early_z_enable = true;
 406                         }
 407                 }
 408         } else if (dev->quirks & IS_BIFROST) {
 409                 bool no_blend = true;
 410
 411                 for (unsigned i = 0; i < rt_count; ++i)
 412                         no_blend &= (!blend[i].load_dest | blend[i].no_colour);
 413
 414                 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
 415                         cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
 416                 }
 417
 418                 /* Combine with prepacked properties */
 419                 properties |= fs->properties.opaque[0];
 420         } else {
 421                 /* Reasons to disable early-Z from a shader perspective */
 422                 bool late_z = fs->can_discard || fs->writes_global ||
 423                         fs->writes_depth || fs->writes_stencil;
 424
 425                 /* If either depth or stencil is enabled, discard matters */
 426                 bool zs_enabled =
 427                         (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
 428                         zsa->base.stencil[0].enabled;
 429
 430                 bool has_blend_shader = false;
 431
 432                 for (unsigned c = 0; c < rt_count; ++c)
 433                         has_blend_shader |= blend[c].is_shader;
 434
 435                 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
 436                         /* TODO: Reduce this limit? */
 437                         if (has_blend_shader)
 438                                 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
 439                         else
 440                                 cfg.work_register_count = fs->work_reg_count;
 441
 442                         cfg.early_z_enable = !(late_z || alpha_to_coverage);
 443                         cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
 444                         cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
 445                 }
 446
 447                 properties |= fs->properties.opaque[0];
 448         }
 449
 450         pan_pack(&multisample_misc, MULTISAMPLE_MISC, cfg) {
 451                 bool msaa = rast->multisample;
 452                 cfg.multisample_enable = msaa;
 453                 cfg.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF;
 454
 455                 /* EXT_shader_framebuffer_fetch requires per-sample */
 456                 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
 457                 cfg.evaluate_per_sample = msaa && per_sample;
 458
 459                 if (dev->quirks & MIDGARD_SFBD) {
 460                         cfg.sfbd_load_destination = blend[0].load_dest;
 461                         cfg.sfbd_blend_shader = blend[0].is_shader;
 462                 }
 463
 464                 cfg.depth_function = zsa->base.depth.enabled ?
 465                         panfrost_translate_compare_func(zsa->base.depth.func) :
 466                         MALI_FUNC_ALWAYS;
 467
 468                 cfg.depth_write_mask = zsa->base.depth.writemask;
 469                 cfg.near_discard = rast->depth_clip_near;
 470                 cfg.far_discard = rast->depth_clip_far;
 471                 cfg.unknown_2 = true;
 472         }
 473
 474         pan_pack(&stencil_mask_misc, STENCIL_MASK_MISC, cfg) {
 475                 cfg.stencil_mask_front = zsa->stencil_mask_front;
 476                 cfg.stencil_mask_back = zsa->stencil_mask_back;
 477                 cfg.stencil_enable = zsa->base.stencil[0].enabled;
 478                 cfg.alpha_to_coverage = alpha_to_coverage;
 479
 480                 if (dev->quirks & MIDGARD_SFBD) {
 481                         cfg.sfbd_write_enable = !blend[0].no_colour;
 482                         cfg.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
 483                         cfg.sfbd_dither_disable = !ctx->blend->base.dither;
 484                 }
 485
 486                 cfg.unknown_1 = 0x7;
 487                 cfg.depth_range_1 = cfg.depth_range_2 = rast->offset_tri;
 488                 cfg.single_sampled_lines = !rast->multisample;
 489         }
 490
 491         if (dev->quirks & MIDGARD_SFBD) {
 492                 if (blend[0].is_shader) {
 493                         sfbd_blend.shader = blend[0].shader.gpu |
 494                                 blend[0].shader.first_tag;
 495                 } else {
 496                         sfbd_blend.equation = blend[0].equation.equation;
 497                         sfbd_blend.constant = blend[0].equation.constant;
 498                 }
 499         } else if (!(dev->quirks & IS_BIFROST)) {
 500                 /* Bug where MRT-capable hw apparently reads the last blend
 501                  * shader from here instead of the usual location? */
 502
 503                 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
 504                         if (!blend[rt].is_shader)
 505                                 continue;
 506
 507                         sfbd_blend.shader = blend[rt].shader.gpu |
 508                                                  blend[rt].shader.first_tag;
 509                         break;
 510                 }
 511         }
 512
 513         pan_pack(fragmeta, STATE_OPAQUE, cfg) {
 514                 cfg.shader = fs->shader;
 515                 cfg.properties = properties;
 516                 cfg.depth_units = rast->offset_units * 2.0f;
 517                 cfg.depth_factor = rast->offset_scale;
 518                 cfg.multisample_misc = multisample_misc;
 519                 cfg.stencil_mask_misc = stencil_mask_misc;
 520
 521                 cfg.stencil_front = zsa->stencil_front;
 522                 cfg.stencil_back = zsa->stencil_back;
 523
 524                 /* Bottom bits for stencil ref, exactly one word */
 525                 bool back_enab = zsa->base.stencil[1].enabled;
 526                 cfg.stencil_front.opaque[0] |= ctx->stencil_ref.ref_value[0];
 527                 cfg.stencil_back.opaque[0] |= ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
 528
 529                 if (dev->quirks & IS_BIFROST)
 530                         cfg.preload = preload;
 531                 else
 532                         memcpy(&cfg.sfbd_blend, &sfbd_blend, sizeof(sfbd_blend));
 533         }
 534 }
 535
 536 mali_ptr
 537 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
 538 {
 539         struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
 540
 541         panfrost_batch_add_bo(batch, ss->bo,
 542                               PAN_BO_ACCESS_PRIVATE |
 543                               PAN_BO_ACCESS_READ |
 544                               PAN_BO_ACCESS_VERTEX_TILER);
 545
 546         panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
 547                               PAN_BO_ACCESS_PRIVATE |
 548                               PAN_BO_ACCESS_READ |
 549                               PAN_BO_ACCESS_VERTEX_TILER);
 550
 551         return pan_resource(ss->upload.rsrc)->bo->gpu + ss->upload.offset;
 552 }
 553
 554 mali_ptr
 555 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
 556 {
 557         struct panfrost_context *ctx = batch->ctx;
 558         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
 559
 560         /* Add the shader BO to the batch. */
 561         panfrost_batch_add_bo(batch, ss->bo,
 562                               PAN_BO_ACCESS_PRIVATE |
 563                               PAN_BO_ACCESS_READ |
 564                               PAN_BO_ACCESS_FRAGMENT);
 565
 566         struct panfrost_device *dev = pan_device(ctx->base.screen);
 567         unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
 568         struct panfrost_transfer xfer;
 569         unsigned rt_size;
 570
 571         if (dev->quirks & MIDGARD_SFBD)
 572                 rt_size = 0;
 573         else if (dev->quirks & IS_BIFROST)
 574                 rt_size = sizeof(struct bifrost_blend_rt);
 575         else
 576                 rt_size = sizeof(struct midgard_blend_rt);
 577
 578         unsigned desc_size = MALI_STATE_LENGTH + rt_size * rt_count;
 579         xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, MALI_STATE_LENGTH);
 580
 581         struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
 582
 583         for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
 584                 blend[c] = panfrost_get_blend_for_context(ctx, c);
 585
 586         panfrost_emit_frag_shader(ctx, (struct mali_state_packed *) xfer.cpu, blend);
 587
 588         if (!(dev->quirks & MIDGARD_SFBD))
 589                 panfrost_emit_blend(batch, xfer.cpu + MALI_STATE_LENGTH, blend);
 590         else
 591                 batch->draws |= PIPE_CLEAR_COLOR0;
 592
 593         return xfer.gpu;
 594 }
 595
 596 mali_ptr
 597 panfrost_emit_viewport(struct panfrost_batch *batch)
 598 {
 599         struct panfrost_context *ctx = batch->ctx;
 600         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 601         const struct pipe_scissor_state *ss = &ctx->scissor;
 602         const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
 603         const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
 604
 605         /* Derive min/max from translate/scale. Note since |x| >= 0 by
 606          * definition, we have that -|x| <= |x| hence translate - |scale| <=
 607          * translate + |scale|, so the ordering is correct here. */
 608         float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
 609         float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
 610         float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
 611         float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
 612         float minz = (vp->translate[2] - fabsf(vp->scale[2]));
 613         float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
 614
 615         /* Scissor to the intersection of viewport and to the scissor, clamped
 616          * to the framebuffer */
 617
 618         unsigned minx = MIN2(fb->width, vp_minx);
 619         unsigned maxx = MIN2(fb->width, vp_maxx);
 620         unsigned miny = MIN2(fb->height, vp_miny);
 621         unsigned maxy = MIN2(fb->height, vp_maxy);
 622
 623         if (ss && rast->scissor) {
 624                 minx = MAX2(ss->minx, minx);
 625                 miny = MAX2(ss->miny, miny);
 626                 maxx = MIN2(ss->maxx, maxx);
 627                 maxy = MIN2(ss->maxy, maxy);
 628         }
 629
 630         struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
 631
 632         pan_pack(T.cpu, VIEWPORT, cfg) {
 633                 cfg.scissor_minimum_x = minx;
 634                 cfg.scissor_minimum_y = miny;
 635                 cfg.scissor_maximum_x = maxx - 1;
 636                 cfg.scissor_maximum_y = maxy - 1;
 637
 638                 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
 639                 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
 640         }
 641
 642         panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
 643         return T.gpu;
 644 }
 645
 646 static mali_ptr
 647 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
 648                                  enum pipe_shader_type st,
 649                                  struct panfrost_constant_buffer *buf,
 650                                  unsigned index)
 651 {
 652         struct pipe_constant_buffer *cb = &buf->cb[index];
 653         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
 654
 655         if (rsrc) {
 656                 panfrost_batch_add_bo(batch, rsrc->bo,
 657                                       PAN_BO_ACCESS_SHARED |
 658                                       PAN_BO_ACCESS_READ |
 659                                       panfrost_bo_access_for_stage(st));
 660
 661                 /* Alignment gauranteed by
 662                  * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
 663                 return rsrc->bo->gpu + cb->buffer_offset;
 664         } else if (cb->user_buffer) {
 665                 return panfrost_pool_upload_aligned(&batch->pool,
 666                                                  cb->user_buffer +
 667                                                  cb->buffer_offset,
 668                                                  cb->buffer_size, 16);
 669         } else {
 670                 unreachable("No constant buffer");
 671         }
 672 }
 673
/* One system-value uniform slot. The same storage can be viewed as four
 * 32-bit lanes (float, signed or unsigned) or as two 64-bit lanes, so that
 * each upload helper can write whichever representation it needs. */
struct sysval_uniform {
        union {
                float f[4];     /* four float lanes */
                int32_t i[4];   /* four signed 32-bit lanes */
                uint32_t u[4];  /* four unsigned 32-bit lanes */
                uint64_t du[2]; /* two unsigned 64-bit lanes */
        };
};
 682
 683 static void
 684 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
 685                                       struct sysval_uniform *uniform)
 686 {
 687         struct panfrost_context *ctx = batch->ctx;
 688         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 689
 690         uniform->f[0] = vp->scale[0];
 691         uniform->f[1] = vp->scale[1];
 692         uniform->f[2] = vp->scale[2];
 693 }
 694
 695 static void
 696 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
 697                                        struct sysval_uniform *uniform)
 698 {
 699         struct panfrost_context *ctx = batch->ctx;
 700         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
 701
 702         uniform->f[0] = vp->translate[0];
 703         uniform->f[1] = vp->translate[1];
 704         uniform->f[2] = vp->translate[2];
 705 }
 706
 707 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
 708                                        enum pipe_shader_type st,
 709                                        unsigned int sysvalid,
 710                                        struct sysval_uniform *uniform)
 711 {
 712         struct panfrost_context *ctx = batch->ctx;
 713         unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
 714         unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
 715         bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
 716         struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
 717
 718         assert(dim);
 719         uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
 720
 721         if (dim > 1)
 722                 uniform->i[1] = u_minify(tex->texture->height0,
 723                                          tex->u.tex.first_level);
 724
 725         if (dim > 2)
 726                 uniform->i[2] = u_minify(tex->texture->depth0,
 727                                          tex->u.tex.first_level);
 728
 729         if (is_array)
 730                 uniform->i[dim] = tex->texture->array_size;
 731 }
 732
 733 static void
 734 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
 735                             enum pipe_shader_type st,
 736                             unsigned ssbo_id,
 737                             struct sysval_uniform *uniform)
 738 {
 739         struct panfrost_context *ctx = batch->ctx;
 740
 741         assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
 742         struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
 743
 744         /* Compute address */
 745         struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
 746
 747         panfrost_batch_add_bo(batch, bo,
 748                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
 749                               panfrost_bo_access_for_stage(st));
 750
 751         /* Upload address and size as sysval */
 752         uniform->du[0] = bo->gpu + sb.buffer_offset;
 753         uniform->u[2] = sb.buffer_size;
 754 }
 755
 756 static void
 757 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
 758                                enum pipe_shader_type st,
 759                                unsigned samp_idx,
 760                                struct sysval_uniform *uniform)
 761 {
 762         struct panfrost_context *ctx = batch->ctx;
 763         struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
 764
 765         uniform->f[0] = sampl->min_lod;
 766         uniform->f[1] = sampl->max_lod;
 767         uniform->f[2] = sampl->lod_bias;
 768
 769         /* Even without any errata, Midgard represents "no mipmapping" as
 770          * fixing the LOD with the clamps; keep behaviour consistent. c.f.
 771          * panfrost_create_sampler_state which also explains our choice of
 772          * epsilon value (again to keep behaviour consistent) */
 773
 774         if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
 775                 uniform->f[1] = uniform->f[0] + (1.0/256.0);
 776 }
 777
 778 static void
 779 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
 780                                        struct sysval_uniform *uniform)
 781 {
 782         struct panfrost_context *ctx = batch->ctx;
 783
 784         uniform->u[0] = ctx->compute_grid->grid[0];
 785         uniform->u[1] = ctx->compute_grid->grid[1];
 786         uniform->u[2] = ctx->compute_grid->grid[2];
 787 }
 788
 789 static void
 790 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
 791                         struct panfrost_shader_state *ss,
 792                         enum pipe_shader_type st)
 793 {
 794         struct sysval_uniform *uniforms = (void *)buf;
 795
 796         for (unsigned i = 0; i < ss->sysval_count; ++i) {
 797                 int sysval = ss->sysval[i];
 798
 799                 switch (PAN_SYSVAL_TYPE(sysval)) {
 800                 case PAN_SYSVAL_VIEWPORT_SCALE:
 801                         panfrost_upload_viewport_scale_sysval(batch,
 802                                                               &uniforms[i]);
 803                         break;
 804                 case PAN_SYSVAL_VIEWPORT_OFFSET:
 805                         panfrost_upload_viewport_offset_sysval(batch,
 806                                                                &uniforms[i]);
 807                         break;
 808                 case PAN_SYSVAL_TEXTURE_SIZE:
 809                         panfrost_upload_txs_sysval(batch, st,
 810                                                    PAN_SYSVAL_ID(sysval),
 811                                                    &uniforms[i]);
 812                         break;
 813                 case PAN_SYSVAL_SSBO:
 814                         panfrost_upload_ssbo_sysval(batch, st,
 815                                                     PAN_SYSVAL_ID(sysval),
 816                                                     &uniforms[i]);
 817                         break;
 818                 case PAN_SYSVAL_NUM_WORK_GROUPS:
 819                         panfrost_upload_num_work_groups_sysval(batch,
 820                                                                &uniforms[i]);
 821                         break;
 822                 case PAN_SYSVAL_SAMPLER:
 823                         panfrost_upload_sampler_sysval(batch, st,
 824                                                        PAN_SYSVAL_ID(sysval),
 825                                                        &uniforms[i]);
 826                         break;
 827                 default:
 828                         assert(0);
 829                 }
 830         }
 831 }
 832
 833 static const void *
 834 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
 835                                  unsigned index)
 836 {
 837         struct pipe_constant_buffer *cb = &buf->cb[index];
 838         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
 839
 840         if (rsrc)
 841                 return rsrc->bo->cpu;
 842         else if (cb->user_buffer)
 843                 return cb->user_buffer;
 844         else
 845                 unreachable("No constant buffer");
 846 }
 847
 848 mali_ptr
 849 panfrost_emit_const_buf(struct panfrost_batch *batch,
 850                         enum pipe_shader_type stage,
 851                         mali_ptr *push_constants)
 852 {
 853         struct panfrost_context *ctx = batch->ctx;
 854         struct panfrost_shader_variants *all = ctx->shader[stage];
 855
 856         if (!all)
 857                 return 0;
 858
 859         struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
 860
 861         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
 862
 863         /* Uniforms are implicitly UBO #0 */
 864         bool has_uniforms = buf->enabled_mask & (1 << 0);
 865
 866         /* Allocate room for the sysval and the uniforms */
 867         size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
 868         size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
 869         size_t size = sys_size + uniform_size;
 870         struct panfrost_transfer transfer =
 871                 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
 872
 873         /* Upload sysvals requested by the shader */
 874         panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
 875
 876         /* Upload uniforms */
 877         if (has_uniforms && uniform_size) {
 878                 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
 879                 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
 880         }
 881
 882         /* Next up, attach UBOs. UBO #0 is the uniforms we just
 883          * uploaded, so it's always included. The count is the highest UBO
 884          * addressable -- gaps are included. */
 885
 886         unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
 887
 888         size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
 889         struct panfrost_transfer ubos =
 890                 panfrost_pool_alloc_aligned(&batch->pool, sz,
 891                                 MALI_UNIFORM_BUFFER_LENGTH);
 892
 893         uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
 894
 895         /* Upload uniforms as a UBO */
 896
 897         if (size) {
 898                 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
 899                         cfg.entries = DIV_ROUND_UP(size, 16);
 900                         cfg.pointer = transfer.gpu;
 901                 }
 902         } else {
 903                 *ubo_ptr = 0;
 904         }
 905
 906         /* The rest are honest-to-goodness UBOs */
 907
 908         for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
 909                 size_t usz = buf->cb[ubo].buffer_size;
 910                 bool enabled = buf->enabled_mask & (1 << ubo);
 911                 bool empty = usz == 0;
 912
 913                 if (!enabled || empty) {
 914                         ubo_ptr[ubo] = 0;
 915                         continue;
 916                 }
 917
 918                 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
 919                         cfg.entries = DIV_ROUND_UP(usz, 16);
 920                         cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
 921                                         stage, buf, ubo);
 922                 }
 923         }
 924
 925         *push_constants = transfer.gpu;
 926
 927         buf->dirty_mask = 0;
 928         return ubos.gpu;
 929 }
 930
 931 mali_ptr
 932 panfrost_emit_shared_memory(struct panfrost_batch *batch,
 933                             const struct pipe_grid_info *info)
 934 {
 935         struct panfrost_context *ctx = batch->ctx;
 936         struct panfrost_device *dev = pan_device(ctx->base.screen);
 937         struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
 938         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
 939         unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
 940                                                            128));
 941
 942         unsigned log2_instances =
 943                 util_logbase2_ceil(info->grid[0]) +
 944                 util_logbase2_ceil(info->grid[1]) +
 945                 util_logbase2_ceil(info->grid[2]);
 946
 947         unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
 948         struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
 949                                                                   shared_size,
 950                                                                   1);
 951
 952         struct mali_shared_memory shared = {
 953                 .shared_memory = bo->gpu,
 954                 .shared_workgroup_count = log2_instances,
 955                 .shared_shift = util_logbase2(single_size) + 1
 956         };
 957
 958         return panfrost_pool_upload_aligned(&batch->pool, &shared,
 959                         sizeof(shared), 64);
 960 }
 961
 962 static mali_ptr
 963 panfrost_get_tex_desc(struct panfrost_batch *batch,
 964                       enum pipe_shader_type st,
 965                       struct panfrost_sampler_view *view)
 966 {
 967         if (!view)
 968                 return (mali_ptr) 0;
 969
 970         struct pipe_sampler_view *pview = &view->base;
 971         struct panfrost_resource *rsrc = pan_resource(pview->texture);
 972
 973         /* Add the BO to the job so it's retained until the job is done. */
 974
 975         panfrost_batch_add_bo(batch, rsrc->bo,
 976                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
 977                               panfrost_bo_access_for_stage(st));
 978
 979         panfrost_batch_add_bo(batch, view->bo,
 980                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
 981                               panfrost_bo_access_for_stage(st));
 982
 983         return view->bo->gpu;
 984 }
 985
 986 static void
 987 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
 988                              struct pipe_context *pctx)
 989 {
 990         struct panfrost_resource *rsrc = pan_resource(view->base.texture);
 991         if (view->texture_bo != rsrc->bo->gpu ||
 992             view->modifier != rsrc->modifier) {
 993                 panfrost_bo_unreference(view->bo);
 994                 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
 995         }
 996 }
 997
 998 mali_ptr
 999 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1000                                   enum pipe_shader_type stage)
1001 {
1002         struct panfrost_context *ctx = batch->ctx;
1003         struct panfrost_device *device = pan_device(ctx->base.screen);
1004
1005         if (!ctx->sampler_view_count[stage])
1006                 return 0;
1007
1008         if (device->quirks & IS_BIFROST) {
1009                 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1010                                 MALI_BIFROST_TEXTURE_LENGTH *
1011                                 ctx->sampler_view_count[stage],
1012                                 MALI_BIFROST_TEXTURE_LENGTH);
1013
1014                 struct mali_bifrost_texture_packed *out =
1015                         (struct mali_bifrost_texture_packed *) T.cpu;
1016
1017                 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1018                         struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1019                         struct pipe_sampler_view *pview = &view->base;
1020                         struct panfrost_resource *rsrc = pan_resource(pview->texture);
1021
1022                         panfrost_update_sampler_view(view, &ctx->base);
1023                         out[i] = view->bifrost_descriptor;
1024
1025                         /* Add the BOs to the job so they are retained until the job is done. */
1026
1027                         panfrost_batch_add_bo(batch, rsrc->bo,
1028                                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1029                                               panfrost_bo_access_for_stage(stage));
1030
1031                         panfrost_batch_add_bo(batch, view->bo,
1032                                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1033                                               panfrost_bo_access_for_stage(stage));
1034                 }
1035
1036                 return T.gpu;
1037         } else {
1038                 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1039
1040                 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1041                         struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1042
1043                         panfrost_update_sampler_view(view, &ctx->base);
1044
1045                         trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1046                 }
1047
1048                 return panfrost_pool_upload_aligned(&batch->pool, trampolines,
1049                                 sizeof(uint64_t) *
1050                                 ctx->sampler_view_count[stage],
1051                                 sizeof(uint64_t));
1052         }
1053 }
1054
1055 mali_ptr
1056 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1057                                   enum pipe_shader_type stage)
1058 {
1059         struct panfrost_context *ctx = batch->ctx;
1060
1061         if (!ctx->sampler_count[stage])
1062                 return 0;
1063
1064         size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1065         assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1066
1067         size_t sz = desc_size * ctx->sampler_count[stage];
1068         struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1069         struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1070
1071         for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1072                 out[i] = ctx->samplers[stage][i]->hw;
1073
1074         return T.gpu;
1075 }
1076
/* Emits the vertex attribute state for the current draw.
 *
 * Two tables are allocated from the batch pool: an array of attribute buffer
 * records (one or two records per vertex element, plus the special
 * gl_VertexID/gl_InstanceID buffers when used) and an array of attribute
 * records, each referencing one of those buffers.
 *
 * The GPU address of the attribute buffer table is written to *buffers; the
 * GPU address of the attribute table is returned. */

mali_ptr
panfrost_emit_vertex_data(struct panfrost_batch *batch,
                          mali_ptr *buffers)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_vertex_state *so = ctx->vertex;
        struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);

        /* Worst case: everything is NPOT, which is only possible if instancing
         * is enabled. Otherwise a single record is guaranteed */
        struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
                        MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
                        (ctx->instance_count > 1 ? 2 : 1),
                        MALI_ATTRIBUTE_BUFFER_LENGTH * 2);

        struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
                        MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
                        MALI_ATTRIBUTE_LENGTH);

        struct mali_attribute_buffer_packed *bufs =
                (struct mali_attribute_buffer_packed *) S.cpu;

        struct mali_attribute_packed *out =
                (struct mali_attribute_packed *) T.cpu;

        /* attrib_to_buffer[i] is the buffer record index used by element i;
         * k is the next free buffer record */
        unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
        unsigned k = 0;

        for (unsigned i = 0; i < so->num_elements; ++i) {
                /* We map buffers 1:1 with the attributes, which
                 * means duplicating some vertex buffers (who cares? aside from
                 * maybe some caching implications but I somehow doubt that
                 * matters) */

                struct pipe_vertex_element *elem = &so->pipe[i];
                unsigned vbi = elem->vertex_buffer_index;
                attrib_to_buffer[i] = k;

                /* Elements whose vertex buffer is not bound emit no record
                 * and do not advance k */
                if (!(ctx->vb_mask & (1 << vbi)))
                        continue;

                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
                struct panfrost_resource *rsrc;

                rsrc = pan_resource(buf->buffer.resource);
                if (!rsrc)
                        continue;

                /* Add a dependency of the batch on the vertex buffer */
                panfrost_batch_add_bo(batch, rsrc->bo,
                                      PAN_BO_ACCESS_SHARED |
                                      PAN_BO_ACCESS_READ |
                                      PAN_BO_ACCESS_VERTEX_TILER);

                /* Mask off lower bits, see offset fixup below */
                mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
                mali_ptr addr = raw_addr & ~63;

                /* Since we advanced the base pointer, we shrink the buffer
                 * size, but add the offset we subtracted */
                unsigned size = rsrc->base.width0 + (raw_addr - addr)
                        - buf->buffer_offset;

                /* When there is a divisor, the hardware-level divisor is
                 * the product of the instance divisor and the padded count */
                unsigned divisor = elem->instance_divisor;
                unsigned hw_divisor = ctx->padded_count * divisor;
                unsigned stride = buf->stride;

                /* If there's a divisor(=1) but no instancing, we want every
                 * attribute to be the same */

                if (divisor && ctx->instance_count == 1)
                        stride = 0;

                if (!divisor || ctx->instance_count <= 1) {
                        /* Per-vertex data (or no instancing at all): a plain
                         * record, switched to modulus addressing by the padded
                         * count when the draw is instanced */
                        pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
                                if (ctx->instance_count > 1) {
                                        cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
                                        cfg.divisor = ctx->padded_count;
                                }

                                cfg.pointer = addr;
                                cfg.stride = stride;
                                cfg.size = size;
                        }
                } else if (util_is_power_of_two_or_zero(hw_divisor)) {
                        /* Power-of-two divisor: encoded as a shift amount */
                        pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
                                cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
                                cfg.pointer = addr;
                                cfg.stride = stride;
                                cfg.size = size;
                                cfg.divisor_r = __builtin_ctz(hw_divisor);
                        }

                } else {
                        /* Non-power-of-two divisor: needs a magic-number
                         * division, which takes a second (continuation)
                         * record right after the first */
                        unsigned shift = 0, extra_flags = 0;

                        unsigned magic_divisor =
                                panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);

                        pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
                                cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
                                cfg.pointer = addr;
                                cfg.stride = stride;
                                cfg.size = size;

                                cfg.divisor_r = shift;
                                cfg.divisor_e = extra_flags;
                        }

                        pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
                                cfg.divisor_numerator = magic_divisor;
                                cfg.divisor = divisor;
                        }

                        /* Account for the continuation record */
                        ++k;
                }

                ++k;
        }

        /* Add special gl_VertexID/gl_InstanceID buffers */

        if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
                panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);

                pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
                        cfg.buffer_index = k++;
                        cfg.format = so->formats[PAN_VERTEX_ID];
                }

                panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);

                pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
                        cfg.buffer_index = k++;
                        cfg.format = so->formats[PAN_INSTANCE_ID];
                }
        }

        /* Attribute addresses require 64-byte alignment, so let:
         *
         *      base' = base & ~63 = base - (base & 63)
         *      offset' = offset + (base & 63)
         *
         * Since base' + offset' = base + offset, these are equivalent
         * addressing modes and now base is 64 aligned.
         */

        for (unsigned i = 0; i < so->num_elements; ++i) {
                unsigned vbi = so->pipe[i].vertex_buffer_index;
                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];

                /* Adjust by the masked off bits of the offset. src_offset is
                 * taken from the CPU-side vertex element state (so->pipe),
                 * not read back from the descriptor being written */

                unsigned src_offset = so->pipe[i].src_offset;

                /* BOs aligned to 4k so guaranteed aligned to 64 */
                src_offset += (buf->buffer_offset & 63);

                /* Also, somewhat obscurely per-instance data needs to be
                 * offset in response to a delayed start in an indexed draw */

                if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
                        src_offset -= buf->stride * ctx->offset_start;

                pan_pack(out + i, ATTRIBUTE, cfg) {
                        cfg.buffer_index = attrib_to_buffer[i];
                        cfg.format = so->formats[i];
                        cfg.offset = src_offset;
                }
        }

        *buffers = S.gpu;
        return T.gpu;
}
1255
1256 static mali_ptr
1257 panfrost_emit_varyings(struct panfrost_batch *batch,
1258                 struct mali_attribute_buffer_packed *slot,
1259                 unsigned stride, unsigned count)
1260 {
1261         unsigned size = stride * count;
1262         mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1263
1264         pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1265                 cfg.stride = stride;
1266                 cfg.size = size;
1267                 cfg.pointer = ptr;
1268         }
1269
1270         return ptr;
1271 }
1272
1273 static unsigned
1274 panfrost_streamout_offset(unsigned stride,
1275                         struct pipe_stream_output_target *target)
1276 {
1277         return (target->buffer_offset + (pan_so_target(target)->offset * stride * 4)) & 63;
1278 }
1279
1280 static void
1281 panfrost_emit_streamout(struct panfrost_batch *batch,
1282                         struct mali_attribute_buffer_packed *slot,
1283                         unsigned stride_words, unsigned count,
1284                         struct pipe_stream_output_target *target)
1285 {
1286         unsigned stride = stride_words * 4;
1287         unsigned max_size = target->buffer_size;
1288         unsigned expected_size = stride * count;
1289
1290         /* Grab the BO and bind it to the batch */
1291         struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1292
1293         /* Varyings are WRITE from the perspective of the VERTEX but READ from
1294          * the perspective of the TILER and FRAGMENT.
1295          */
1296         panfrost_batch_add_bo(batch, bo,
1297                               PAN_BO_ACCESS_SHARED |
1298                               PAN_BO_ACCESS_RW |
1299                               PAN_BO_ACCESS_VERTEX_TILER |
1300                               PAN_BO_ACCESS_FRAGMENT);
1301
1302         /* We will have an offset applied to get alignment */
1303         mali_ptr addr = bo->gpu + target->buffer_offset + (pan_so_target(target)->offset * stride);
1304
1305         pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1306                 cfg.pointer = (addr & ~63);
1307                 cfg.stride = stride;
1308                 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1309         }
1310 }
1311
/* Helpers for manipulating stream out information so we can pack varyings
 * accordingly. Look up the stream output record for a given captured varying */
1314
1315 static struct pipe_stream_output *
1316 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1317 {
1318         for (unsigned i = 0; i < info->num_outputs; ++i) {
1319                 if (info->output[i].register_index == loc)
1320                         return &info->output[i];
1321         }
1322
1323         unreachable("Varying not captured");
1324 }
1325
1326 static unsigned
1327 pan_varying_size(enum mali_format fmt)
1328 {
1329         unsigned type = MALI_EXTRACT_TYPE(fmt);
1330         unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1331         unsigned bits = MALI_EXTRACT_BITS(fmt);
1332         unsigned bpc = 0;
1333
1334         if (bits == MALI_CHANNEL_FLOAT) {
1335                 /* No doubles */
1336                 bool fp16 = (type == MALI_FORMAT_SINT);
1337                 assert(fp16 || (type == MALI_FORMAT_UNORM));
1338
1339                 bpc = fp16 ? 2 : 4;
1340         } else {
1341                 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1342
1343                 /* See the enums */
1344                 bits = 1 << bits;
1345                 assert(bits >= 8);
1346                 bpc = bits / 8;
1347         }
1348
1349         return bpc * chan;
1350 }
1351
/* Indices for named (non-XFB) varyings that are present. These are packed
 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
 * PAN_VARY_*). This has the nice property that you can look up the buffer
 * index of a given special field given a shift S by:
 *
 *      idx = popcount(P & ((1 << S) - 1))
 *
 * That is... look at all of the varyings that come earlier and count them;
 * since indices start at zero, that count is the new index. Likewise, the
 * total number of special buffers required is simply popcount(P)
 */
1363
enum pan_special_varying {
        /* Always marked present by pan_varying_present */
        PAN_VARY_GENERAL = 0,
        PAN_VARY_POSITION = 1,

        /* Present when the vertex shader writes the point size */
        PAN_VARY_PSIZ = 2,

        /* Present when the fragment shader reads the point coordinate, or
         * one of its varyings is replaced by it (point sprites) */
        PAN_VARY_PNTCOORD = 3,

        /* Present when the fragment shader reads the facing */
        PAN_VARY_FACE = 4,

        /* Present when the fragment shader reads the fragment coordinate,
         * except on Bifrost */
        PAN_VARY_FRAGCOORD = 5,

        /* Keep last */
        PAN_VARY_MAX,
};
1375
/* Given a varying, figure out which index it corresponds to */
1377
1378 static inline unsigned
1379 pan_varying_index(unsigned present, enum pan_special_varying v)
1380 {
1381         unsigned mask = (1 << v) - 1;
1382         return util_bitcount(present & mask);
1383 }
1384
1385 /* Get the base offset for XFB buffers, which by convention come after
1386  * everything else. Wrapper function for semantic reasons; by construction this
1387  * is just popcount. */
1388
static inline unsigned
pan_xfb_base(unsigned present)
{
        /* One buffer per present special varying precedes the XFB buffers */
        unsigned nr_special = util_bitcount(present);

        return nr_special;
}
1394
1395 /* Computes the present mask for varyings so we can start emitting varying records */
1396
1397 static inline unsigned
1398 pan_varying_present(
1399         struct panfrost_shader_state *vs,
1400         struct panfrost_shader_state *fs,
1401         unsigned quirks,
1402         uint16_t point_coord_mask)
1403 {
1404         /* At the moment we always emit general and position buffers. Not
1405          * strictly necessary but usually harmless */
1406
1407         unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1408
1409         /* Enable special buffers by the shader info */
1410
1411         if (vs->writes_point_size)
1412                 present |= (1 << PAN_VARY_PSIZ);
1413
1414         if (fs->reads_point_coord)
1415                 present |= (1 << PAN_VARY_PNTCOORD);
1416
1417         if (fs->reads_face)
1418                 present |= (1 << PAN_VARY_FACE);
1419
1420         if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1421                 present |= (1 << PAN_VARY_FRAGCOORD);
1422
1423         /* Also, if we have a point sprite, we need a point coord buffer */
1424
1425         for (unsigned i = 0; i < fs->varying_count; i++)  {
1426                 gl_varying_slot loc = fs->varyings_loc[i];
1427
1428                 if (util_varying_is_point_coord(loc, point_coord_mask))
1429                         present |= (1 << PAN_VARY_PNTCOORD);
1430         }
1431
1432         return present;
1433 }
1434
1435 /* Emitters for varying records */
1436
1437 static void
1438 pan_emit_vary(struct mali_attribute_packed *out,
1439                 unsigned present, enum pan_special_varying buf,
1440                 unsigned quirks, enum mali_format format,
1441                 unsigned offset)
1442 {
1443         unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1444         unsigned swizzle = quirks & HAS_SWIZZLES ?
1445                         panfrost_get_default_swizzle(nr_channels) :
1446                         panfrost_bifrost_swizzle(nr_channels);
1447
1448         pan_pack(out, ATTRIBUTE, cfg) {
1449                 cfg.buffer_index = pan_varying_index(present, buf);
1450                 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1451                 cfg.format = (format << 12) | swizzle;
1452                 cfg.offset = offset;
1453         }
1454 }
1455
1456 /* General varying that is unused */
1457
1458 static void
1459 pan_emit_vary_only(struct mali_attribute_packed *out,
1460                 unsigned present, unsigned quirks)
1461 {
1462         pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1463 }
1464
1465 /* Special records */
1466
1467 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1468         [PAN_VARY_POSITION]     = MALI_VARYING_POS,
1469         [PAN_VARY_PSIZ]         = MALI_R16F,
1470         [PAN_VARY_PNTCOORD]     = MALI_R16F,
1471         [PAN_VARY_FACE]         = MALI_R32I,
1472         [PAN_VARY_FRAGCOORD]    = MALI_RGBA32F
1473 };
1474
1475 static void
1476 pan_emit_vary_special(struct mali_attribute_packed *out,
1477                 unsigned present, enum pan_special_varying buf,
1478                 unsigned quirks)
1479 {
1480         assert(buf < PAN_VARY_MAX);
1481         pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1482 }
1483
1484 static enum mali_format
1485 pan_xfb_format(enum mali_format format, unsigned nr)
1486 {
1487         if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1488                 return MALI_R32F | MALI_NR_CHANNELS(nr);
1489         else
1490                 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1491 }
1492
1493 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1494  * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1495  * value. */
1496
1497 static void
1498 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1499                 unsigned present,
1500                 unsigned max_xfb,
1501                 unsigned *streamout_offsets,
1502                 unsigned quirks,
1503                 enum mali_format format,
1504                 struct pipe_stream_output o)
1505 {
1506         unsigned swizzle = quirks & HAS_SWIZZLES ?
1507                         panfrost_get_default_swizzle(o.num_components) :
1508                         panfrost_bifrost_swizzle(o.num_components);
1509
1510         pan_pack(out, ATTRIBUTE, cfg) {
1511                 /* XFB buffers come after everything else */
1512                 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1513                 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1514
1515                 /* Override number of channels and precision to highp */
1516                 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1517
1518                 /* Apply given offsets together */
1519                 cfg.offset = (o.dst_offset * 4) /* dwords */
1520                         + streamout_offsets[o.output_buffer];
1521         }
1522 }
1523
1524 /* Determine if we should capture a varying for XFB. This requires actually
1525  * having a buffer for it. If we don't capture it, we'll fallback to a general
1526  * varying path (linked or unlinked, possibly discarding the write) */
1527
1528 static bool
1529 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1530                 unsigned loc, unsigned max_xfb)
1531 {
1532         if (!(xfb->so_mask & (1ll << loc)))
1533                 return false;
1534
1535         struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1536         return o->output_buffer < max_xfb;
1537 }
1538
1539 static void
1540 pan_emit_general_varying(struct mali_attribute_packed *out,
1541                 struct panfrost_shader_state *other,
1542                 struct panfrost_shader_state *xfb,
1543                 gl_varying_slot loc,
1544                 enum mali_format format,
1545                 unsigned present,
1546                 unsigned quirks,
1547                 unsigned *gen_offsets,
1548                 enum mali_format *gen_formats,
1549                 unsigned *gen_stride,
1550                 unsigned idx,
1551                 bool should_alloc)
1552 {
1553         /* Check if we're linked */
1554         signed other_idx = -1;
1555
1556         for (unsigned j = 0; j < other->varying_count; ++j) {
1557                 if (other->varyings_loc[j] == loc) {
1558                         other_idx = j;
1559                         break;
1560                 }
1561         }
1562
1563         if (other_idx < 0) {
1564                 pan_emit_vary_only(out, present, quirks);
1565                 return;
1566         }
1567
1568         unsigned offset = gen_offsets[other_idx];
1569
1570         if (should_alloc) {
1571                 /* We're linked, so allocate a space via a watermark allocation */
1572                 enum mali_format alt = other->varyings[other_idx];
1573
1574                 /* Do interpolation at minimum precision */
1575                 unsigned size_main = pan_varying_size(format);
1576                 unsigned size_alt = pan_varying_size(alt);
1577                 unsigned size = MIN2(size_main, size_alt);
1578
1579                 /* If a varying is marked for XFB but not actually captured, we
1580                  * should match the format to the format that would otherwise
1581                  * be used for XFB, since dEQP checks for invariance here. It's
1582                  * unclear if this is required by the spec. */
1583
1584                 if (xfb->so_mask & (1ull << loc)) {
1585                         struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1586                         format = pan_xfb_format(format, o->num_components);
1587                         size = pan_varying_size(format);
1588                 } else if (size == size_alt) {
1589                         format = alt;
1590                 }
1591
1592                 gen_offsets[idx] = *gen_stride;
1593                 gen_formats[other_idx] = format;
1594                 offset = *gen_stride;
1595                 *gen_stride += size;
1596         }
1597
1598         pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1599 }
1600
1601 /* Higher-level wrapper around all of the above, classifying a varying into one
1602  * of the above types */
1603
1604 static void
1605 panfrost_emit_varying(
1606                 struct mali_attribute_packed *out,
1607                 struct panfrost_shader_state *stage,
1608                 struct panfrost_shader_state *other,
1609                 struct panfrost_shader_state *xfb,
1610                 unsigned present,
1611                 uint16_t point_sprite_mask,
1612                 unsigned max_xfb,
1613                 unsigned *streamout_offsets,
1614                 unsigned quirks,
1615                 unsigned *gen_offsets,
1616                 enum mali_format *gen_formats,
1617                 unsigned *gen_stride,
1618                 unsigned idx,
1619                 bool should_alloc,
1620                 bool is_fragment)
1621 {
1622         gl_varying_slot loc = stage->varyings_loc[idx];
1623         enum mali_format format = stage->varyings[idx];
1624
1625         /* Override format to match linkage */
1626         if (!should_alloc && gen_formats[idx])
1627                 format = gen_formats[idx];
1628
1629         if (util_varying_is_point_coord(loc, point_sprite_mask)) {
1630                 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1631         } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1632                 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1633                 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1634         } else if (loc == VARYING_SLOT_POS) {
1635                 if (is_fragment)
1636                         pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1637                 else
1638                         pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1639         } else if (loc == VARYING_SLOT_PSIZ) {
1640                 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1641         } else if (loc == VARYING_SLOT_PNTC) {
1642                 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1643         } else if (loc == VARYING_SLOT_FACE) {
1644                 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1645         } else {
1646                 pan_emit_general_varying(out, other, xfb, loc, format, present,
1647                                 quirks, gen_offsets, gen_formats, gen_stride,
1648                                 idx, should_alloc);
1649         }
1650 }
1651
1652 static void
1653 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1654                 unsigned present,
1655                 enum pan_special_varying v,
1656                 unsigned special)
1657 {
1658         if (present & (1 << v)) {
1659                 unsigned idx = pan_varying_index(present, v);
1660
1661                 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1662                         cfg.special = special;
1663                         cfg.type = 0;
1664                 }
1665         }
1666 }
1667
1668 void
1669 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1670                                  unsigned vertex_count,
1671                                  mali_ptr *vs_attribs,
1672                                  mali_ptr *fs_attribs,
1673                                  mali_ptr *buffers,
1674                                  mali_ptr *position,
1675                                  mali_ptr *psiz)
1676 {
1677         /* Load the shaders */
1678         struct panfrost_context *ctx = batch->ctx;
1679         struct panfrost_device *dev = pan_device(ctx->base.screen);
1680         struct panfrost_shader_state *vs, *fs;
1681         size_t vs_size, fs_size;
1682
1683         /* Allocate the varying descriptor */
1684
1685         vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1686         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1687         vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1688         fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1689
1690         struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1691                         &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1692
1693         struct pipe_stream_output_info *so = &vs->stream_output;
1694         uint16_t point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;
1695         unsigned present = pan_varying_present(vs, fs, dev->quirks, point_coord_mask);
1696
1697         /* Check if this varying is linked by us. This is the case for
1698          * general-purpose, non-captured varyings. If it is, link it. If it's
1699          * not, use the provided stream out information to determine the
1700          * offset, since it was already linked for us. */
1701
1702         unsigned gen_offsets[32];
1703         enum mali_format gen_formats[32];
1704         memset(gen_offsets, 0, sizeof(gen_offsets));
1705         memset(gen_formats, 0, sizeof(gen_formats));
1706
1707         unsigned gen_stride = 0;
1708         assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1709         assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1710
1711         unsigned streamout_offsets[32];
1712
1713         for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1714                 streamout_offsets[i] = panfrost_streamout_offset(
1715                                         so->stride[i],
1716                                         ctx->streamout.targets[i]);
1717         }
1718
1719         struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1720         struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1721
1722         for (unsigned i = 0; i < vs->varying_count; i++) {
1723                 panfrost_emit_varying(ovs + i, vs, fs, vs, present, 0,
1724                                 ctx->streamout.num_targets, streamout_offsets,
1725                                 dev->quirks,
1726                                 gen_offsets, gen_formats, &gen_stride, i, true, false);
1727         }
1728
1729         for (unsigned i = 0; i < fs->varying_count; i++) {
1730                 panfrost_emit_varying(ofs + i, fs, vs, vs, present, point_coord_mask,
1731                                 ctx->streamout.num_targets, streamout_offsets,
1732                                 dev->quirks,
1733                                 gen_offsets, gen_formats, &gen_stride, i, false, true);
1734         }
1735
1736         unsigned xfb_base = pan_xfb_base(present);
1737         struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1738                         MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1739                         MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1740         struct mali_attribute_buffer_packed *varyings =
1741                 (struct mali_attribute_buffer_packed *) T.cpu;
1742
1743         /* Emit the stream out buffers */
1744
1745         unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1746                                                            ctx->vertex_count);
1747
1748         for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1749                 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1750                                         so->stride[i],
1751                                         out_count,
1752                                         ctx->streamout.targets[i]);
1753         }
1754
1755         panfrost_emit_varyings(batch,
1756                         &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1757                         gen_stride, vertex_count);
1758
1759         /* fp32 vec4 gl_Position */
1760         *position = panfrost_emit_varyings(batch,
1761                         &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1762                         sizeof(float) * 4, vertex_count);
1763
1764         if (present & (1 << PAN_VARY_PSIZ)) {
1765                 *psiz = panfrost_emit_varyings(batch,
1766                                 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1767                                 2, vertex_count);
1768         }
1769
1770         pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1771         pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1772         pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
1773
1774         *buffers = T.gpu;
1775         *vs_attribs = trans.gpu;
1776         *fs_attribs = trans.gpu + vs_size;
1777 }
1778
1779 void
1780 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1781                                 struct mali_vertex_tiler_prefix *vertex_prefix,
1782                                 struct mali_draw_packed *vertex_draw,
1783                                 struct mali_vertex_tiler_prefix *tiler_prefix,
1784                                 struct mali_draw_packed *tiler_draw,
1785                                 union midgard_primitive_size *primitive_size)
1786 {
1787         struct panfrost_context *ctx = batch->ctx;
1788         struct panfrost_device *device = pan_device(ctx->base.screen);
1789         bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
1790         struct bifrost_payload_vertex bifrost_vertex = {0,};
1791         struct bifrost_payload_tiler bifrost_tiler = {0,};
1792         struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1793         struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1794         void *vp, *tp;
1795         size_t vp_size, tp_size;
1796
1797         if (device->quirks & IS_BIFROST) {
1798                 bifrost_vertex.prefix = *vertex_prefix;
1799                 memcpy(&bifrost_vertex.postfix, vertex_draw, MALI_DRAW_LENGTH);
1800                 vp = &bifrost_vertex;
1801                 vp_size = sizeof(bifrost_vertex);
1802
1803                 bifrost_tiler.prefix = *tiler_prefix;
1804                 bifrost_tiler.primitive_size = *primitive_size;
1805                 bifrost_tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1806                 memcpy(&bifrost_tiler.postfix, tiler_draw, MALI_DRAW_LENGTH);
1807                 tp = &bifrost_tiler;
1808                 tp_size = sizeof(bifrost_tiler);
1809         } else {
1810                 midgard_vertex.prefix = *vertex_prefix;
1811                 memcpy(&midgard_vertex.postfix, vertex_draw, MALI_DRAW_LENGTH);
1812                 vp = &midgard_vertex;
1813                 vp_size = sizeof(midgard_vertex);
1814
1815                 midgard_tiler.prefix = *tiler_prefix;
1816                 memcpy(&midgard_tiler.postfix, tiler_draw, MALI_DRAW_LENGTH);
1817                 midgard_tiler.primitive_size = *primitive_size;
1818                 tp = &midgard_tiler;
1819                 tp_size = sizeof(midgard_tiler);
1820         }
1821
1822         if (wallpapering) {
1823                 /* Inject in reverse order, with "predicted" job indices.
1824                  * THIS IS A HACK XXX */
1825                 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
1826                                  batch->scoreboard.job_index + 2, tp, tp_size, true);
1827                 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1828                                  vp, vp_size, true);
1829                 return;
1830         }
1831
1832         /* If rasterizer discard is enable, only submit the vertex */
1833
1834         unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1835                                            vp, vp_size, false);
1836
1837         if (ctx->rasterizer->base.rasterizer_discard)
1838                 return;
1839
1840         panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
1841                          false);
1842 }
1843
1844 /* TODO: stop hardcoding this */
1845 mali_ptr
1846 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1847 {
1848         uint16_t locations[] = {
1849             128, 128,
1850             0, 256,
1851             0, 256,
1852             0, 256,
1853             0, 256,
1854             0, 256,
1855             0, 256,
1856             0, 256,
1857             0, 256,
1858             0, 256,
1859             0, 256,
1860             0, 256,
1861             0, 256,
1862             0, 256,
1863             0, 256,
1864             0, 256,
1865             0, 256,
1866             0, 256,
1867             0, 256,
1868             0, 256,
1869             0, 256,
1870             0, 256,
1871             0, 256,
1872             0, 256,
1873             0, 256,
1874             0, 256,
1875             0, 256,
1876             0, 256,
1877             0, 256,
1878             0, 256,
1879             0, 256,
1880             0, 256,
1881             128, 128,
1882             0, 0,
1883             0, 0,
1884             0, 0,
1885             0, 0,
1886             0, 0,
1887             0, 0,
1888             0, 0,
1889             0, 0,
1890             0, 0,
1891             0, 0,
1892             0, 0,
1893             0, 0,
1894             0, 0,
1895             0, 0,
1896             0, 0,
1897         };
1898
1899         return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
1900 }