617efa46b7531ccdc0acd11e3b40fea75a146dc9
[mesa.git] / src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static mali_ptr
55 panfrost_vt_emit_shared_memory(struct panfrost_batch *batch)
56 {
57 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
58
59 struct mali_shared_memory shared = {
60 .shared_workgroup_count = ~0,
61 };
62
63 if (batch->stack_size) {
64 struct panfrost_bo *stack =
65 panfrost_batch_get_scratchpad(batch, batch->stack_size,
66 dev->thread_tls_alloc,
67 dev->core_count);
68
69 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
70 shared.scratchpad = stack->gpu;
71 }
72
73 return panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
74 }
75
76 void
77 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
78 struct mali_vertex_tiler_prefix *prefix,
79 union midgard_primitive_size *primitive_size)
80 {
81 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
82
83 if (!panfrost_writes_point_size(ctx)) {
84 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
85 rasterizer->base.point_size :
86 rasterizer->base.line_width;
87
88 primitive_size->constant = val;
89 }
90 }
91
92 void
93 panfrost_vt_init(struct panfrost_context *ctx,
94 enum pipe_shader_type stage,
95 struct mali_vertex_tiler_prefix *prefix,
96 struct mali_vertex_tiler_postfix *postfix)
97 {
98 struct panfrost_device *device = pan_device(ctx->base.screen);
99 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
100
101 if (!ctx->shader[stage])
102 return;
103
104 memset(prefix, 0, sizeof(*prefix));
105 memset(postfix, 0, sizeof(*postfix));
106
107 if (device->quirks & IS_BIFROST) {
108 postfix->gl_enables = 0x2;
109 postfix->shared_memory = panfrost_vt_emit_shared_memory(batch);
110 } else {
111 postfix->gl_enables = 0x6;
112 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
113 }
114
115 if (stage == PIPE_SHADER_FRAGMENT) {
116 if (ctx->occlusion_query) {
117 postfix->gl_enables |= MALI_OCCLUSION_QUERY;
118 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
119 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
120 PAN_BO_ACCESS_SHARED |
121 PAN_BO_ACCESS_RW |
122 PAN_BO_ACCESS_FRAGMENT);
123 }
124
125 postfix->gl_enables |= 0x7;
126 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
127 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
128 rast->front_ccw);
129 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
130 (rast->cull_face & PIPE_FACE_FRONT));
131 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
132 (rast->cull_face & PIPE_FACE_BACK));
133 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
134 rast->flatshade_first);
135 }
136 }
137
138 static unsigned
139 panfrost_translate_index_size(unsigned size)
140 {
141 switch (size) {
142 case 1:
143 return MALI_DRAW_INDEXED_UINT8;
144
145 case 2:
146 return MALI_DRAW_INDEXED_UINT16;
147
148 case 4:
149 return MALI_DRAW_INDEXED_UINT32;
150
151 default:
152 unreachable("Invalid index size");
153 }
154 }
155
156 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
157  * good for the duration of the draw (transient), though it could last longer. Also get
158 * the bounds on the index buffer for the range accessed by the draw. We do
159 * these operations together because there are natural optimizations which
160 * require them to be together. */
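/* The min/max scan is O(count), so panfrost_minmax_cache_* memoizes results
 * per index buffer; repeated draws over the same (start, count) range can
 * then skip the u_vbuf_get_minmax_index() fallback below entirely. */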
161
162 static mali_ptr
163 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
164 const struct pipe_draw_info *info,
165 unsigned *min_index, unsigned *max_index)
166 {
167 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
168 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
169 off_t offset = info->start * info->index_size;
170 bool needs_indices = true;
171 mali_ptr out = 0;
172
173 if (info->max_index != ~0u) {
174 *min_index = info->min_index;
175 *max_index = info->max_index;
176 needs_indices = false;
177 }
178
179 if (!info->has_user_indices) {
180 /* Only resources can be directly mapped */
181 panfrost_batch_add_bo(batch, rsrc->bo,
182 PAN_BO_ACCESS_SHARED |
183 PAN_BO_ACCESS_READ |
184 PAN_BO_ACCESS_VERTEX_TILER);
185 out = rsrc->bo->gpu + offset;
186
187 /* Check the cache */
188 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
189 info->start,
190 info->count,
191 min_index,
192 max_index);
193 } else {
194 /* Otherwise, we need to upload to transient memory */
195 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
196 struct panfrost_transfer T =
197 panfrost_pool_alloc_aligned(&batch->pool,
198 info->count * info->index_size,
199 info->index_size);
200
201 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
202 out = T.gpu;
203 }
204
205 if (needs_indices) {
206 /* Fallback */
207 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
208
209 if (!info->has_user_indices)
210 panfrost_minmax_cache_add(rsrc->index_cache,
211 info->start, info->count,
212 *min_index, *max_index);
213 }
214
215 return out;
216 }
217
218 void
219 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
220 const struct pipe_draw_info *info,
221 enum mali_draw_mode draw_mode,
222 struct mali_vertex_tiler_postfix *vertex_postfix,
223 struct mali_vertex_tiler_prefix *tiler_prefix,
224 struct mali_vertex_tiler_postfix *tiler_postfix,
225 unsigned *vertex_count,
226 unsigned *padded_count)
227 {
228 tiler_prefix->draw_mode = draw_mode;
229
230 unsigned draw_flags = 0;
231
232 if (panfrost_writes_point_size(ctx))
233 draw_flags |= MALI_DRAW_VARYING_SIZE;
234
235 if (info->primitive_restart)
236 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
237
238         /* These don't make much sense */
239
240 draw_flags |= 0x3000;
241
242 if (info->index_size) {
243 unsigned min_index = 0, max_index = 0;
244
245 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
246 info,
247 &min_index,
248 &max_index);
249
250 /* Use the corresponding values */
251 *vertex_count = max_index - min_index + 1;
252 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
253 tiler_prefix->offset_bias_correction = -min_index;
254 tiler_prefix->index_count = MALI_POSITIVE(info->count);
255 draw_flags |= panfrost_translate_index_size(info->index_size);
256 } else {
257 tiler_prefix->indices = 0;
258 *vertex_count = ctx->vertex_count;
259 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
260 tiler_prefix->offset_bias_correction = 0;
261 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
262 }
263
264 tiler_prefix->unknown_draw = draw_flags;
265
266 /* Encode the padded vertex count */
267
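        /* The instancing state encodes the padded count as an odd factor
         * times a power of two:
         *
         *    padded_count = (2 * instance_odd + 1) << instance_shift
         *
         * e.g. padded_count = 12 = 3 << 2 yields instance_shift = 2 and
         * instance_odd = 1. */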
268 if (info->instance_count > 1) {
269 *padded_count = panfrost_padded_vertex_count(*vertex_count);
270
271 unsigned shift = __builtin_ctz(ctx->padded_count);
272 unsigned k = ctx->padded_count >> (shift + 1);
273
274 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
275 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
276 } else {
277 *padded_count = *vertex_count;
278
279 /* Reset instancing state */
280 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
281 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
282 }
283 }
284
285 static unsigned
286 translate_tex_wrap(enum pipe_tex_wrap w)
287 {
288 switch (w) {
289 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
290 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
291 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
292 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
293 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
294 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
295 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
296 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
297 default: unreachable("Invalid wrap");
298 }
299 }
300
301 /* The hardware compares in the wrong order, so we have to flip before
302 * encoding. Yes, really. */
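/* Concretely, the flip should swap the ordering direction (LESS <-> GREATER,
 * LEQUAL <-> GEQUAL) while leaving direction-agnostic functions (NEVER,
 * EQUAL, NOTEQUAL, ALWAYS) untouched. */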
303
304 static enum mali_func
305 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
306 {
307 if (!cso->compare_mode)
308 return MALI_FUNC_NEVER;
309
310 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
311 return panfrost_flip_compare_func(f);
312 }
313
314 static enum mali_mipmap_mode
315 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
316 {
317 switch (f) {
318 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
319 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
320 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
321 default: unreachable("Invalid");
322 }
323 }
324
325 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
326 struct mali_midgard_sampler_packed *hw)
327 {
328 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
329 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
330 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
331 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
332 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
333 cfg.normalized_coordinates = cso->normalized_coords;
334
335 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
336
337 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
338
339         /* If necessary, we disable mipmapping in the sampler descriptor by
340          * clamping the LOD range as tight as possible (to a width of epsilon,
341          * essentially -- remember these are fixed-point numbers, so
342          * epsilon = 1/256) */
343
344 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
345 cfg.minimum_lod + 1 :
346 FIXED_16(cso->max_lod, false);
347
348 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
349 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
350 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
351
352 cfg.compare_function = panfrost_sampler_compare_func(cso);
353 cfg.seamless_cube_map = cso->seamless_cube_map;
354
355 cfg.border_color_r = cso->border_color.f[0];
356 cfg.border_color_g = cso->border_color.f[1];
357 cfg.border_color_b = cso->border_color.f[2];
358 cfg.border_color_a = cso->border_color.f[3];
359 }
360 }
361
362 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
363 struct mali_bifrost_sampler_packed *hw)
364 {
365 pan_pack(hw, BIFROST_SAMPLER, cfg) {
366 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
367 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
368 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
369 cfg.normalized_coordinates = cso->normalized_coords;
370
371 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
372 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
373 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
374
375 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
376 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
377 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
378
379 cfg.compare_function = panfrost_sampler_compare_func(cso);
380 cfg.seamless_cube_map = cso->seamless_cube_map;
381 }
382 }
383
384 static bool
385 panfrost_fs_required(
386 struct panfrost_shader_state *fs,
387 struct panfrost_blend_final *blend,
388 unsigned rt_count)
389 {
390 /* If we generally have side effects */
391 if (fs->fs_sidefx)
392 return true;
393
394 /* If colour is written we need to execute */
395 for (unsigned i = 0; i < rt_count; ++i) {
396 if (!blend[i].no_colour)
397 return true;
398 }
399
400 /* If depth is written and not implied we need to execute.
401 * TODO: Predicate on Z/S writes being enabled */
402 return (fs->writes_depth || fs->writes_stencil);
403 }
404
405 static void
406 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
407 struct panfrost_blend_final *blend)
408 {
409 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
410 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
411 unsigned rt_count = batch->key.nr_cbufs;
412
413 struct bifrost_blend_rt *brts = rts;
414
415 /* Disable blending for depth-only */
416
417 if (rt_count == 0) {
418 if (dev->quirks & IS_BIFROST) {
419 memset(brts, 0, sizeof(*brts));
420 brts[0].unk2 = 0x3;
421 } else {
422 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
423 cfg.equation = 0xf0122122; /* Replace */
424 }
425 }
426 }
427
428 for (unsigned i = 0; i < rt_count; ++i) {
429 struct mali_blend_flags_packed flags = {};
430
431 pan_pack(&flags, BLEND_FLAGS, cfg) {
432 if (blend[i].no_colour) {
433 cfg.enable = false;
434 break;
435 }
436
437 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
438
439 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
440 cfg.load_destination = blend[i].load_dest;
441 cfg.dither_disable = !batch->ctx->blend->base.dither;
442
443 if (!(dev->quirks & IS_BIFROST))
444 cfg.midgard_blend_shader = blend[i].is_shader;
445 }
446
447 if (dev->quirks & IS_BIFROST) {
448 memset(brts + i, 0, sizeof(brts[i]));
449 brts[i].flags = flags.opaque[0];
450
451 if (blend[i].is_shader) {
452 /* The blend shader's address needs to be at
453                                  * the same top 32 bits as the fragment shader's.
454 * TODO: Ensure that's always the case.
455 */
456 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
457 (fs->bo->gpu & (0xffffffffull << 32)));
458 brts[i].shader = blend[i].shader.gpu;
459 brts[i].unk2 = 0x0;
460 } else {
461 enum pipe_format format = batch->key.cbufs[i]->format;
462 const struct util_format_description *format_desc;
463 format_desc = util_format_description(format);
464
465 brts[i].equation = blend[i].equation.equation;
466
467 /* TODO: this is a bit more complicated */
468 brts[i].constant = blend[i].equation.constant;
469
470 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
471
472 /* 0x19 disables blending and forces REPLACE
473 * mode (equivalent to rgb_mode = alpha_mode =
474 * x122, colour mask = 0xF). 0x1a allows
475 * blending. */
476 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
477
478 brts[i].shader_type = fs->blend_types[i];
479 }
480 } else {
481 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
482 cfg.flags = flags;
483
484 if (blend[i].is_shader) {
485 cfg.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
486 } else {
487 cfg.equation = blend[i].equation.equation.opaque[0];
488 cfg.constant = blend[i].equation.constant;
489 }
490 }
491
492 rts += MALI_MIDGARD_BLEND_LENGTH;
493 }
494 }
495 }
496
497 static void
498 panfrost_emit_frag_shader(struct panfrost_context *ctx,
499 struct mali_state_packed *fragmeta,
500 struct panfrost_blend_final *blend)
501 {
502 const struct panfrost_device *dev = pan_device(ctx->base.screen);
503 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
504 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
505 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
506 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
507 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
508
509 /* Built up here */
510 struct mali_shader_packed shader = fs->shader;
511 struct mali_preload_packed preload = fs->preload;
512 uint32_t properties;
513 struct mali_multisample_misc_packed multisample_misc;
514 struct mali_stencil_mask_misc_packed stencil_mask_misc;
515 union midgard_blend sfbd_blend = { 0 };
516
517 if (!panfrost_fs_required(fs, blend, rt_count)) {
518 if (dev->quirks & IS_BIFROST) {
519 pan_pack(&shader, SHADER, cfg) {}
520
521 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
522 cfg.unknown = 0x950020; /* XXX */
523 cfg.early_z_enable = true;
524 }
525
526 preload.opaque[0] = 0;
527 } else {
528 pan_pack(&shader, SHADER, cfg) {
529 cfg.shader = 0x1;
530 }
531
532 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
533 cfg.work_register_count = 1;
534 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
535 cfg.early_z_enable = true;
536 }
537 }
538 } else if (dev->quirks & IS_BIFROST) {
539 bool no_blend = true;
540
541 for (unsigned i = 0; i < rt_count; ++i)
542 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
543
544 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
545 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
546 }
547
548 /* Combine with prepacked properties */
549 properties |= fs->properties.opaque[0];
550 } else {
551 /* Reasons to disable early-Z from a shader perspective */
552 bool late_z = fs->can_discard || fs->writes_global ||
553 fs->writes_depth || fs->writes_stencil;
554
555 /* If either depth or stencil is enabled, discard matters */
556 bool zs_enabled =
557 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
558 zsa->base.stencil[0].enabled;
559
560 bool has_blend_shader = false;
561
562 for (unsigned c = 0; c < rt_count; ++c)
563 has_blend_shader |= blend[c].is_shader;
564
565 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
566 /* TODO: Reduce this limit? */
567 if (has_blend_shader)
568 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
569 else
570 cfg.work_register_count = fs->work_reg_count;
571
572 cfg.early_z_enable = !(late_z || alpha_to_coverage);
573 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
574 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
575 }
576
577 properties |= fs->properties.opaque[0];
578 }
579
580 pan_pack(&multisample_misc, MULTISAMPLE_MISC, cfg) {
581 bool msaa = rast->multisample;
582 cfg.multisample_enable = msaa;
583 cfg.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF;
584
585 /* EXT_shader_framebuffer_fetch requires per-sample */
586 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
587 cfg.evaluate_per_sample = msaa && per_sample;
588
589 if (dev->quirks & MIDGARD_SFBD) {
590 cfg.sfbd_load_destination = blend[0].load_dest;
591 cfg.sfbd_blend_shader = blend[0].is_shader;
592 }
593
594 cfg.depth_function = zsa->base.depth.enabled ?
595 panfrost_translate_compare_func(zsa->base.depth.func) :
596 MALI_FUNC_ALWAYS;
597
598 cfg.depth_write_mask = zsa->base.depth.writemask;
599 cfg.near_discard = rast->depth_clip_near;
600 cfg.far_discard = rast->depth_clip_far;
601 cfg.unknown_2 = true;
602 }
603
604 pan_pack(&stencil_mask_misc, STENCIL_MASK_MISC, cfg) {
605 cfg.stencil_mask_front = zsa->stencil_mask_front;
606 cfg.stencil_mask_back = zsa->stencil_mask_back;
607 cfg.stencil_enable = zsa->base.stencil[0].enabled;
608 cfg.alpha_to_coverage = alpha_to_coverage;
609
610 if (dev->quirks & MIDGARD_SFBD) {
611 cfg.sfbd_write_enable = !blend[0].no_colour;
612 cfg.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
613 cfg.sfbd_dither_disable = !ctx->blend->base.dither;
614 }
615
616 cfg.unknown_1 = 0x7;
617 cfg.depth_range_1 = cfg.depth_range_2 = rast->offset_tri;
618 cfg.single_sampled_lines = !rast->multisample;
619 }
620
621 if (dev->quirks & MIDGARD_SFBD) {
622 if (blend[0].is_shader) {
623 sfbd_blend.shader = blend[0].shader.gpu |
624 blend[0].shader.first_tag;
625 } else {
626 sfbd_blend.equation = blend[0].equation.equation;
627 sfbd_blend.constant = blend[0].equation.constant;
628 }
629 } else if (!(dev->quirks & IS_BIFROST)) {
630 /* Bug where MRT-capable hw apparently reads the last blend
631 * shader from here instead of the usual location? */
632
633 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
634 if (!blend[rt].is_shader)
635 continue;
636
637 sfbd_blend.shader = blend[rt].shader.gpu |
638 blend[rt].shader.first_tag;
639 break;
640 }
641 }
642
643 pan_pack(fragmeta, STATE_OPAQUE, cfg) {
644 cfg.shader = fs->shader;
645 cfg.properties = properties;
646 cfg.depth_units = rast->offset_units * 2.0f;
647 cfg.depth_factor = rast->offset_scale;
648 cfg.multisample_misc = multisample_misc;
649 cfg.stencil_mask_misc = stencil_mask_misc;
650
651 cfg.stencil_front = zsa->stencil_front;
652 cfg.stencil_back = zsa->stencil_back;
653
654 /* Bottom bits for stencil ref, exactly one word */
655 bool back_enab = zsa->base.stencil[1].enabled;
656 cfg.stencil_front.opaque[0] |= ctx->stencil_ref.ref_value[0];
657 cfg.stencil_back.opaque[0] |= ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
658
659 if (dev->quirks & IS_BIFROST)
660 cfg.preload = preload;
661 else
662 memcpy(&cfg.sfbd_blend, &sfbd_blend, sizeof(sfbd_blend));
663 }
664 }
665
666 mali_ptr
667 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
668 {
669 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
670
671 panfrost_batch_add_bo(batch, ss->bo,
672 PAN_BO_ACCESS_PRIVATE |
673 PAN_BO_ACCESS_READ |
674 PAN_BO_ACCESS_VERTEX_TILER);
675
676 panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
677 PAN_BO_ACCESS_PRIVATE |
678 PAN_BO_ACCESS_READ |
679 PAN_BO_ACCESS_VERTEX_TILER);
680
681 return pan_resource(ss->upload.rsrc)->bo->gpu + ss->upload.offset;
682 }
683
684 mali_ptr
685 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
686 {
687 struct panfrost_context *ctx = batch->ctx;
688 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
689
690 /* Add the shader BO to the batch. */
691 panfrost_batch_add_bo(batch, ss->bo,
692 PAN_BO_ACCESS_PRIVATE |
693 PAN_BO_ACCESS_READ |
694 PAN_BO_ACCESS_FRAGMENT);
695
696 struct panfrost_device *dev = pan_device(ctx->base.screen);
697 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
698 struct panfrost_transfer xfer;
699 unsigned rt_size;
700
701 if (dev->quirks & MIDGARD_SFBD)
702 rt_size = 0;
703 else if (dev->quirks & IS_BIFROST)
704 rt_size = sizeof(struct bifrost_blend_rt);
705 else
706 rt_size = sizeof(struct midgard_blend_rt);
707
708 unsigned desc_size = MALI_STATE_LENGTH + rt_size * rt_count;
709 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, MALI_STATE_LENGTH);
710
711 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
712
713 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
714 blend[c] = panfrost_get_blend_for_context(ctx, c);
715
716 panfrost_emit_frag_shader(ctx, (struct mali_state_packed *) xfer.cpu, blend);
717
718 if (!(dev->quirks & MIDGARD_SFBD))
719 panfrost_emit_blend(batch, xfer.cpu + MALI_STATE_LENGTH, blend);
720 else
721 batch->draws |= PIPE_CLEAR_COLOR0;
722
723 return xfer.gpu;
724 }
725
726 mali_ptr
727 panfrost_emit_viewport(struct panfrost_batch *batch)
728 {
729 struct panfrost_context *ctx = batch->ctx;
730 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
731 const struct pipe_scissor_state *ss = &ctx->scissor;
732 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
733 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
734
735 /* Derive min/max from translate/scale. Note since |x| >= 0 by
736 * definition, we have that -|x| <= |x| hence translate - |scale| <=
737 * translate + |scale|, so the ordering is correct here. */
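        /* e.g. a vertically flipped viewport with translate[1] = 300 and
         * scale[1] = -300 still yields vp_miny = 0 and vp_maxy = 600. */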
738 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
739 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
740 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
741 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
742 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
743 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
744
745         /* Scissor to the intersection of the viewport and the scissor, clamped
746 * to the framebuffer */
747
748 unsigned minx = MIN2(fb->width, vp_minx);
749 unsigned maxx = MIN2(fb->width, vp_maxx);
750 unsigned miny = MIN2(fb->height, vp_miny);
751 unsigned maxy = MIN2(fb->height, vp_maxy);
752
753 if (ss && rast->scissor) {
754 minx = MAX2(ss->minx, minx);
755 miny = MAX2(ss->miny, miny);
756 maxx = MIN2(ss->maxx, maxx);
757 maxy = MIN2(ss->maxy, maxy);
758 }
759
760 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
761
762 pan_pack(T.cpu, VIEWPORT, cfg) {
763 cfg.scissor_minimum_x = minx;
764 cfg.scissor_minimum_y = miny;
765 cfg.scissor_maximum_x = maxx - 1;
766 cfg.scissor_maximum_y = maxy - 1;
767
768 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
769 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
770 }
771
772 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
773 return T.gpu;
774 }
775
776 static mali_ptr
777 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
778 enum pipe_shader_type st,
779 struct panfrost_constant_buffer *buf,
780 unsigned index)
781 {
782 struct pipe_constant_buffer *cb = &buf->cb[index];
783 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
784
785 if (rsrc) {
786 panfrost_batch_add_bo(batch, rsrc->bo,
787 PAN_BO_ACCESS_SHARED |
788 PAN_BO_ACCESS_READ |
789 panfrost_bo_access_for_stage(st));
790
791                 /* Alignment guaranteed by
792 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
793 return rsrc->bo->gpu + cb->buffer_offset;
794 } else if (cb->user_buffer) {
795 return panfrost_pool_upload_aligned(&batch->pool,
796 cb->user_buffer +
797 cb->buffer_offset,
798 cb->buffer_size, 16);
799 } else {
800 unreachable("No constant buffer");
801 }
802 }
803
804 struct sysval_uniform {
805 union {
806 float f[4];
807 int32_t i[4];
808 uint32_t u[4];
809 uint64_t du[2];
810 };
811 };
812
813 static void
814 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
815 struct sysval_uniform *uniform)
816 {
817 struct panfrost_context *ctx = batch->ctx;
818 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
819
820 uniform->f[0] = vp->scale[0];
821 uniform->f[1] = vp->scale[1];
822 uniform->f[2] = vp->scale[2];
823 }
824
825 static void
826 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
827 struct sysval_uniform *uniform)
828 {
829 struct panfrost_context *ctx = batch->ctx;
830 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
831
832 uniform->f[0] = vp->translate[0];
833 uniform->f[1] = vp->translate[1];
834 uniform->f[2] = vp->translate[2];
835 }
836
837 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
838 enum pipe_shader_type st,
839 unsigned int sysvalid,
840 struct sysval_uniform *uniform)
841 {
842 struct panfrost_context *ctx = batch->ctx;
843 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
844 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
845 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
846 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
847
848 assert(dim);
849 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
850
851 if (dim > 1)
852 uniform->i[1] = u_minify(tex->texture->height0,
853 tex->u.tex.first_level);
854
855 if (dim > 2)
856 uniform->i[2] = u_minify(tex->texture->depth0,
857 tex->u.tex.first_level);
858
859 if (is_array)
860 uniform->i[dim] = tex->texture->array_size;
861 }
862
863 static void
864 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
865 enum pipe_shader_type st,
866 unsigned ssbo_id,
867 struct sysval_uniform *uniform)
868 {
869 struct panfrost_context *ctx = batch->ctx;
870
871 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
872 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
873
874 /* Compute address */
875 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
876
877 panfrost_batch_add_bo(batch, bo,
878 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
879 panfrost_bo_access_for_stage(st));
880
881 /* Upload address and size as sysval */
882 uniform->du[0] = bo->gpu + sb.buffer_offset;
883 uniform->u[2] = sb.buffer_size;
884 }
885
886 static void
887 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
888 enum pipe_shader_type st,
889 unsigned samp_idx,
890 struct sysval_uniform *uniform)
891 {
892 struct panfrost_context *ctx = batch->ctx;
893 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
894
895 uniform->f[0] = sampl->min_lod;
896 uniform->f[1] = sampl->max_lod;
897 uniform->f[2] = sampl->lod_bias;
898
899 /* Even without any errata, Midgard represents "no mipmapping" as
900 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
901 * panfrost_create_sampler_state which also explains our choice of
902 * epsilon value (again to keep behaviour consistent) */
903
904 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
905 uniform->f[1] = uniform->f[0] + (1.0/256.0);
906 }
907
908 static void
909 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
910 struct sysval_uniform *uniform)
911 {
912 struct panfrost_context *ctx = batch->ctx;
913
914 uniform->u[0] = ctx->compute_grid->grid[0];
915 uniform->u[1] = ctx->compute_grid->grid[1];
916 uniform->u[2] = ctx->compute_grid->grid[2];
917 }
918
919 static void
920 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
921 struct panfrost_shader_state *ss,
922 enum pipe_shader_type st)
923 {
924 struct sysval_uniform *uniforms = (void *)buf;
925
926 for (unsigned i = 0; i < ss->sysval_count; ++i) {
927 int sysval = ss->sysval[i];
928
929 switch (PAN_SYSVAL_TYPE(sysval)) {
930 case PAN_SYSVAL_VIEWPORT_SCALE:
931 panfrost_upload_viewport_scale_sysval(batch,
932 &uniforms[i]);
933 break;
934 case PAN_SYSVAL_VIEWPORT_OFFSET:
935 panfrost_upload_viewport_offset_sysval(batch,
936 &uniforms[i]);
937 break;
938 case PAN_SYSVAL_TEXTURE_SIZE:
939 panfrost_upload_txs_sysval(batch, st,
940 PAN_SYSVAL_ID(sysval),
941 &uniforms[i]);
942 break;
943 case PAN_SYSVAL_SSBO:
944 panfrost_upload_ssbo_sysval(batch, st,
945 PAN_SYSVAL_ID(sysval),
946 &uniforms[i]);
947 break;
948 case PAN_SYSVAL_NUM_WORK_GROUPS:
949 panfrost_upload_num_work_groups_sysval(batch,
950 &uniforms[i]);
951 break;
952 case PAN_SYSVAL_SAMPLER:
953 panfrost_upload_sampler_sysval(batch, st,
954 PAN_SYSVAL_ID(sysval),
955 &uniforms[i]);
956 break;
957 default:
958 assert(0);
959 }
960 }
961 }
962
963 static const void *
964 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
965 unsigned index)
966 {
967 struct pipe_constant_buffer *cb = &buf->cb[index];
968 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
969
970 if (rsrc)
971 return rsrc->bo->cpu;
972 else if (cb->user_buffer)
973 return cb->user_buffer;
974 else
975 unreachable("No constant buffer");
976 }
977
978 void
979 panfrost_emit_const_buf(struct panfrost_batch *batch,
980 enum pipe_shader_type stage,
981 struct mali_vertex_tiler_postfix *postfix)
982 {
983 struct panfrost_context *ctx = batch->ctx;
984 struct panfrost_shader_variants *all = ctx->shader[stage];
985
986 if (!all)
987 return;
988
989 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
990
991 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
992
993 /* Uniforms are implicitly UBO #0 */
994 bool has_uniforms = buf->enabled_mask & (1 << 0);
995
996 /* Allocate room for the sysval and the uniforms */
997 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
998 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
999 size_t size = sys_size + uniform_size;
1000 struct panfrost_transfer transfer =
1001 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1002
1003 /* Upload sysvals requested by the shader */
1004 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1005
1006 /* Upload uniforms */
1007 if (has_uniforms && uniform_size) {
1008 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1009 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1010 }
1011
1012 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1013 * uploaded, so it's always included. The count is the highest UBO
1014 * addressable -- gaps are included. */
1015
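        /* e.g. enabled_mask = 0x9 (UBOs 0 and 3 bound) gives ubo_count = 4;
         * the unbound slots 1 and 2 are emitted as null descriptors in the
         * loop below. */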
1016 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
1017
1018 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1019 struct panfrost_transfer ubos =
1020 panfrost_pool_alloc_aligned(&batch->pool, sz,
1021 MALI_UNIFORM_BUFFER_LENGTH);
1022
1023 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1024
1025 /* Upload uniforms as a UBO */
1026
1027 if (size) {
1028 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1029 cfg.entries = DIV_ROUND_UP(size, 16);
1030 cfg.pointer = transfer.gpu;
1031 }
1032 } else {
1033 *ubo_ptr = 0;
1034 }
1035
1036 /* The rest are honest-to-goodness UBOs */
1037
1038 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1039 size_t usz = buf->cb[ubo].buffer_size;
1040 bool enabled = buf->enabled_mask & (1 << ubo);
1041 bool empty = usz == 0;
1042
1043 if (!enabled || empty) {
1044 ubo_ptr[ubo] = 0;
1045 continue;
1046 }
1047
1048 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1049 cfg.entries = DIV_ROUND_UP(usz, 16);
1050 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1051 stage, buf, ubo);
1052 }
1053 }
1054
1055 postfix->uniforms = transfer.gpu;
1056 postfix->uniform_buffers = ubos.gpu;
1057
1058 buf->dirty_mask = 0;
1059 }
1060
1061 mali_ptr
1062 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1063 const struct pipe_grid_info *info)
1064 {
1065 struct panfrost_context *ctx = batch->ctx;
1066 struct panfrost_device *dev = pan_device(ctx->base.screen);
1067 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1068 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1069 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1070 128));
1071
1072 unsigned log2_instances =
1073 util_logbase2_ceil(info->grid[0]) +
1074 util_logbase2_ceil(info->grid[1]) +
1075 util_logbase2_ceil(info->grid[2]);
1076
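        /* Each grid dimension is rounded up to a power of two, so e.g. a
         * 3x4x5 grid gives log2_instances = 2 + 2 + 3 = 7, i.e. room for
         * 128 workgroup instances per core. */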
1077 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1078 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1079 shared_size,
1080 1);
1081
1082 struct mali_shared_memory shared = {
1083 .shared_memory = bo->gpu,
1084 .shared_workgroup_count = log2_instances,
1085 .shared_shift = util_logbase2(single_size) + 1
1086 };
1087
1088 return panfrost_pool_upload_aligned(&batch->pool, &shared,
1089 sizeof(shared), 64);
1090 }
1091
1092 static mali_ptr
1093 panfrost_get_tex_desc(struct panfrost_batch *batch,
1094 enum pipe_shader_type st,
1095 struct panfrost_sampler_view *view)
1096 {
1097 if (!view)
1098 return (mali_ptr) 0;
1099
1100 struct pipe_sampler_view *pview = &view->base;
1101 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1102
1103 /* Add the BO to the job so it's retained until the job is done. */
1104
1105 panfrost_batch_add_bo(batch, rsrc->bo,
1106 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1107 panfrost_bo_access_for_stage(st));
1108
1109 panfrost_batch_add_bo(batch, view->bo,
1110 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1111 panfrost_bo_access_for_stage(st));
1112
1113 return view->bo->gpu;
1114 }
1115
1116 static void
1117 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1118 struct pipe_context *pctx)
1119 {
1120 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1121 if (view->texture_bo != rsrc->bo->gpu ||
1122 view->modifier != rsrc->modifier) {
1123 panfrost_bo_unreference(view->bo);
1124 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1125 }
1126 }
1127
1128 mali_ptr
1129 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1130 enum pipe_shader_type stage)
1131 {
1132 struct panfrost_context *ctx = batch->ctx;
1133 struct panfrost_device *device = pan_device(ctx->base.screen);
1134
1135 if (!ctx->sampler_view_count[stage])
1136 return 0;
1137
1138 if (device->quirks & IS_BIFROST) {
1139 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1140 MALI_BIFROST_TEXTURE_LENGTH *
1141 ctx->sampler_view_count[stage],
1142 MALI_BIFROST_TEXTURE_LENGTH);
1143
1144 struct mali_bifrost_texture_packed *out =
1145 (struct mali_bifrost_texture_packed *) T.cpu;
1146
1147 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1148 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1149 struct pipe_sampler_view *pview = &view->base;
1150 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1151
1152 panfrost_update_sampler_view(view, &ctx->base);
1153 out[i] = view->bifrost_descriptor;
1154
1155 /* Add the BOs to the job so they are retained until the job is done. */
1156
1157 panfrost_batch_add_bo(batch, rsrc->bo,
1158 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1159 panfrost_bo_access_for_stage(stage));
1160
1161 panfrost_batch_add_bo(batch, view->bo,
1162 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1163 panfrost_bo_access_for_stage(stage));
1164 }
1165
1166 return T.gpu;
1167 } else {
1168 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1169
1170 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1171 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1172
1173 panfrost_update_sampler_view(view, &ctx->base);
1174
1175 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1176 }
1177
1178 return panfrost_pool_upload_aligned(&batch->pool, trampolines,
1179 sizeof(uint64_t) *
1180 ctx->sampler_view_count[stage],
1181 sizeof(uint64_t));
1182 }
1183 }
1184
1185 mali_ptr
1186 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1187 enum pipe_shader_type stage)
1188 {
1189 struct panfrost_context *ctx = batch->ctx;
1190
1191 if (!ctx->sampler_count[stage])
1192 return 0;
1193
1194 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1195 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1196
1197 size_t sz = desc_size * ctx->sampler_count[stage];
1198 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1199 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1200
1201 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1202 out[i] = ctx->samplers[stage][i]->hw;
1203
1204 return T.gpu;
1205 }
1206
1207 void
1208 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1209 struct mali_vertex_tiler_postfix *vertex_postfix)
1210 {
1211 struct panfrost_context *ctx = batch->ctx;
1212 struct panfrost_vertex_state *so = ctx->vertex;
1213 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1214
1215 unsigned instance_shift = vertex_postfix->instance_shift;
1216 unsigned instance_odd = vertex_postfix->instance_odd;
1217
1218 /* Worst case: everything is NPOT, which is only possible if instancing
1219          * is enabled. Otherwise a single record is guaranteed */
1220 bool could_npot = instance_shift || instance_odd;
1221
1222 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1223 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1224 (could_npot ? 2 : 1),
1225 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1226
1227 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1228 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1229 MALI_ATTRIBUTE_LENGTH);
1230
1231 struct mali_attribute_buffer_packed *bufs =
1232 (struct mali_attribute_buffer_packed *) S.cpu;
1233
1234 struct mali_attribute_packed *out =
1235 (struct mali_attribute_packed *) T.cpu;
1236
1237 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1238 unsigned k = 0;
1239
1240 for (unsigned i = 0; i < so->num_elements; ++i) {
1241 /* We map buffers 1:1 with the attributes, which
1242 * means duplicating some vertex buffers (who cares? aside from
1243 * maybe some caching implications but I somehow doubt that
1244 * matters) */
1245
1246 struct pipe_vertex_element *elem = &so->pipe[i];
1247 unsigned vbi = elem->vertex_buffer_index;
1248 attrib_to_buffer[i] = k;
1249
1250 if (!(ctx->vb_mask & (1 << vbi)))
1251 continue;
1252
1253 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1254 struct panfrost_resource *rsrc;
1255
1256 rsrc = pan_resource(buf->buffer.resource);
1257 if (!rsrc)
1258 continue;
1259
1260 /* Add a dependency of the batch on the vertex buffer */
1261 panfrost_batch_add_bo(batch, rsrc->bo,
1262 PAN_BO_ACCESS_SHARED |
1263 PAN_BO_ACCESS_READ |
1264 PAN_BO_ACCESS_VERTEX_TILER);
1265
1266 /* Mask off lower bits, see offset fixup below */
1267 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1268 mali_ptr addr = raw_addr & ~63;
1269
1270 /* Since we advanced the base pointer, we shrink the buffer
1271 * size, but add the offset we subtracted */
1272 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1273 - buf->buffer_offset;
1274
1275 /* When there is a divisor, the hardware-level divisor is
1276 * the product of the instance divisor and the padded count */
1277 unsigned divisor = elem->instance_divisor;
1278 unsigned hw_divisor = ctx->padded_count * divisor;
1279 unsigned stride = buf->stride;
1280
1281 /* If there's a divisor(=1) but no instancing, we want every
1282 * attribute to be the same */
1283
1284 if (divisor && ctx->instance_count == 1)
1285 stride = 0;
1286
1287 if (!divisor || ctx->instance_count <= 1) {
1288 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1289 if (ctx->instance_count > 1)
1290 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1291
1292 cfg.pointer = addr;
1293 cfg.stride = stride;
1294 cfg.size = size;
1295 cfg.divisor_r = instance_shift;
1296 cfg.divisor_p = instance_odd;
1297 }
1298 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1299 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1300 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1301 cfg.pointer = addr;
1302 cfg.stride = stride;
1303 cfg.size = size;
1304 cfg.divisor_r = __builtin_ctz(hw_divisor);
1305 }
1306
1307 } else {
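                        /* Non-power-of-two divisor: rather than a true divide,
                         * the hardware applies what is effectively a
                         * multiply-and-shift ("magic number") reciprocal.
                         * panfrost_compute_magic_divisor derives the
                         * multiplier, shift and extra flag; the NPOT
                         * continuation record below carries the rest. */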
1308 unsigned shift = 0, extra_flags = 0;
1309
1310 unsigned magic_divisor =
1311 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1312
1313 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1314 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1315 cfg.pointer = addr;
1316 cfg.stride = stride;
1317 cfg.size = size;
1318
1319 cfg.divisor_r = shift;
1320 cfg.divisor_e = extra_flags;
1321 }
1322
1323 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1324 cfg.divisor_numerator = magic_divisor;
1325 cfg.divisor = divisor;
1326 }
1327
1328 ++k;
1329 }
1330
1331 ++k;
1332 }
1333
1334 /* Add special gl_VertexID/gl_InstanceID buffers */
1335
1336 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1337 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1338
1339 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1340 cfg.buffer_index = k++;
1341 cfg.format = so->formats[PAN_VERTEX_ID];
1342 }
1343
1344 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1345
1346 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1347 cfg.buffer_index = k++;
1348 cfg.format = so->formats[PAN_INSTANCE_ID];
1349 }
1350 }
1351
1352 /* Attribute addresses require 64-byte alignment, so let:
1353 *
1354 * base' = base & ~63 = base - (base & 63)
1355 * offset' = offset + (base & 63)
1356 *
1357 * Since base' + offset' = base + offset, these are equivalent
1358 * addressing modes and now base is 64 aligned.
1359 */
1360
1361 unsigned start = vertex_postfix->offset_start;
1362
1363 for (unsigned i = 0; i < so->num_elements; ++i) {
1364 unsigned vbi = so->pipe[i].vertex_buffer_index;
1365 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1366
1367 /* Adjust by the masked off bits of the offset. Make sure we
1368 * read src_offset from so->hw (which is not GPU visible)
1369 * rather than target (which is) due to caching effects */
1370
1371 unsigned src_offset = so->pipe[i].src_offset;
1372
1373 /* BOs aligned to 4k so guaranteed aligned to 64 */
1374 src_offset += (buf->buffer_offset & 63);
1375
1376                 /* Also, somewhat obscurely, per-instance data needs to be
1377 * offset in response to a delayed start in an indexed draw */
1378
1379 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1380 src_offset -= buf->stride * start;
1381
1382 pan_pack(out + i, ATTRIBUTE, cfg) {
1383 cfg.buffer_index = attrib_to_buffer[i];
1384 cfg.format = so->formats[i];
1385 cfg.offset = src_offset;
1386 }
1387 }
1388
1389 vertex_postfix->attributes = S.gpu;
1390 vertex_postfix->attribute_meta = T.gpu;
1391 }
1392
1393 static mali_ptr
1394 panfrost_emit_varyings(struct panfrost_batch *batch,
1395 struct mali_attribute_buffer_packed *slot,
1396 unsigned stride, unsigned count)
1397 {
1398 unsigned size = stride * count;
1399 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1400
1401 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1402 cfg.stride = stride;
1403 cfg.size = size;
1404 cfg.pointer = ptr;
1405 }
1406
1407 return ptr;
1408 }
1409
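/* Sub-64-byte misalignment of a streamout record: stride is in dwords (as in
 * pipe_stream_output_info), hence the * 4, and offset is the number of
 * records already written. Buffer pointers must be 64-byte aligned, so this
 * remainder is instead folded into the varying's attribute offset. */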
1410 static unsigned
1411 panfrost_streamout_offset(unsigned stride, unsigned offset,
1412 struct pipe_stream_output_target *target)
1413 {
1414 return (target->buffer_offset + (offset * stride * 4)) & 63;
1415 }
1416
1417 static void
1418 panfrost_emit_streamout(struct panfrost_batch *batch,
1419 struct mali_attribute_buffer_packed *slot,
1420 unsigned stride_words, unsigned offset, unsigned count,
1421 struct pipe_stream_output_target *target)
1422 {
1423 unsigned stride = stride_words * 4;
1424 unsigned max_size = target->buffer_size;
1425 unsigned expected_size = stride * count;
1426
1427 /* Grab the BO and bind it to the batch */
1428 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1429
1430 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1431 * the perspective of the TILER and FRAGMENT.
1432 */
1433 panfrost_batch_add_bo(batch, bo,
1434 PAN_BO_ACCESS_SHARED |
1435 PAN_BO_ACCESS_RW |
1436 PAN_BO_ACCESS_VERTEX_TILER |
1437 PAN_BO_ACCESS_FRAGMENT);
1438
1439 /* We will have an offset applied to get alignment */
1440 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1441
1442 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1443 cfg.pointer = (addr & ~63);
1444 cfg.stride = stride;
1445 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1446 }
1447 }
1448
1449 static bool
1450 has_point_coord(unsigned mask, gl_varying_slot loc)
1451 {
1452 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1453 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1454 else if (loc == VARYING_SLOT_PNTC)
1455 return (mask & (1 << 8));
1456 else
1457 return false;
1458 }
1459
1460 /* Helpers for manipulating stream out information so we can pack varyings
1461 * accordingly. Compute the src_offset for a given captured varying */
1462
1463 static struct pipe_stream_output *
1464 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1465 {
1466 for (unsigned i = 0; i < info->num_outputs; ++i) {
1467 if (info->output[i].register_index == loc)
1468 return &info->output[i];
1469 }
1470
1471 unreachable("Varying not captured");
1472 }
1473
1474 static unsigned
1475 pan_varying_size(enum mali_format fmt)
1476 {
1477 unsigned type = MALI_EXTRACT_TYPE(fmt);
1478 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1479 unsigned bits = MALI_EXTRACT_BITS(fmt);
1480 unsigned bpc = 0;
1481
1482 if (bits == MALI_CHANNEL_FLOAT) {
1483 /* No doubles */
1484 bool fp16 = (type == MALI_FORMAT_SINT);
1485 assert(fp16 || (type == MALI_FORMAT_UNORM));
1486
1487 bpc = fp16 ? 2 : 4;
1488 } else {
1489 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1490
1491 /* See the enums */
1492 bits = 1 << bits;
1493 assert(bits >= 8);
1494 bpc = bits / 8;
1495 }
1496
1497 return bpc * chan;
1498 }
1499
1500 /* Indices for named (non-XFB) varyings that are present. These are packed
1501 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1502  * PAN_VARY_*). This has the nice property that you can look up the buffer index
1503 * of a given special field given a shift S by:
1504 *
1505 * idx = popcount(P & ((1 << S) - 1))
1506 *
1507  * That is, look at all of the special varyings that come earlier and count them;
1508  * the count is this one's buffer index. Likewise, the total number of special
1509  * buffers required is simply popcount(P).
1510 */
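/* For example, with present = GENERAL | POSITION | PNTCOORD = 0b1011, the
 * point coordinate buffer (S = 3) sits at index popcount(0b1011 & 0b0111) =
 * popcount(0b0011) = 2, and pan_xfb_base(present) = 3 buffers precede any
 * XFB targets. */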
1511
1512 enum pan_special_varying {
1513 PAN_VARY_GENERAL = 0,
1514 PAN_VARY_POSITION = 1,
1515 PAN_VARY_PSIZ = 2,
1516 PAN_VARY_PNTCOORD = 3,
1517 PAN_VARY_FACE = 4,
1518 PAN_VARY_FRAGCOORD = 5,
1519
1520 /* Keep last */
1521 PAN_VARY_MAX,
1522 };
1523
1524 /* Given a varying, figure out which index it corresponds to */
1525
1526 static inline unsigned
1527 pan_varying_index(unsigned present, enum pan_special_varying v)
1528 {
1529 unsigned mask = (1 << v) - 1;
1530 return util_bitcount(present & mask);
1531 }
1532
1533 /* Get the base offset for XFB buffers, which by convention come after
1534 * everything else. Wrapper function for semantic reasons; by construction this
1535 * is just popcount. */
1536
1537 static inline unsigned
1538 pan_xfb_base(unsigned present)
1539 {
1540 return util_bitcount(present);
1541 }
1542
1543 /* Computes the present mask for varyings so we can start emitting varying records */
1544
1545 static inline unsigned
1546 pan_varying_present(
1547 struct panfrost_shader_state *vs,
1548 struct panfrost_shader_state *fs,
1549 unsigned quirks)
1550 {
1551 /* At the moment we always emit general and position buffers. Not
1552 * strictly necessary but usually harmless */
1553
1554 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1555
1556 /* Enable special buffers by the shader info */
1557
1558 if (vs->writes_point_size)
1559 present |= (1 << PAN_VARY_PSIZ);
1560
1561 if (fs->reads_point_coord)
1562 present |= (1 << PAN_VARY_PNTCOORD);
1563
1564 if (fs->reads_face)
1565 present |= (1 << PAN_VARY_FACE);
1566
1567 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1568 present |= (1 << PAN_VARY_FRAGCOORD);
1569
1570 /* Also, if we have a point sprite, we need a point coord buffer */
1571
1572 for (unsigned i = 0; i < fs->varying_count; i++) {
1573 gl_varying_slot loc = fs->varyings_loc[i];
1574
1575 if (has_point_coord(fs->point_sprite_mask, loc))
1576 present |= (1 << PAN_VARY_PNTCOORD);
1577 }
1578
1579 return present;
1580 }
1581
1582 /* Emitters for varying records */
1583
1584 static void
1585 pan_emit_vary(struct mali_attribute_packed *out,
1586 unsigned present, enum pan_special_varying buf,
1587 unsigned quirks, enum mali_format format,
1588 unsigned offset)
1589 {
1590 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1591 unsigned swizzle = quirks & HAS_SWIZZLES ?
1592 panfrost_get_default_swizzle(nr_channels) :
1593 panfrost_bifrost_swizzle(nr_channels);
1594
1595 pan_pack(out, ATTRIBUTE, cfg) {
1596 cfg.buffer_index = pan_varying_index(present, buf);
1597 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1598 cfg.format = (format << 12) | swizzle;
1599 cfg.offset = offset;
1600 }
1601 }
1602
1603 /* General varying that is unused */
1604
1605 static void
1606 pan_emit_vary_only(struct mali_attribute_packed *out,
1607 unsigned present, unsigned quirks)
1608 {
1609 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1610 }
1611
1612 /* Special records */
1613
1614 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1615 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1616 [PAN_VARY_PSIZ] = MALI_R16F,
1617 [PAN_VARY_PNTCOORD] = MALI_R16F,
1618 [PAN_VARY_FACE] = MALI_R32I,
1619 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1620 };
1621
1622 static void
1623 pan_emit_vary_special(struct mali_attribute_packed *out,
1624 unsigned present, enum pan_special_varying buf,
1625 unsigned quirks)
1626 {
1627 assert(buf < PAN_VARY_MAX);
1628 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1629 }
1630
1631 static enum mali_format
1632 pan_xfb_format(enum mali_format format, unsigned nr)
1633 {
1634 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1635 return MALI_R32F | MALI_NR_CHANNELS(nr);
1636 else
1637 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1638 }
1639
1640 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1641 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1642 * value. */
1643
1644 static void
1645 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1646 unsigned present,
1647 unsigned max_xfb,
1648 unsigned *streamout_offsets,
1649 unsigned quirks,
1650 enum mali_format format,
1651 struct pipe_stream_output o)
1652 {
1653 unsigned swizzle = quirks & HAS_SWIZZLES ?
1654 panfrost_get_default_swizzle(o.num_components) :
1655 panfrost_bifrost_swizzle(o.num_components);
1656
1657 pan_pack(out, ATTRIBUTE, cfg) {
1658 /* XFB buffers come after everything else */
1659 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1660 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1661
1662 /* Override number of channels and precision to highp */
1663 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1664
1665 /* Apply given offsets together */
1666 cfg.offset = (o.dst_offset * 4) /* dwords */
1667 + streamout_offsets[o.output_buffer];
1668 }
1669 }
1670
1671 /* Determine if we should capture a varying for XFB. This requires actually
1672  * having a buffer for it. If we don't capture it, we'll fall back to a general
1673 * varying path (linked or unlinked, possibly discarding the write) */
1674
1675 static bool
1676 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1677 unsigned loc, unsigned max_xfb)
1678 {
1679 if (!(xfb->so_mask & (1ll << loc)))
1680 return false;
1681
1682 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1683 return o->output_buffer < max_xfb;
1684 }
1685
1686 static void
1687 pan_emit_general_varying(struct mali_attribute_packed *out,
1688 struct panfrost_shader_state *other,
1689 struct panfrost_shader_state *xfb,
1690 gl_varying_slot loc,
1691 enum mali_format format,
1692 unsigned present,
1693 unsigned quirks,
1694 unsigned *gen_offsets,
1695 enum mali_format *gen_formats,
1696 unsigned *gen_stride,
1697 unsigned idx,
1698 bool should_alloc)
1699 {
1700 /* Check if we're linked */
1701 signed other_idx = -1;
1702
1703 for (unsigned j = 0; j < other->varying_count; ++j) {
1704 if (other->varyings_loc[j] == loc) {
1705 other_idx = j;
1706 break;
1707 }
1708 }
1709
1710 if (other_idx < 0) {
1711 pan_emit_vary_only(out, present, quirks);
1712 return;
1713 }
1714
1715 unsigned offset = gen_offsets[other_idx];
1716
1717 if (should_alloc) {
1718 /* We're linked, so allocate a space via a watermark allocation */
1719 enum mali_format alt = other->varyings[other_idx];
1720
1721 /* Do interpolation at minimum precision */
1722 unsigned size_main = pan_varying_size(format);
1723 unsigned size_alt = pan_varying_size(alt);
1724 unsigned size = MIN2(size_main, size_alt);
1725
1726 /* If a varying is marked for XFB but not actually captured, we
1727 * should match the format to the format that would otherwise
1728 * be used for XFB, since dEQP checks for invariance here. It's
1729 * unclear if this is required by the spec. */
1730
1731 if (xfb->so_mask & (1ull << loc)) {
1732 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1733 format = pan_xfb_format(format, o->num_components);
1734 size = pan_varying_size(format);
1735 } else if (size == size_alt) {
1736 format = alt;
1737 }
1738
1739 gen_offsets[idx] = *gen_stride;
1740 gen_formats[other_idx] = format;
1741 offset = *gen_stride;
1742 *gen_stride += size;
1743 }
1744
1745 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1746 }
1747
1748 /* Higher-level wrapper around the helpers above, classifying a varying into
1749 * one of the cases they handle */
1750
1751 static void
1752 panfrost_emit_varying(
1753 struct mali_attribute_packed *out,
1754 struct panfrost_shader_state *stage,
1755 struct panfrost_shader_state *other,
1756 struct panfrost_shader_state *xfb,
1757 unsigned present,
1758 unsigned max_xfb,
1759 unsigned *streamout_offsets,
1760 unsigned quirks,
1761 unsigned *gen_offsets,
1762 enum mali_format *gen_formats,
1763 unsigned *gen_stride,
1764 unsigned idx,
1765 bool should_alloc,
1766 bool is_fragment)
1767 {
1768 gl_varying_slot loc = stage->varyings_loc[idx];
1769 enum mali_format format = stage->varyings[idx];
1770
1771 /* Override format to match linkage */
1772 if (!should_alloc && gen_formats[idx])
1773 format = gen_formats[idx];
1774
1775 if (has_point_coord(stage->point_sprite_mask, loc)) {
1776 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1777 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1778 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1779 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1780 } else if (loc == VARYING_SLOT_POS) {
1781 if (is_fragment)
1782 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1783 else
1784 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1785 } else if (loc == VARYING_SLOT_PSIZ) {
1786 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1787 } else if (loc == VARYING_SLOT_PNTC) {
1788 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1789 } else if (loc == VARYING_SLOT_FACE) {
1790 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1791 } else {
1792 pan_emit_general_varying(out, other, xfb, loc, format, present,
1793 quirks, gen_offsets, gen_formats, gen_stride,
1794 idx, should_alloc);
1795 }
1796 }
1797
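/* Emit an attribute buffer descriptor in "special" mode for a
 * hardware-generated input (point coordinate, front-facing, fragment
 * coordinate), provided that slot is present in the varying set */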
1798 static void
1799 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1800 unsigned present,
1801 enum pan_special_varying v,
1802 unsigned special)
1803 {
1804 if (present & (1 << v)) {
1805 unsigned idx = pan_varying_index(present, v);
1806
1807 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1808 cfg.special = special;
1809 cfg.type = 0;
1810 }
1811 }
1812 }
1813
1814 void
1815 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1816 unsigned vertex_count,
1817 struct mali_vertex_tiler_postfix *vertex_postfix,
1818 struct mali_vertex_tiler_postfix *tiler_postfix,
1819 union midgard_primitive_size *primitive_size)
1820 {
1821 /* Load the shaders */
1822 struct panfrost_context *ctx = batch->ctx;
1823 struct panfrost_device *dev = pan_device(ctx->base.screen);
1824 struct panfrost_shader_state *vs, *fs;
1825 size_t vs_size, fs_size;
1826
1827 /* Allocate the varying descriptor */
1828
1829 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1830 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1831 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1832 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1833
1834 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1835 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1836
1837 struct pipe_stream_output_info *so = &vs->stream_output;
1838 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1839
1840 /* Check whether each varying is linked by us. That is the case for
1841 * general-purpose, non-captured varyings; if so, link it here. If it is
1842 * not, use the provided stream-out information to determine the
1843 * offset, since it was already linked for us. */
1844
1845 unsigned gen_offsets[32];
1846 enum mali_format gen_formats[32];
1847 memset(gen_offsets, 0, sizeof(gen_offsets));
1848 memset(gen_formats, 0, sizeof(gen_formats));
1849
1850 unsigned gen_stride = 0;
1851 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1852 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1853
1854 unsigned streamout_offsets[32];
1855
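/* Precompute the current write offset into each bound stream-output
 * target, so each captured varying only has to add its own destination
 * offset on top */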
1856 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1857 streamout_offsets[i] = panfrost_streamout_offset(
1858 so->stride[i],
1859 ctx->streamout.offsets[i],
1860 ctx->streamout.targets[i]);
1861 }
1862
1863 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1864 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1865
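/* Emit ATTRIBUTE records for the vertex shader outputs first, allocating
 * general varying space as we go (should_alloc = true), then for the
 * fragment shader inputs, which reuse the offsets and formats linked above */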
1866 for (unsigned i = 0; i < vs->varying_count; i++) {
1867 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1868 ctx->streamout.num_targets, streamout_offsets,
1869 dev->quirks,
1870 gen_offsets, gen_formats, &gen_stride, i, true, false);
1871 }
1872
1873 for (unsigned i = 0; i < fs->varying_count; i++) {
1874 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1875 ctx->streamout.num_targets, streamout_offsets,
1876 dev->quirks,
1877 gen_offsets, gen_formats, &gen_stride, i, false, true);
1878 }
1879
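/* Allocate the attribute buffer descriptors: the general and special
 * varying buffers come first, followed by one descriptor per bound
 * stream-output target starting at xfb_base */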
1880 unsigned xfb_base = pan_xfb_base(present);
1881 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1882 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1883 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1884 struct mali_attribute_buffer_packed *varyings =
1885 (struct mali_attribute_buffer_packed *) T.cpu;
1886
1887 /* Emit the stream out buffers */
1888
1889 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1890 ctx->vertex_count);
1891
1892 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1893 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1894 so->stride[i],
1895 ctx->streamout.offsets[i],
1896 out_count,
1897 ctx->streamout.targets[i]);
1898 }
1899
1900 panfrost_emit_varyings(batch,
1901 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1902 gen_stride, vertex_count);
1903
1904 /* fp32 vec4 gl_Position */
1905 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1906 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1907 sizeof(float) * 4, vertex_count);
1908
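/* gl_PointSize: 2 bytes (presumably fp16) per vertex, when present */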
1909 if (present & (1 << PAN_VARY_PSIZ)) {
1910 primitive_size->pointer = panfrost_emit_varyings(batch,
1911 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1912 2, vertex_count);
1913 }
1914
1915 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1916 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1917 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
1918
1919 vertex_postfix->varyings = T.gpu;
1920 tiler_postfix->varyings = T.gpu;
1921
1922 vertex_postfix->varying_meta = trans.gpu;
1923 tiler_postfix->varying_meta = trans.gpu + vs_size;
1924 }
1925
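/* Build the vertex/tiler payloads (Bifrost or Midgard flavour) and queue
 * them: a vertex job, then a tiler job depending on it. Rasterizer discard
 * skips the tiler job; wallpaper blits are instead injected with predicted
 * job indices. */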
1926 void
1927 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1928 struct mali_vertex_tiler_prefix *vertex_prefix,
1929 struct mali_vertex_tiler_postfix *vertex_postfix,
1930 struct mali_vertex_tiler_prefix *tiler_prefix,
1931 struct mali_vertex_tiler_postfix *tiler_postfix,
1932 union midgard_primitive_size *primitive_size)
1933 {
1934 struct panfrost_context *ctx = batch->ctx;
1935 struct panfrost_device *device = pan_device(ctx->base.screen);
1936 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
1937 struct bifrost_payload_vertex bifrost_vertex = {0,};
1938 struct bifrost_payload_tiler bifrost_tiler = {0,};
1939 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1940 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1941 void *vp, *tp;
1942 size_t vp_size, tp_size;
1943
1944 if (device->quirks & IS_BIFROST) {
1945 bifrost_vertex.prefix = *vertex_prefix;
1946 bifrost_vertex.postfix = *vertex_postfix;
1947 vp = &bifrost_vertex;
1948 vp_size = sizeof(bifrost_vertex);
1949
1950 bifrost_tiler.prefix = *tiler_prefix;
1951 bifrost_tiler.tiler.primitive_size = *primitive_size;
1952 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1953 bifrost_tiler.postfix = *tiler_postfix;
1954 tp = &bifrost_tiler;
1955 tp_size = sizeof(bifrost_tiler);
1956 } else {
1957 midgard_vertex.prefix = *vertex_prefix;
1958 midgard_vertex.postfix = *vertex_postfix;
1959 vp = &midgard_vertex;
1960 vp_size = sizeof(midgard_vertex);
1961
1962 midgard_tiler.prefix = *tiler_prefix;
1963 midgard_tiler.postfix = *tiler_postfix;
1964 midgard_tiler.primitive_size = *primitive_size;
1965 tp = &midgard_tiler;
1966 tp_size = sizeof(midgard_tiler);
1967 }
1968
1969 if (wallpapering) {
1970 /* Inject in reverse order, with "predicted" job indices.
1971 * THIS IS A HACK XXX */
1972 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
1973 batch->scoreboard.job_index + 2, tp, tp_size, true);
1974 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1975 vp, vp_size, true);
1976 return;
1977 }
1978
1979 /* If rasterizer discard is enabled, only submit the vertex job */
1980
1981 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1982 vp, vp_size, false);
1983
1984 if (ctx->rasterizer->base.rasterizer_discard)
1985 return;
1986
1987 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
1988 false);
1989 }
1990
1991 /* TODO: stop hardcoding this */
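/* 48 (x, y) pairs of 16-bit values, uploaded verbatim; presumably sample
 * positions in 1/256th-of-a-pixel units */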
1992 mali_ptr
1993 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1994 {
1995 uint16_t locations[] = {
1996 128, 128,
1997 0, 256,
1998 0, 256,
1999 0, 256,
2000 0, 256,
2001 0, 256,
2002 0, 256,
2003 0, 256,
2004 0, 256,
2005 0, 256,
2006 0, 256,
2007 0, 256,
2008 0, 256,
2009 0, 256,
2010 0, 256,
2011 0, 256,
2012 0, 256,
2013 0, 256,
2014 0, 256,
2015 0, 256,
2016 0, 256,
2017 0, 256,
2018 0, 256,
2019 0, 256,
2020 0, 256,
2021 0, 256,
2022 0, 256,
2023 0, 256,
2024 0, 256,
2025 0, 256,
2026 0, 256,
2027 0, 256,
2028 128, 128,
2029 0, 0,
2030 0, 0,
2031 0, 0,
2032 0, 0,
2033 0, 0,
2034 0, 0,
2035 0, 0,
2036 0, 0,
2037 0, 0,
2038 0, 0,
2039 0, 0,
2040 0, 0,
2041 0, 0,
2042 0, 0,
2043 0, 0,
2044 };
2045
2046 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2047 }