panfrost: Don't call panfrost_vt_init for compute
[mesa.git] src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 mali_ptr
55 panfrost_vt_emit_shared_memory(struct panfrost_batch *batch)
56 {
57 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
58
59 struct mali_shared_memory shared = {
60 .shared_workgroup_count = ~0,
61 };
62
63 if (batch->stack_size) {
64 struct panfrost_bo *stack =
65 panfrost_batch_get_scratchpad(batch, batch->stack_size,
66 dev->thread_tls_alloc,
67 dev->core_count);
68
69 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
70 shared.scratchpad = stack->gpu;
71 }
72
73 return panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
74 }
75
76 void
77 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
78 struct mali_vertex_tiler_prefix *prefix,
79 union midgard_primitive_size *primitive_size)
80 {
81 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
82
83 if (!panfrost_writes_point_size(ctx)) {
84 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
85 rasterizer->base.point_size :
86 rasterizer->base.line_width;
87
88 primitive_size->constant = val;
89 }
90 }
91
92 void
93 panfrost_vt_init(struct panfrost_context *ctx,
94 enum pipe_shader_type stage,
95 struct mali_vertex_tiler_prefix *prefix,
96 struct mali_vertex_tiler_postfix *postfix)
97 {
98 struct panfrost_device *device = pan_device(ctx->base.screen);
99 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
100
101 if (!ctx->shader[stage])
102 return;
103
104 memset(prefix, 0, sizeof(*prefix));
105 memset(postfix, 0, sizeof(*postfix));
106
107 if (device->quirks & IS_BIFROST) {
108 postfix->gl_enables = 0x2;
109 postfix->shared_memory = panfrost_vt_emit_shared_memory(batch);
110 } else {
111 postfix->gl_enables = 0x6;
112 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
113 }
114
115 if (stage == PIPE_SHADER_FRAGMENT) {
116 if (ctx->occlusion_query) {
117 postfix->gl_enables |= MALI_OCCLUSION_QUERY;
118 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
119 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
120 PAN_BO_ACCESS_SHARED |
121 PAN_BO_ACCESS_RW |
122 PAN_BO_ACCESS_FRAGMENT);
123 }
124
125 postfix->gl_enables |= 0x7;
126 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
127 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
128 rast->front_ccw);
129 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
130 (rast->cull_face & PIPE_FACE_FRONT));
131 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
132 (rast->cull_face & PIPE_FACE_BACK));
133 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
134 rast->flatshade_first);
135 }
136 }
137
138 static unsigned
139 panfrost_translate_index_size(unsigned size)
140 {
141 switch (size) {
142 case 1:
143 return MALI_DRAW_INDEXED_UINT8;
144
145 case 2:
146 return MALI_DRAW_INDEXED_UINT16;
147
148 case 4:
149 return MALI_DRAW_INDEXED_UINT32;
150
151 default:
152 unreachable("Invalid index size");
153 }
154 }
155
156 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
157 * good for the duration of the draw (transient), though it may last longer. Also gets
158 * the bounds on the index buffer for the range accessed by the draw. We do
159 * these operations together because there are natural optimizations which
160 * require them to be together. */
161
162 static mali_ptr
163 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
164 const struct pipe_draw_info *info,
165 unsigned *min_index, unsigned *max_index)
166 {
167 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
168 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
169 off_t offset = info->start * info->index_size;
170 bool needs_indices = true;
171 mali_ptr out = 0;
172
173 if (info->max_index != ~0u) {
174 *min_index = info->min_index;
175 *max_index = info->max_index;
176 needs_indices = false;
177 }
178
179 if (!info->has_user_indices) {
180 /* Only resources can be directly mapped */
181 panfrost_batch_add_bo(batch, rsrc->bo,
182 PAN_BO_ACCESS_SHARED |
183 PAN_BO_ACCESS_READ |
184 PAN_BO_ACCESS_VERTEX_TILER);
185 out = rsrc->bo->gpu + offset;
186
187 /* Check the cache */
188 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
189 info->start,
190 info->count,
191 min_index,
192 max_index);
193 } else {
194 /* Otherwise, we need to upload to transient memory */
195 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
196 struct panfrost_transfer T =
197 panfrost_pool_alloc_aligned(&batch->pool,
198 info->count * info->index_size,
199 info->index_size);
200
201 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
202 out = T.gpu;
203 }
204
205 if (needs_indices) {
206 /* Fallback */
207 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
208
209 if (!info->has_user_indices)
210 panfrost_minmax_cache_add(rsrc->index_cache,
211 info->start, info->count,
212 *min_index, *max_index);
213 }
214
215 return out;
216 }
217
218 void
219 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
220 const struct pipe_draw_info *info,
221 enum mali_draw_mode draw_mode,
222 struct mali_vertex_tiler_postfix *vertex_postfix,
223 struct mali_vertex_tiler_prefix *tiler_prefix,
224 struct mali_vertex_tiler_postfix *tiler_postfix,
225 unsigned *vertex_count,
226 unsigned *padded_count)
227 {
228 tiler_prefix->draw_mode = draw_mode;
229
230 unsigned draw_flags = 0;
231
232 if (panfrost_writes_point_size(ctx))
233 draw_flags |= MALI_DRAW_VARYING_SIZE;
234
235 if (info->primitive_restart)
236 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
237
238 /* These don't make much sense */
239
240 draw_flags |= 0x3000;
241
242 if (info->index_size) {
243 unsigned min_index = 0, max_index = 0;
244
245 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
246 info,
247 &min_index,
248 &max_index);
249
250 /* Use the corresponding values */
251 *vertex_count = max_index - min_index + 1;
252 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
253 tiler_prefix->offset_bias_correction = -min_index;
254 tiler_prefix->index_count = MALI_POSITIVE(info->count);
255 draw_flags |= panfrost_translate_index_size(info->index_size);
256 } else {
257 tiler_prefix->indices = 0;
258 *vertex_count = ctx->vertex_count;
259 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
260 tiler_prefix->offset_bias_correction = 0;
261 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
262 }
263
264 tiler_prefix->unknown_draw = draw_flags;
265 ctx->offset_start = vertex_postfix->offset_start;
266
267 /* Encode the padded vertex count */
268
269 if (info->instance_count > 1) {
270 *padded_count = panfrost_padded_vertex_count(*vertex_count);
271
272 unsigned shift = __builtin_ctz(ctx->padded_count);
273 unsigned k = ctx->padded_count >> (shift + 1);
274
275 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
276 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
277 } else {
278 *padded_count = *vertex_count;
279
280 /* Reset instancing state */
281 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
282 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
283 }
284 }
285
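/* Editor's note: a minimal, self-contained sketch (not part of the upstream
 * file) of the instance_shift/instance_odd encoding produced above. The
 * hardware consumes the padded vertex count decomposed as
 * (2 * odd + 1) << shift, where shift = ctz(padded) and odd = padded >>
 * (shift + 1). The helper name below is hypothetical. For example,
 * padded_count = 24 gives shift = 3, odd = 1, and (2 * 1 + 1) << 3 == 24. */

static inline unsigned
example_decode_padded_count(unsigned shift, unsigned odd)
{
        /* Inverse of the shift/odd split computed in panfrost_vt_set_draw_info */
        return ((2 * odd) + 1) << shift;
}
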
286 static unsigned
287 translate_tex_wrap(enum pipe_tex_wrap w)
288 {
289 switch (w) {
290 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
291 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
292 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
293 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
294 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
295 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
296 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
297 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
298 default: unreachable("Invalid wrap");
299 }
300 }
301
302 /* The hardware compares in the wrong order, so we have to flip before
303 * encoding. Yes, really. */
304
305 static enum mali_func
306 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
307 {
308 if (!cso->compare_mode)
309 return MALI_FUNC_NEVER;
310
311 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
312 return panfrost_flip_compare_func(f);
313 }
314
315 static enum mali_mipmap_mode
316 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
317 {
318 switch (f) {
319 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
320 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
321 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
322 default: unreachable("Invalid");
323 }
324 }
325
326 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
327 struct mali_midgard_sampler_packed *hw)
328 {
329 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
330 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
331 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
332 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
333 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
334 cfg.normalized_coordinates = cso->normalized_coords;
335
336 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
337
338 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
339
340 /* If necessary, we disable mipmapping in the sampler descriptor by
341 * clamping the LOD as tightly as possible (from 0 to epsilon,
342 * essentially -- remember these are fixed point numbers, so
343 * epsilon=1/256) */
344
345 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
346 cfg.minimum_lod + 1 :
347 FIXED_16(cso->max_lod, false);
348
349 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
350 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
351 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
352
353 cfg.compare_function = panfrost_sampler_compare_func(cso);
354 cfg.seamless_cube_map = cso->seamless_cube_map;
355
356 cfg.border_color_r = cso->border_color.f[0];
357 cfg.border_color_g = cso->border_color.f[1];
358 cfg.border_color_b = cso->border_color.f[2];
359 cfg.border_color_a = cso->border_color.f[3];
360 }
361 }
362
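/* Editor's note: an illustrative sketch, not part of the upstream file. It
 * assumes the 16-bit LOD fields use 8 fractional bits, which is what the
 * epsilon = 1/256 remark above implies; the helper name is hypothetical and
 * ignores clamping and rounding. With it, min_lod = 0.0 packs to 0x0000 and
 * the "mipmapping disabled" case clamps maximum_lod to 0x0001, i.e. 1/256. */

static inline uint16_t
example_lod_to_fixed_8_8(float lod)
{
        return (uint16_t) (lod * 256.0f);
}
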
363 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
364 struct mali_bifrost_sampler_packed *hw)
365 {
366 pan_pack(hw, BIFROST_SAMPLER, cfg) {
367 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
368 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
369 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
370 cfg.normalized_coordinates = cso->normalized_coords;
371
372 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
373 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
374 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
375
376 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
377 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
378 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
379
380 cfg.compare_function = panfrost_sampler_compare_func(cso);
381 cfg.seamless_cube_map = cso->seamless_cube_map;
382 }
383 }
384
385 static bool
386 panfrost_fs_required(
387 struct panfrost_shader_state *fs,
388 struct panfrost_blend_final *blend,
389 unsigned rt_count)
390 {
391 /* If we generally have side effects */
392 if (fs->fs_sidefx)
393 return true;
394
395 /* If colour is written we need to execute */
396 for (unsigned i = 0; i < rt_count; ++i) {
397 if (!blend[i].no_colour)
398 return true;
399 }
400
401 /* If depth is written and not implied we need to execute.
402 * TODO: Predicate on Z/S writes being enabled */
403 return (fs->writes_depth || fs->writes_stencil);
404 }
405
406 static void
407 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
408 struct panfrost_blend_final *blend)
409 {
410 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
411 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
412 unsigned rt_count = batch->key.nr_cbufs;
413
414 struct bifrost_blend_rt *brts = rts;
415
416 /* Disable blending for depth-only */
417
418 if (rt_count == 0) {
419 if (dev->quirks & IS_BIFROST) {
420 memset(brts, 0, sizeof(*brts));
421 brts[0].unk2 = 0x3;
422 } else {
423 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
424 cfg.equation = 0xf0122122; /* Replace */
425 }
426 }
427 }
428
429 for (unsigned i = 0; i < rt_count; ++i) {
430 struct mali_blend_flags_packed flags = {};
431
432 pan_pack(&flags, BLEND_FLAGS, cfg) {
433 if (blend[i].no_colour) {
434 cfg.enable = false;
435 break;
436 }
437
438 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
439
440 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
441 cfg.load_destination = blend[i].load_dest;
442 cfg.dither_disable = !batch->ctx->blend->base.dither;
443
444 if (!(dev->quirks & IS_BIFROST))
445 cfg.midgard_blend_shader = blend[i].is_shader;
446 }
447
448 if (dev->quirks & IS_BIFROST) {
449 memset(brts + i, 0, sizeof(brts[i]));
450 brts[i].flags = flags.opaque[0];
451
452 if (blend[i].is_shader) {
453 /* The blend shader's address needs to have
454 * the same top 32 bits as the fragment shader's.
455 * TODO: Ensure that's always the case.
456 */
457 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
458 (fs->bo->gpu & (0xffffffffull << 32)));
459 brts[i].shader = blend[i].shader.gpu;
460 brts[i].unk2 = 0x0;
461 } else {
462 enum pipe_format format = batch->key.cbufs[i]->format;
463 const struct util_format_description *format_desc;
464 format_desc = util_format_description(format);
465
466 brts[i].equation = blend[i].equation.equation;
467
468 /* TODO: this is a bit more complicated */
469 brts[i].constant = blend[i].equation.constant;
470
471 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
472
473 /* 0x19 disables blending and forces REPLACE
474 * mode (equivalent to rgb_mode = alpha_mode =
475 * x122, colour mask = 0xF). 0x1a allows
476 * blending. */
477 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
478
479 brts[i].shader_type = fs->blend_types[i];
480 }
481 } else {
482 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
483 cfg.flags = flags;
484
485 if (blend[i].is_shader) {
486 cfg.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
487 } else {
488 cfg.equation = blend[i].equation.equation.opaque[0];
489 cfg.constant = blend[i].equation.constant;
490 }
491 }
492
493 rts += MALI_MIDGARD_BLEND_LENGTH;
494 }
495 }
496 }
497
498 static void
499 panfrost_emit_frag_shader(struct panfrost_context *ctx,
500 struct mali_state_packed *fragmeta,
501 struct panfrost_blend_final *blend)
502 {
503 const struct panfrost_device *dev = pan_device(ctx->base.screen);
504 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
505 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
506 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
507 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
508 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
509
510 /* Built up here */
511 struct mali_shader_packed shader = fs->shader;
512 struct mali_preload_packed preload = fs->preload;
513 uint32_t properties;
514 struct mali_multisample_misc_packed multisample_misc;
515 struct mali_stencil_mask_misc_packed stencil_mask_misc;
516 union midgard_blend sfbd_blend = { 0 };
517
518 if (!panfrost_fs_required(fs, blend, rt_count)) {
519 if (dev->quirks & IS_BIFROST) {
520 pan_pack(&shader, SHADER, cfg) {}
521
522 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
523 cfg.unknown = 0x950020; /* XXX */
524 cfg.early_z_enable = true;
525 }
526
527 preload.opaque[0] = 0;
528 } else {
529 pan_pack(&shader, SHADER, cfg) {
530 cfg.shader = 0x1;
531 }
532
533 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
534 cfg.work_register_count = 1;
535 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
536 cfg.early_z_enable = true;
537 }
538 }
539 } else if (dev->quirks & IS_BIFROST) {
540 bool no_blend = true;
541
542 for (unsigned i = 0; i < rt_count; ++i)
543 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
544
545 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
546 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
547 }
548
549 /* Combine with prepacked properties */
550 properties |= fs->properties.opaque[0];
551 } else {
552 /* Reasons to disable early-Z from a shader perspective */
553 bool late_z = fs->can_discard || fs->writes_global ||
554 fs->writes_depth || fs->writes_stencil;
555
556 /* If either depth or stencil is enabled, discard matters */
557 bool zs_enabled =
558 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
559 zsa->base.stencil[0].enabled;
560
561 bool has_blend_shader = false;
562
563 for (unsigned c = 0; c < rt_count; ++c)
564 has_blend_shader |= blend[c].is_shader;
565
566 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
567 /* TODO: Reduce this limit? */
568 if (has_blend_shader)
569 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
570 else
571 cfg.work_register_count = fs->work_reg_count;
572
573 cfg.early_z_enable = !(late_z || alpha_to_coverage);
574 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
575 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
576 }
577
578 properties |= fs->properties.opaque[0];
579 }
580
581 pan_pack(&multisample_misc, MULTISAMPLE_MISC, cfg) {
582 bool msaa = rast->multisample;
583 cfg.multisample_enable = msaa;
584 cfg.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF;
585
586 /* EXT_shader_framebuffer_fetch requires per-sample */
587 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
588 cfg.evaluate_per_sample = msaa && per_sample;
589
590 if (dev->quirks & MIDGARD_SFBD) {
591 cfg.sfbd_load_destination = blend[0].load_dest;
592 cfg.sfbd_blend_shader = blend[0].is_shader;
593 }
594
595 cfg.depth_function = zsa->base.depth.enabled ?
596 panfrost_translate_compare_func(zsa->base.depth.func) :
597 MALI_FUNC_ALWAYS;
598
599 cfg.depth_write_mask = zsa->base.depth.writemask;
600 cfg.near_discard = rast->depth_clip_near;
601 cfg.far_discard = rast->depth_clip_far;
602 cfg.unknown_2 = true;
603 }
604
605 pan_pack(&stencil_mask_misc, STENCIL_MASK_MISC, cfg) {
606 cfg.stencil_mask_front = zsa->stencil_mask_front;
607 cfg.stencil_mask_back = zsa->stencil_mask_back;
608 cfg.stencil_enable = zsa->base.stencil[0].enabled;
609 cfg.alpha_to_coverage = alpha_to_coverage;
610
611 if (dev->quirks & MIDGARD_SFBD) {
612 cfg.sfbd_write_enable = !blend[0].no_colour;
613 cfg.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
614 cfg.sfbd_dither_disable = !ctx->blend->base.dither;
615 }
616
617 cfg.unknown_1 = 0x7;
618 cfg.depth_range_1 = cfg.depth_range_2 = rast->offset_tri;
619 cfg.single_sampled_lines = !rast->multisample;
620 }
621
622 if (dev->quirks & MIDGARD_SFBD) {
623 if (blend[0].is_shader) {
624 sfbd_blend.shader = blend[0].shader.gpu |
625 blend[0].shader.first_tag;
626 } else {
627 sfbd_blend.equation = blend[0].equation.equation;
628 sfbd_blend.constant = blend[0].equation.constant;
629 }
630 } else if (!(dev->quirks & IS_BIFROST)) {
631 /* Bug where MRT-capable hw apparently reads the last blend
632 * shader from here instead of the usual location? */
633
634 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
635 if (!blend[rt].is_shader)
636 continue;
637
638 sfbd_blend.shader = blend[rt].shader.gpu |
639 blend[rt].shader.first_tag;
640 break;
641 }
642 }
643
644 pan_pack(fragmeta, STATE_OPAQUE, cfg) {
645 cfg.shader = fs->shader;
646 cfg.properties = properties;
647 cfg.depth_units = rast->offset_units * 2.0f;
648 cfg.depth_factor = rast->offset_scale;
649 cfg.multisample_misc = multisample_misc;
650 cfg.stencil_mask_misc = stencil_mask_misc;
651
652 cfg.stencil_front = zsa->stencil_front;
653 cfg.stencil_back = zsa->stencil_back;
654
655 /* Bottom bits for stencil ref, exactly one word */
656 bool back_enab = zsa->base.stencil[1].enabled;
657 cfg.stencil_front.opaque[0] |= ctx->stencil_ref.ref_value[0];
658 cfg.stencil_back.opaque[0] |= ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
659
660 if (dev->quirks & IS_BIFROST)
661 cfg.preload = preload;
662 else
663 memcpy(&cfg.sfbd_blend, &sfbd_blend, sizeof(sfbd_blend));
664 }
665 }
666
667 mali_ptr
668 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
669 {
670 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
671
672 panfrost_batch_add_bo(batch, ss->bo,
673 PAN_BO_ACCESS_PRIVATE |
674 PAN_BO_ACCESS_READ |
675 PAN_BO_ACCESS_VERTEX_TILER);
676
677 panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
678 PAN_BO_ACCESS_PRIVATE |
679 PAN_BO_ACCESS_READ |
680 PAN_BO_ACCESS_VERTEX_TILER);
681
682 return pan_resource(ss->upload.rsrc)->bo->gpu + ss->upload.offset;
683 }
684
685 mali_ptr
686 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
687 {
688 struct panfrost_context *ctx = batch->ctx;
689 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
690
691 /* Add the shader BO to the batch. */
692 panfrost_batch_add_bo(batch, ss->bo,
693 PAN_BO_ACCESS_PRIVATE |
694 PAN_BO_ACCESS_READ |
695 PAN_BO_ACCESS_FRAGMENT);
696
697 struct panfrost_device *dev = pan_device(ctx->base.screen);
698 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
699 struct panfrost_transfer xfer;
700 unsigned rt_size;
701
702 if (dev->quirks & MIDGARD_SFBD)
703 rt_size = 0;
704 else if (dev->quirks & IS_BIFROST)
705 rt_size = sizeof(struct bifrost_blend_rt);
706 else
707 rt_size = sizeof(struct midgard_blend_rt);
708
709 unsigned desc_size = MALI_STATE_LENGTH + rt_size * rt_count;
710 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, MALI_STATE_LENGTH);
711
712 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
713
714 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
715 blend[c] = panfrost_get_blend_for_context(ctx, c);
716
717 panfrost_emit_frag_shader(ctx, (struct mali_state_packed *) xfer.cpu, blend);
718
719 if (!(dev->quirks & MIDGARD_SFBD))
720 panfrost_emit_blend(batch, xfer.cpu + MALI_STATE_LENGTH, blend);
721 else
722 batch->draws |= PIPE_CLEAR_COLOR0;
723
724 return xfer.gpu;
725 }
726
727 mali_ptr
728 panfrost_emit_viewport(struct panfrost_batch *batch)
729 {
730 struct panfrost_context *ctx = batch->ctx;
731 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
732 const struct pipe_scissor_state *ss = &ctx->scissor;
733 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
734 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
735
736 /* Derive min/max from translate/scale. Note that since |x| >= 0 by
737 * definition, we have that -|x| <= |x| hence translate - |scale| <=
738 * translate + |scale|, so the ordering is correct here. */
739 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
740 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
741 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
742 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
743 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
744 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
745
746 /* Scissor to the intersection of the viewport and the scissor, clamped
747 * to the framebuffer */
748
749 unsigned minx = MIN2(fb->width, vp_minx);
750 unsigned maxx = MIN2(fb->width, vp_maxx);
751 unsigned miny = MIN2(fb->height, vp_miny);
752 unsigned maxy = MIN2(fb->height, vp_maxy);
753
754 if (ss && rast->scissor) {
755 minx = MAX2(ss->minx, minx);
756 miny = MAX2(ss->miny, miny);
757 maxx = MIN2(ss->maxx, maxx);
758 maxy = MIN2(ss->maxy, maxy);
759 }
760
761 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
762
763 pan_pack(T.cpu, VIEWPORT, cfg) {
764 cfg.scissor_minimum_x = minx;
765 cfg.scissor_minimum_y = miny;
766 cfg.scissor_maximum_x = maxx - 1;
767 cfg.scissor_maximum_y = maxy - 1;
768
769 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
770 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
771 }
772
773 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
774 return T.gpu;
775 }
776
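/* Editor's note: worked example for the derivation above, not part of the
 * upstream file. For a standard 640x480 GL viewport (no flip), Gallium sets
 * translate = { 320, 240, ... } and scale = { 320, 240, ... }, so
 * vp_minx = 320 - |320| = 0, vp_maxx = 320 + |320| = 640, vp_miny = 0 and
 * vp_maxy = 480, which are then clamped against the framebuffer and, if
 * enabled, the scissor rectangle. */
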
777 static mali_ptr
778 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
779 enum pipe_shader_type st,
780 struct panfrost_constant_buffer *buf,
781 unsigned index)
782 {
783 struct pipe_constant_buffer *cb = &buf->cb[index];
784 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
785
786 if (rsrc) {
787 panfrost_batch_add_bo(batch, rsrc->bo,
788 PAN_BO_ACCESS_SHARED |
789 PAN_BO_ACCESS_READ |
790 panfrost_bo_access_for_stage(st));
791
792 /* Alignment guaranteed by
793 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
794 return rsrc->bo->gpu + cb->buffer_offset;
795 } else if (cb->user_buffer) {
796 return panfrost_pool_upload_aligned(&batch->pool,
797 cb->user_buffer +
798 cb->buffer_offset,
799 cb->buffer_size, 16);
800 } else {
801 unreachable("No constant buffer");
802 }
803 }
804
805 struct sysval_uniform {
806 union {
807 float f[4];
808 int32_t i[4];
809 uint32_t u[4];
810 uint64_t du[2];
811 };
812 };
813
814 static void
815 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
816 struct sysval_uniform *uniform)
817 {
818 struct panfrost_context *ctx = batch->ctx;
819 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
820
821 uniform->f[0] = vp->scale[0];
822 uniform->f[1] = vp->scale[1];
823 uniform->f[2] = vp->scale[2];
824 }
825
826 static void
827 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
828 struct sysval_uniform *uniform)
829 {
830 struct panfrost_context *ctx = batch->ctx;
831 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
832
833 uniform->f[0] = vp->translate[0];
834 uniform->f[1] = vp->translate[1];
835 uniform->f[2] = vp->translate[2];
836 }
837
838 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
839 enum pipe_shader_type st,
840 unsigned int sysvalid,
841 struct sysval_uniform *uniform)
842 {
843 struct panfrost_context *ctx = batch->ctx;
844 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
845 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
846 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
847 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
848
849 assert(dim);
850 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
851
852 if (dim > 1)
853 uniform->i[1] = u_minify(tex->texture->height0,
854 tex->u.tex.first_level);
855
856 if (dim > 2)
857 uniform->i[2] = u_minify(tex->texture->depth0,
858 tex->u.tex.first_level);
859
860 if (is_array)
861 uniform->i[dim] = tex->texture->array_size;
862 }
863
864 static void
865 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
866 enum pipe_shader_type st,
867 unsigned ssbo_id,
868 struct sysval_uniform *uniform)
869 {
870 struct panfrost_context *ctx = batch->ctx;
871
872 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
873 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
874
875 /* Compute address */
876 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
877
878 panfrost_batch_add_bo(batch, bo,
879 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
880 panfrost_bo_access_for_stage(st));
881
882 /* Upload address and size as sysval */
883 uniform->du[0] = bo->gpu + sb.buffer_offset;
884 uniform->u[2] = sb.buffer_size;
885 }
886
887 static void
888 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
889 enum pipe_shader_type st,
890 unsigned samp_idx,
891 struct sysval_uniform *uniform)
892 {
893 struct panfrost_context *ctx = batch->ctx;
894 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
895
896 uniform->f[0] = sampl->min_lod;
897 uniform->f[1] = sampl->max_lod;
898 uniform->f[2] = sampl->lod_bias;
899
900 /* Even without any errata, Midgard represents "no mipmapping" as
901 * fixing the LOD with the clamps; keep behaviour consistent. cf.
902 * panfrost_create_sampler_state which also explains our choice of
903 * epsilon value (again to keep behaviour consistent) */
904
905 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
906 uniform->f[1] = uniform->f[0] + (1.0/256.0);
907 }
908
909 static void
910 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
911 struct sysval_uniform *uniform)
912 {
913 struct panfrost_context *ctx = batch->ctx;
914
915 uniform->u[0] = ctx->compute_grid->grid[0];
916 uniform->u[1] = ctx->compute_grid->grid[1];
917 uniform->u[2] = ctx->compute_grid->grid[2];
918 }
919
920 static void
921 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
922 struct panfrost_shader_state *ss,
923 enum pipe_shader_type st)
924 {
925 struct sysval_uniform *uniforms = (void *)buf;
926
927 for (unsigned i = 0; i < ss->sysval_count; ++i) {
928 int sysval = ss->sysval[i];
929
930 switch (PAN_SYSVAL_TYPE(sysval)) {
931 case PAN_SYSVAL_VIEWPORT_SCALE:
932 panfrost_upload_viewport_scale_sysval(batch,
933 &uniforms[i]);
934 break;
935 case PAN_SYSVAL_VIEWPORT_OFFSET:
936 panfrost_upload_viewport_offset_sysval(batch,
937 &uniforms[i]);
938 break;
939 case PAN_SYSVAL_TEXTURE_SIZE:
940 panfrost_upload_txs_sysval(batch, st,
941 PAN_SYSVAL_ID(sysval),
942 &uniforms[i]);
943 break;
944 case PAN_SYSVAL_SSBO:
945 panfrost_upload_ssbo_sysval(batch, st,
946 PAN_SYSVAL_ID(sysval),
947 &uniforms[i]);
948 break;
949 case PAN_SYSVAL_NUM_WORK_GROUPS:
950 panfrost_upload_num_work_groups_sysval(batch,
951 &uniforms[i]);
952 break;
953 case PAN_SYSVAL_SAMPLER:
954 panfrost_upload_sampler_sysval(batch, st,
955 PAN_SYSVAL_ID(sysval),
956 &uniforms[i]);
957 break;
958 default:
959 assert(0);
960 }
961 }
962 }
963
964 static const void *
965 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
966 unsigned index)
967 {
968 struct pipe_constant_buffer *cb = &buf->cb[index];
969 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
970
971 if (rsrc)
972 return rsrc->bo->cpu;
973 else if (cb->user_buffer)
974 return cb->user_buffer;
975 else
976 unreachable("No constant buffer");
977 }
978
979 mali_ptr
980 panfrost_emit_const_buf(struct panfrost_batch *batch,
981 enum pipe_shader_type stage,
982 mali_ptr *push_constants)
983 {
984 struct panfrost_context *ctx = batch->ctx;
985 struct panfrost_shader_variants *all = ctx->shader[stage];
986
987 if (!all)
988 return 0;
989
990 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
991
992 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
993
994 /* Uniforms are implicitly UBO #0 */
995 bool has_uniforms = buf->enabled_mask & (1 << 0);
996
997 /* Allocate room for the sysval and the uniforms */
998 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
999 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1000 size_t size = sys_size + uniform_size;
1001 struct panfrost_transfer transfer =
1002 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1003
1004 /* Upload sysvals requested by the shader */
1005 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1006
1007 /* Upload uniforms */
1008 if (has_uniforms && uniform_size) {
1009 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1010 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1011 }
1012
1013 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1014 * uploaded, so it's always included. The count is one past the highest
1015 * addressable UBO -- gaps are included. */
1016
1017 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
1018
1019 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1020 struct panfrost_transfer ubos =
1021 panfrost_pool_alloc_aligned(&batch->pool, sz,
1022 MALI_UNIFORM_BUFFER_LENGTH);
1023
1024 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1025
1026 /* Upload uniforms as a UBO */
1027
1028 if (size) {
1029 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1030 cfg.entries = DIV_ROUND_UP(size, 16);
1031 cfg.pointer = transfer.gpu;
1032 }
1033 } else {
1034 *ubo_ptr = 0;
1035 }
1036
1037 /* The rest are honest-to-goodness UBOs */
1038
1039 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1040 size_t usz = buf->cb[ubo].buffer_size;
1041 bool enabled = buf->enabled_mask & (1 << ubo);
1042 bool empty = usz == 0;
1043
1044 if (!enabled || empty) {
1045 ubo_ptr[ubo] = 0;
1046 continue;
1047 }
1048
1049 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1050 cfg.entries = DIV_ROUND_UP(usz, 16);
1051 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1052 stage, buf, ubo);
1053 }
1054 }
1055
1056 *push_constants = transfer.gpu;
1057
1058 buf->dirty_mask = 0;
1059 return ubos.gpu;
1060 }
1061
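/* Editor's note: a small self-contained sketch of the UBO count rule used in
 * panfrost_emit_const_buf, not part of the upstream file; the helper name is
 * hypothetical. The descriptor table covers every index up to the highest
 * enabled UBO (gaps included) and is never smaller than 1, so the implicit
 * uniform UBO #0 always fits. For enabled_mask = 0x9 (UBOs 0 and 3 enabled)
 * this yields 32 - clz(0x9) = 4 descriptors. */

static inline unsigned
example_ubo_table_size(uint32_t enabled_mask)
{
        return 32 - __builtin_clz(enabled_mask | 1);
}
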
1062 mali_ptr
1063 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1064 const struct pipe_grid_info *info)
1065 {
1066 struct panfrost_context *ctx = batch->ctx;
1067 struct panfrost_device *dev = pan_device(ctx->base.screen);
1068 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1069 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1070 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1071 128));
1072
1073 unsigned log2_instances =
1074 util_logbase2_ceil(info->grid[0]) +
1075 util_logbase2_ceil(info->grid[1]) +
1076 util_logbase2_ceil(info->grid[2]);
1077
1078 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1079 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1080 shared_size,
1081 1);
1082
1083 struct mali_shared_memory shared = {
1084 .shared_memory = bo->gpu,
1085 .shared_workgroup_count = log2_instances,
1086 .shared_shift = util_logbase2(single_size) + 1
1087 };
1088
1089 return panfrost_pool_upload_aligned(&batch->pool, &shared,
1090 sizeof(shared), 64);
1091 }
1092
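/* Editor's note: worked example for the sizing above, not part of the
 * upstream file. With ss->shared_size = 100 bytes, single_size rounds up to
 * 128. A grid of (8, 8, 1) gives log2_instances = 3 + 3 + 0 = 6, so the BO
 * holds 128 << 6 = 8192 bytes per core, multiplied by dev->core_count, and
 * shared_shift = log2(128) + 1 = 8. */
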
1093 static mali_ptr
1094 panfrost_get_tex_desc(struct panfrost_batch *batch,
1095 enum pipe_shader_type st,
1096 struct panfrost_sampler_view *view)
1097 {
1098 if (!view)
1099 return (mali_ptr) 0;
1100
1101 struct pipe_sampler_view *pview = &view->base;
1102 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1103
1104 /* Add the BO to the job so it's retained until the job is done. */
1105
1106 panfrost_batch_add_bo(batch, rsrc->bo,
1107 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1108 panfrost_bo_access_for_stage(st));
1109
1110 panfrost_batch_add_bo(batch, view->bo,
1111 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1112 panfrost_bo_access_for_stage(st));
1113
1114 return view->bo->gpu;
1115 }
1116
1117 static void
1118 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1119 struct pipe_context *pctx)
1120 {
1121 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1122 if (view->texture_bo != rsrc->bo->gpu ||
1123 view->modifier != rsrc->modifier) {
1124 panfrost_bo_unreference(view->bo);
1125 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1126 }
1127 }
1128
1129 mali_ptr
1130 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1131 enum pipe_shader_type stage)
1132 {
1133 struct panfrost_context *ctx = batch->ctx;
1134 struct panfrost_device *device = pan_device(ctx->base.screen);
1135
1136 if (!ctx->sampler_view_count[stage])
1137 return 0;
1138
1139 if (device->quirks & IS_BIFROST) {
1140 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1141 MALI_BIFROST_TEXTURE_LENGTH *
1142 ctx->sampler_view_count[stage],
1143 MALI_BIFROST_TEXTURE_LENGTH);
1144
1145 struct mali_bifrost_texture_packed *out =
1146 (struct mali_bifrost_texture_packed *) T.cpu;
1147
1148 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1149 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1150 struct pipe_sampler_view *pview = &view->base;
1151 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1152
1153 panfrost_update_sampler_view(view, &ctx->base);
1154 out[i] = view->bifrost_descriptor;
1155
1156 /* Add the BOs to the job so they are retained until the job is done. */
1157
1158 panfrost_batch_add_bo(batch, rsrc->bo,
1159 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1160 panfrost_bo_access_for_stage(stage));
1161
1162 panfrost_batch_add_bo(batch, view->bo,
1163 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1164 panfrost_bo_access_for_stage(stage));
1165 }
1166
1167 return T.gpu;
1168 } else {
1169 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1170
1171 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1172 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1173
1174 panfrost_update_sampler_view(view, &ctx->base);
1175
1176 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1177 }
1178
1179 return panfrost_pool_upload_aligned(&batch->pool, trampolines,
1180 sizeof(uint64_t) *
1181 ctx->sampler_view_count[stage],
1182 sizeof(uint64_t));
1183 }
1184 }
1185
1186 mali_ptr
1187 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1188 enum pipe_shader_type stage)
1189 {
1190 struct panfrost_context *ctx = batch->ctx;
1191
1192 if (!ctx->sampler_count[stage])
1193 return 0;
1194
1195 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1196 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1197
1198 size_t sz = desc_size * ctx->sampler_count[stage];
1199 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1200 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1201
1202 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1203 out[i] = ctx->samplers[stage][i]->hw;
1204
1205 return T.gpu;
1206 }
1207
1208 mali_ptr
1209 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1210 mali_ptr *buffers)
1211 {
1212 struct panfrost_context *ctx = batch->ctx;
1213 struct panfrost_vertex_state *so = ctx->vertex;
1214 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1215
1216 /* Worst case: everything is NPOT, which is only possible if instancing
1217 * is enabled. Otherwise a single record is guaranteed */
1218 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1219 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1220 (ctx->instance_count > 1 ? 2 : 1),
1221 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1222
1223 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1224 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1225 MALI_ATTRIBUTE_LENGTH);
1226
1227 struct mali_attribute_buffer_packed *bufs =
1228 (struct mali_attribute_buffer_packed *) S.cpu;
1229
1230 struct mali_attribute_packed *out =
1231 (struct mali_attribute_packed *) T.cpu;
1232
1233 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1234 unsigned k = 0;
1235
1236 for (unsigned i = 0; i < so->num_elements; ++i) {
1237 /* We map buffers 1:1 with the attributes, which
1238 * means duplicating some vertex buffers (who cares? aside from
1239 * maybe some caching implications but I somehow doubt that
1240 * matters) */
1241
1242 struct pipe_vertex_element *elem = &so->pipe[i];
1243 unsigned vbi = elem->vertex_buffer_index;
1244 attrib_to_buffer[i] = k;
1245
1246 if (!(ctx->vb_mask & (1 << vbi)))
1247 continue;
1248
1249 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1250 struct panfrost_resource *rsrc;
1251
1252 rsrc = pan_resource(buf->buffer.resource);
1253 if (!rsrc)
1254 continue;
1255
1256 /* Add a dependency of the batch on the vertex buffer */
1257 panfrost_batch_add_bo(batch, rsrc->bo,
1258 PAN_BO_ACCESS_SHARED |
1259 PAN_BO_ACCESS_READ |
1260 PAN_BO_ACCESS_VERTEX_TILER);
1261
1262 /* Mask off lower bits, see offset fixup below */
1263 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1264 mali_ptr addr = raw_addr & ~63;
1265
1266 /* Since we advanced the base pointer, we shrink the buffer
1267 * size, but add the offset we subtracted */
1268 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1269 - buf->buffer_offset;
1270
1271 /* When there is a divisor, the hardware-level divisor is
1272 * the product of the instance divisor and the padded count */
1273 unsigned divisor = elem->instance_divisor;
1274 unsigned hw_divisor = ctx->padded_count * divisor;
1275 unsigned stride = buf->stride;
1276
1277 /* If there's a divisor(=1) but no instancing, we want every
1278 * attribute to be the same */
1279
1280 if (divisor && ctx->instance_count == 1)
1281 stride = 0;
1282
1283 if (!divisor || ctx->instance_count <= 1) {
1284 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1285 if (ctx->instance_count > 1) {
1286 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1287 cfg.divisor = ctx->padded_count;
1288 }
1289
1290 cfg.pointer = addr;
1291 cfg.stride = stride;
1292 cfg.size = size;
1293 }
1294 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1295 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1296 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1297 cfg.pointer = addr;
1298 cfg.stride = stride;
1299 cfg.size = size;
1300 cfg.divisor_r = __builtin_ctz(hw_divisor);
1301 }
1302
1303 } else {
1304 unsigned shift = 0, extra_flags = 0;
1305
1306 unsigned magic_divisor =
1307 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1308
1309 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1310 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1311 cfg.pointer = addr;
1312 cfg.stride = stride;
1313 cfg.size = size;
1314
1315 cfg.divisor_r = shift;
1316 cfg.divisor_e = extra_flags;
1317 }
1318
1319 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1320 cfg.divisor_numerator = magic_divisor;
1321 cfg.divisor = divisor;
1322 }
1323
1324 ++k;
1325 }
1326
1327 ++k;
1328 }
1329
1330 /* Add special gl_VertexID/gl_InstanceID buffers */
1331
1332 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1333 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1334
1335 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1336 cfg.buffer_index = k++;
1337 cfg.format = so->formats[PAN_VERTEX_ID];
1338 }
1339
1340 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1341
1342 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1343 cfg.buffer_index = k++;
1344 cfg.format = so->formats[PAN_INSTANCE_ID];
1345 }
1346 }
1347
1348 /* Attribute addresses require 64-byte alignment, so let:
1349 *
1350 * base' = base & ~63 = base - (base & 63)
1351 * offset' = offset + (base & 63)
1352 *
1353 * Since base' + offset' = base + offset, these are equivalent
1354 * addressing modes and now base' is 64-byte aligned.
1355 */
1356
1357 for (unsigned i = 0; i < so->num_elements; ++i) {
1358 unsigned vbi = so->pipe[i].vertex_buffer_index;
1359 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1360
1361 /* Adjust by the masked off bits of the offset. Make sure we
1362 * read src_offset from so->hw (which is not GPU visible)
1363 * rather than target (which is) due to caching effects */
1364
1365 unsigned src_offset = so->pipe[i].src_offset;
1366
1367 /* BOs aligned to 4k so guaranteed aligned to 64 */
1368 src_offset += (buf->buffer_offset & 63);
1369
1370 /* Also, somewhat obscurely, per-instance data needs to be
1371 * offset in response to a delayed start in an indexed draw */
1372
1373 if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
1374 src_offset -= buf->stride * ctx->offset_start;
1375
1376 pan_pack(out + i, ATTRIBUTE, cfg) {
1377 cfg.buffer_index = attrib_to_buffer[i];
1378 cfg.format = so->formats[i];
1379 cfg.offset = src_offset;
1380 }
1381 }
1382
1383 *buffers = S.gpu;
1384 return T.gpu;
1385 }
1386
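/* Editor's note: worked example of the 64-byte base fixup described above,
 * not part of the upstream file. Suppose bo->gpu = 0x10000000 (BOs are 4k
 * aligned) and buf->buffer_offset = 37. Then raw_addr = 0x10000025 and
 * addr = raw_addr & ~63 = 0x10000000; the 37 masked-off bytes are added back
 * to the buffer size computation and to each element's src_offset, so
 * base' + offset' still addresses exactly the same data. */
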
1387 static mali_ptr
1388 panfrost_emit_varyings(struct panfrost_batch *batch,
1389 struct mali_attribute_buffer_packed *slot,
1390 unsigned stride, unsigned count)
1391 {
1392 unsigned size = stride * count;
1393 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1394
1395 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1396 cfg.stride = stride;
1397 cfg.size = size;
1398 cfg.pointer = ptr;
1399 }
1400
1401 return ptr;
1402 }
1403
1404 static unsigned
1405 panfrost_streamout_offset(unsigned stride, unsigned offset,
1406 struct pipe_stream_output_target *target)
1407 {
1408 return (target->buffer_offset + (offset * stride * 4)) & 63;
1409 }
1410
1411 static void
1412 panfrost_emit_streamout(struct panfrost_batch *batch,
1413 struct mali_attribute_buffer_packed *slot,
1414 unsigned stride_words, unsigned offset, unsigned count,
1415 struct pipe_stream_output_target *target)
1416 {
1417 unsigned stride = stride_words * 4;
1418 unsigned max_size = target->buffer_size;
1419 unsigned expected_size = stride * count;
1420
1421 /* Grab the BO and bind it to the batch */
1422 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1423
1424 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1425 * the perspective of the TILER and FRAGMENT.
1426 */
1427 panfrost_batch_add_bo(batch, bo,
1428 PAN_BO_ACCESS_SHARED |
1429 PAN_BO_ACCESS_RW |
1430 PAN_BO_ACCESS_VERTEX_TILER |
1431 PAN_BO_ACCESS_FRAGMENT);
1432
1433 /* We will have an offset applied to get alignment */
1434 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1435
1436 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1437 cfg.pointer = (addr & ~63);
1438 cfg.stride = stride;
1439 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1440 }
1441 }
1442
1443 static bool
1444 has_point_coord(unsigned mask, gl_varying_slot loc)
1445 {
1446 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1447 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1448 else if (loc == VARYING_SLOT_PNTC)
1449 return (mask & (1 << 8));
1450 else
1451 return false;
1452 }
1453
1454 /* Helpers for manipulating stream out information so we can pack varyings
1455 * accordingly. Compute the src_offset for a given captured varying */
1456
1457 static struct pipe_stream_output *
1458 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1459 {
1460 for (unsigned i = 0; i < info->num_outputs; ++i) {
1461 if (info->output[i].register_index == loc)
1462 return &info->output[i];
1463 }
1464
1465 unreachable("Varying not captured");
1466 }
1467
1468 static unsigned
1469 pan_varying_size(enum mali_format fmt)
1470 {
1471 unsigned type = MALI_EXTRACT_TYPE(fmt);
1472 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1473 unsigned bits = MALI_EXTRACT_BITS(fmt);
1474 unsigned bpc = 0;
1475
1476 if (bits == MALI_CHANNEL_FLOAT) {
1477 /* No doubles */
1478 bool fp16 = (type == MALI_FORMAT_SINT);
1479 assert(fp16 || (type == MALI_FORMAT_UNORM));
1480
1481 bpc = fp16 ? 2 : 4;
1482 } else {
1483 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1484
1485 /* See the enums */
1486 bits = 1 << bits;
1487 assert(bits >= 8);
1488 bpc = bits / 8;
1489 }
1490
1491 return bpc * chan;
1492 }
1493
1494 /* Indices for named (non-XFB) varyings that are present. These are packed
1495 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1496 * PAN_VARY_*). This has the nice property that you can look up the buffer index
1497 * of a given special field given a shift S by:
1498 *
1499 * idx = popcount(P & ((1 << S) - 1))
1500 *
1501 * That is... look at all of the varyings that come earlier and count them;
1502 * that count is the index of the one in question. Likewise, the total number of special
1503 * buffers required is simply popcount(P)
1504 */
1505
1506 enum pan_special_varying {
1507 PAN_VARY_GENERAL = 0,
1508 PAN_VARY_POSITION = 1,
1509 PAN_VARY_PSIZ = 2,
1510 PAN_VARY_PNTCOORD = 3,
1511 PAN_VARY_FACE = 4,
1512 PAN_VARY_FRAGCOORD = 5,
1513
1514 /* Keep last */
1515 PAN_VARY_MAX,
1516 };
1517
1518 /* Given a varying, figure out which index it corresponds to */
1519
1520 static inline unsigned
1521 pan_varying_index(unsigned present, enum pan_special_varying v)
1522 {
1523 unsigned mask = (1 << v) - 1;
1524 return util_bitcount(present & mask);
1525 }
1526
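/* Editor's note: worked example for pan_varying_index, not part of the
 * upstream file. With present = (1 << PAN_VARY_GENERAL) |
 * (1 << PAN_VARY_POSITION) | (1 << PAN_VARY_PNTCOORD) = 0b1011, the buffer
 * index of PAN_VARY_PNTCOORD is popcount(0b1011 & 0b0111) = 2, i.e. it comes
 * right after the general and position buffers, and pan_xfb_base(present) =
 * popcount(0b1011) = 3, so XFB buffers start at index 3. */
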
1527 /* Get the base offset for XFB buffers, which by convention come after
1528 * everything else. Wrapper function for semantic reasons; by construction this
1529 * is just popcount. */
1530
1531 static inline unsigned
1532 pan_xfb_base(unsigned present)
1533 {
1534 return util_bitcount(present);
1535 }
1536
1537 /* Computes the present mask for varyings so we can start emitting varying records */
1538
1539 static inline unsigned
1540 pan_varying_present(
1541 struct panfrost_shader_state *vs,
1542 struct panfrost_shader_state *fs,
1543 unsigned quirks)
1544 {
1545 /* At the moment we always emit general and position buffers. Not
1546 * strictly necessary but usually harmless */
1547
1548 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1549
1550 /* Enable special buffers by the shader info */
1551
1552 if (vs->writes_point_size)
1553 present |= (1 << PAN_VARY_PSIZ);
1554
1555 if (fs->reads_point_coord)
1556 present |= (1 << PAN_VARY_PNTCOORD);
1557
1558 if (fs->reads_face)
1559 present |= (1 << PAN_VARY_FACE);
1560
1561 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1562 present |= (1 << PAN_VARY_FRAGCOORD);
1563
1564 /* Also, if we have a point sprite, we need a point coord buffer */
1565
1566 for (unsigned i = 0; i < fs->varying_count; i++) {
1567 gl_varying_slot loc = fs->varyings_loc[i];
1568
1569 if (has_point_coord(fs->point_sprite_mask, loc))
1570 present |= (1 << PAN_VARY_PNTCOORD);
1571 }
1572
1573 return present;
1574 }
1575
1576 /* Emitters for varying records */
1577
1578 static void
1579 pan_emit_vary(struct mali_attribute_packed *out,
1580 unsigned present, enum pan_special_varying buf,
1581 unsigned quirks, enum mali_format format,
1582 unsigned offset)
1583 {
1584 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1585 unsigned swizzle = quirks & HAS_SWIZZLES ?
1586 panfrost_get_default_swizzle(nr_channels) :
1587 panfrost_bifrost_swizzle(nr_channels);
1588
1589 pan_pack(out, ATTRIBUTE, cfg) {
1590 cfg.buffer_index = pan_varying_index(present, buf);
1591 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1592 cfg.format = (format << 12) | swizzle;
1593 cfg.offset = offset;
1594 }
1595 }
1596
1597 /* General varying that is unused */
1598
1599 static void
1600 pan_emit_vary_only(struct mali_attribute_packed *out,
1601 unsigned present, unsigned quirks)
1602 {
1603 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1604 }
1605
1606 /* Special records */
1607
1608 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1609 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1610 [PAN_VARY_PSIZ] = MALI_R16F,
1611 [PAN_VARY_PNTCOORD] = MALI_R16F,
1612 [PAN_VARY_FACE] = MALI_R32I,
1613 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1614 };
1615
1616 static void
1617 pan_emit_vary_special(struct mali_attribute_packed *out,
1618 unsigned present, enum pan_special_varying buf,
1619 unsigned quirks)
1620 {
1621 assert(buf < PAN_VARY_MAX);
1622 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1623 }
1624
1625 static enum mali_format
1626 pan_xfb_format(enum mali_format format, unsigned nr)
1627 {
1628 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1629 return MALI_R32F | MALI_NR_CHANNELS(nr);
1630 else
1631 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1632 }
1633
1634 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1635 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1636 * value. */
1637
1638 static void
1639 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1640 unsigned present,
1641 unsigned max_xfb,
1642 unsigned *streamout_offsets,
1643 unsigned quirks,
1644 enum mali_format format,
1645 struct pipe_stream_output o)
1646 {
1647 unsigned swizzle = quirks & HAS_SWIZZLES ?
1648 panfrost_get_default_swizzle(o.num_components) :
1649 panfrost_bifrost_swizzle(o.num_components);
1650
1651 pan_pack(out, ATTRIBUTE, cfg) {
1652 /* XFB buffers come after everything else */
1653 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1654 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1655
1656 /* Override number of channels and precision to highp */
1657 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1658
1659 /* Apply given offsets together */
1660 cfg.offset = (o.dst_offset * 4) /* dwords */
1661 + streamout_offsets[o.output_buffer];
1662 }
1663 }
1664
1665 /* Determine if we should capture a varying for XFB. This requires actually
1666 * having a buffer for it. If we don't capture it, we'll fall back to a general
1667 * varying path (linked or unlinked, possibly discarding the write) */
1668
1669 static bool
1670 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1671 unsigned loc, unsigned max_xfb)
1672 {
1673 if (!(xfb->so_mask & (1ll << loc)))
1674 return false;
1675
1676 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1677 return o->output_buffer < max_xfb;
1678 }
1679
1680 static void
1681 pan_emit_general_varying(struct mali_attribute_packed *out,
1682 struct panfrost_shader_state *other,
1683 struct panfrost_shader_state *xfb,
1684 gl_varying_slot loc,
1685 enum mali_format format,
1686 unsigned present,
1687 unsigned quirks,
1688 unsigned *gen_offsets,
1689 enum mali_format *gen_formats,
1690 unsigned *gen_stride,
1691 unsigned idx,
1692 bool should_alloc)
1693 {
1694 /* Check if we're linked */
1695 signed other_idx = -1;
1696
1697 for (unsigned j = 0; j < other->varying_count; ++j) {
1698 if (other->varyings_loc[j] == loc) {
1699 other_idx = j;
1700 break;
1701 }
1702 }
1703
1704 if (other_idx < 0) {
1705 pan_emit_vary_only(out, present, quirks);
1706 return;
1707 }
1708
1709 unsigned offset = gen_offsets[other_idx];
1710
1711 if (should_alloc) {
1712 /* We're linked, so allocate space via a watermark allocation */
1713 enum mali_format alt = other->varyings[other_idx];
1714
1715 /* Do interpolation at minimum precision */
1716 unsigned size_main = pan_varying_size(format);
1717 unsigned size_alt = pan_varying_size(alt);
1718 unsigned size = MIN2(size_main, size_alt);
1719
1720 /* If a varying is marked for XFB but not actually captured, we
1721 * should match the format to the format that would otherwise
1722 * be used for XFB, since dEQP checks for invariance here. It's
1723 * unclear if this is required by the spec. */
1724
1725 if (xfb->so_mask & (1ull << loc)) {
1726 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1727 format = pan_xfb_format(format, o->num_components);
1728 size = pan_varying_size(format);
1729 } else if (size == size_alt) {
1730 format = alt;
1731 }
1732
1733 gen_offsets[idx] = *gen_stride;
1734 gen_formats[other_idx] = format;
1735 offset = *gen_stride;
1736 *gen_stride += size;
1737 }
1738
1739 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1740 }
1741
1742 /* Higher-level wrapper around all of the above, classifying a varying into one
1743 * of the above types */
1744
1745 static void
1746 panfrost_emit_varying(
1747 struct mali_attribute_packed *out,
1748 struct panfrost_shader_state *stage,
1749 struct panfrost_shader_state *other,
1750 struct panfrost_shader_state *xfb,
1751 unsigned present,
1752 unsigned max_xfb,
1753 unsigned *streamout_offsets,
1754 unsigned quirks,
1755 unsigned *gen_offsets,
1756 enum mali_format *gen_formats,
1757 unsigned *gen_stride,
1758 unsigned idx,
1759 bool should_alloc,
1760 bool is_fragment)
1761 {
1762 gl_varying_slot loc = stage->varyings_loc[idx];
1763 enum mali_format format = stage->varyings[idx];
1764
1765 /* Override format to match linkage */
1766 if (!should_alloc && gen_formats[idx])
1767 format = gen_formats[idx];
1768
1769 if (has_point_coord(stage->point_sprite_mask, loc)) {
1770 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1771 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1772 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1773 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1774 } else if (loc == VARYING_SLOT_POS) {
1775 if (is_fragment)
1776 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1777 else
1778 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1779 } else if (loc == VARYING_SLOT_PSIZ) {
1780 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1781 } else if (loc == VARYING_SLOT_PNTC) {
1782 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1783 } else if (loc == VARYING_SLOT_FACE) {
1784 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1785 } else {
1786 pan_emit_general_varying(out, other, xfb, loc, format, present,
1787 quirks, gen_offsets, gen_formats, gen_stride,
1788 idx, should_alloc);
1789 }
1790 }
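/* The chain above encodes the classification precedence: point-sprite
 * overrides first, then captured XFB outputs, then the special slots
 * (gl_Position, gl_PointSize, gl_PointCoord, gl_FrontFacing), and finally
 * the general linked/unlinked path. */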
1791
1792 static void
1793 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1794 unsigned present,
1795 enum pan_special_varying v,
1796 unsigned special)
1797 {
1798 if (present & (1 << v)) {
1799 unsigned idx = pan_varying_index(present, v);
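/* pan_varying_index() presumably compacts the "present" bitfield, so idx is
 * the same attribute-buffer slot that the varying records emitted earlier
 * reference for this special varying. */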
1800
1801 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1802 cfg.special = special;
1803 cfg.type = 0;
1804 }
1805 }
1806 }
1807
1808 void
1809 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1810 unsigned vertex_count,
1811 struct mali_vertex_tiler_postfix *vertex_postfix,
1812 struct mali_vertex_tiler_postfix *tiler_postfix,
1813 union midgard_primitive_size *primitive_size)
1814 {
1815 /* Load the shaders */
1816 struct panfrost_context *ctx = batch->ctx;
1817 struct panfrost_device *dev = pan_device(ctx->base.screen);
1818 struct panfrost_shader_state *vs, *fs;
1819 size_t vs_size, fs_size;
1820
1821 /* Allocate the varying descriptor */
1822
1823 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1824 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1825 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1826 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1827
1828 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1829 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1830
1831 struct pipe_stream_output_info *so = &vs->stream_output;
1832 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1833
1834 /* Check if this varying is linked by us. This is the case for
1835 * general-purpose, non-captured varyings. If it is, link it. If it's
1836 * not, use the provided stream out information to determine the
1837 * offset, since it was already linked for us. */
1838
1839 unsigned gen_offsets[32];
1840 enum mali_format gen_formats[32];
1841 memset(gen_offsets, 0, sizeof(gen_offsets));
1842 memset(gen_formats, 0, sizeof(gen_formats));
1843
1844 unsigned gen_stride = 0;
1845 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1846 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1847
1848 unsigned streamout_offsets[32];
1849
1850 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1851 streamout_offsets[i] = panfrost_streamout_offset(
1852 so->stride[i],
1853 ctx->streamout.offsets[i],
1854 ctx->streamout.targets[i]);
1855 }
1856
1857 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1858 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1859
1860 for (unsigned i = 0; i < vs->varying_count; i++) {
1861 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1862 ctx->streamout.num_targets, streamout_offsets,
1863 dev->quirks,
1864 gen_offsets, gen_formats, &gen_stride, i, true, false);
1865 }
1866
1867 for (unsigned i = 0; i < fs->varying_count; i++) {
1868 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1869 ctx->streamout.num_targets, streamout_offsets,
1870 dev->quirks,
1871 gen_offsets, gen_formats, &gen_stride, i, false, true);
1872 }
1873
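/* Allocate the attribute buffer descriptors: one record for each internal
 * varying buffer flagged in "present", then one record per bound streamout
 * target (the XFB buffers come last, starting at xfb_base). */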
1874 unsigned xfb_base = pan_xfb_base(present);
1875 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1876 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1877 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1878 struct mali_attribute_buffer_packed *varyings =
1879 (struct mali_attribute_buffer_packed *) T.cpu;
1880
1881 /* Emit the stream out buffers */
1882
1883 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1884 ctx->vertex_count);
1885
1886 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1887 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1888 so->stride[i],
1889 ctx->streamout.offsets[i],
1890 out_count,
1891 ctx->streamout.targets[i]);
1892 }
1893
1894 panfrost_emit_varyings(batch,
1895 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1896 gen_stride, vertex_count);
1897
1898 /* fp32 vec4 gl_Position */
1899 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1900 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1901 sizeof(float) * 4, vertex_count);
1902
1903 if (present & (1 << PAN_VARY_PSIZ)) {
1904 primitive_size->pointer = panfrost_emit_varyings(batch,
1905 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1906 2, vertex_count);
1907 }
1908
1909 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1910 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1911 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
1912
1913 vertex_postfix->varyings = T.gpu;
1914 tiler_postfix->varyings = T.gpu;
1915
1916 vertex_postfix->varying_meta = trans.gpu;
1917 tiler_postfix->varying_meta = trans.gpu + vs_size;
1918 }
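/* Rough sketch of what the emission above produced (layout only, not an
 * authoritative ABI description):
 *
 *   trans: [ vs->varying_count ATTRIBUTE records ][ fs->varying_count records ]
 *           ^ vertex varying_meta                  ^ tiler varying_meta
 *
 *   T:     [ internal buffers for "present" varyings ][ one buffer per XFB target ]
 *           ^ vertex/tiler varyings                     (starting at xfb_base)
 */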
1919
1920 void
1921 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1922 struct mali_vertex_tiler_prefix *vertex_prefix,
1923 struct mali_vertex_tiler_postfix *vertex_postfix,
1924 struct mali_vertex_tiler_prefix *tiler_prefix,
1925 struct mali_vertex_tiler_postfix *tiler_postfix,
1926 union midgard_primitive_size *primitive_size)
1927 {
1928 struct panfrost_context *ctx = batch->ctx;
1929 struct panfrost_device *device = pan_device(ctx->base.screen);
1930 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
1931 struct bifrost_payload_vertex bifrost_vertex = {0,};
1932 struct bifrost_payload_tiler bifrost_tiler = {0,};
1933 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1934 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1935 void *vp, *tp;
1936 size_t vp_size, tp_size;
1937
1938 if (device->quirks & IS_BIFROST) {
1939 bifrost_vertex.prefix = *vertex_prefix;
1940 bifrost_vertex.postfix = *vertex_postfix;
1941 vp = &bifrost_vertex;
1942 vp_size = sizeof(bifrost_vertex);
1943
1944 bifrost_tiler.prefix = *tiler_prefix;
1945 bifrost_tiler.tiler.primitive_size = *primitive_size;
1946 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1947 bifrost_tiler.postfix = *tiler_postfix;
1948 tp = &bifrost_tiler;
1949 tp_size = sizeof(bifrost_tiler);
1950 } else {
1951 midgard_vertex.prefix = *vertex_prefix;
1952 midgard_vertex.postfix = *vertex_postfix;
1953 vp = &midgard_vertex;
1954 vp_size = sizeof(midgard_vertex);
1955
1956 midgard_tiler.prefix = *tiler_prefix;
1957 midgard_tiler.postfix = *tiler_postfix;
1958 midgard_tiler.primitive_size = *primitive_size;
1959 tp = &midgard_tiler;
1960 tp_size = sizeof(midgard_tiler);
1961 }
1962
1963 if (wallpapering) {
1964 /* Inject in reverse order, with "predicted" job indices.
1965 * THIS IS A HACK XXX */
1966 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
1967 batch->scoreboard.job_index + 2, tp, tp_size, true);
1968 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1969 vp, vp_size, true);
1970 return;
1971 }
1972
1973 /* If rasterizer discard is enabled, only submit the vertex job */
1974
1975 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1976 vp, vp_size, false);
1977
1978 if (ctx->rasterizer->base.rasterizer_discard)
1979 return;
1980
1981 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
1982 false);
1983 }
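/* In the common (non-wallpaper) case this yields a two-job chain: the vertex
 * job is submitted first and its index is passed as the dependency of the
 * tiler job, so tiling waits for shading. With rasterizer discard enabled,
 * the chain stops after the vertex job. */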
1984
1985 /* TODO: stop hardcoding this */
1986 mali_ptr
1987 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1988 {
1989 uint16_t locations[] = {
1990 128, 128,
1991 0, 256,
1992 0, 256,
1993 0, 256,
1994 0, 256,
1995 0, 256,
1996 0, 256,
1997 0, 256,
1998 0, 256,
1999 0, 256,
2000 0, 256,
2001 0, 256,
2002 0, 256,
2003 0, 256,
2004 0, 256,
2005 0, 256,
2006 0, 256,
2007 0, 256,
2008 0, 256,
2009 0, 256,
2010 0, 256,
2011 0, 256,
2012 0, 256,
2013 0, 256,
2014 0, 256,
2015 0, 256,
2016 0, 256,
2017 0, 256,
2018 0, 256,
2019 0, 256,
2020 0, 256,
2021 0, 256,
2022 128, 128,
2023 0, 0,
2024 0, 0,
2025 0, 0,
2026 0, 0,
2027 0, 0,
2028 0, 0,
2029 0, 0,
2030 0, 0,
2031 0, 0,
2032 0, 0,
2033 0, 0,
2034 0, 0,
2035 0, 0,
2036 0, 0,
2037 0, 0,
2038 };
2039
2040 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2041 }