panfrost: Add opaque midgard_blend XML
[mesa.git] src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These flags don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the bounds to tighten the vertex count and rebase reads at min_index */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
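/* The padded count decomposes as an odd factor times a power of two,
* padded = (2k + 1) << shift, which is what the shift/odd fields below
* encode. For example, padded_count = 24 = 3 << 3 gives shift = ctz(24) = 3
* and k = 24 >> 4 = 1. */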
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static unsigned
310 translate_tex_wrap(enum pipe_tex_wrap w)
311 {
312 switch (w) {
313 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
314 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
315 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
316 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
317 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
318 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
319 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
320 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
321 default: unreachable("Invalid wrap");
322 }
323 }
324
325 /* The hardware compares in the wrong order, so we have to flip before
326 * encoding. Yes, really. */
327
328 static enum mali_func
329 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
330 {
331 if (!cso->compare_mode)
332 return MALI_FUNC_NEVER;
333
334 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
335 return panfrost_flip_compare_func(f);
336 }
337
338 static enum mali_mipmap_mode
339 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
340 {
341 switch (f) {
342 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
343 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
344 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
345 default: unreachable("Invalid");
346 }
347 }
348
349 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
350 struct mali_midgard_sampler_packed *hw)
351 {
352 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
353 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
354 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
355 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
356 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
357 cfg.normalized_coordinates = cso->normalized_coords;
358
359 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
360
361 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
362
363 /* If necessary, we disable mipmapping in the sampler descriptor by
364 * clamping the LOD as tight as possible (from 0 to epsilon,
365 * essentially -- remember these are fixed point numbers, so
366 * epsilon=1/256) */
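/* For example, if minimum_lod encodes 2.0, maximum_lod = minimum_lod + 1
* encodes roughly 2.0 + 1/256 (one fixed-point ulp), so sampling never
* leaves the level selected by min_lod. */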
367
368 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
369 cfg.minimum_lod + 1 :
370 FIXED_16(cso->max_lod, false);
371
372 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
373 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
374 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
375
376 cfg.compare_function = panfrost_sampler_compare_func(cso);
377 cfg.seamless_cube_map = cso->seamless_cube_map;
378
379 cfg.border_color_r = cso->border_color.f[0];
380 cfg.border_color_g = cso->border_color.f[1];
381 cfg.border_color_b = cso->border_color.f[2];
382 cfg.border_color_a = cso->border_color.f[3];
383 }
384 }
385
386 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
387 struct mali_bifrost_sampler_packed *hw)
388 {
389 pan_pack(hw, BIFROST_SAMPLER, cfg) {
390 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
391 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
392 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
393 cfg.normalized_coordinates = cso->normalized_coords;
394
395 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
396 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
397 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
398
399 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
400 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
401 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
402
403 cfg.compare_function = panfrost_sampler_compare_func(cso);
404 cfg.seamless_cube_map = cso->seamless_cube_map;
405 }
406 }
407
408 static bool
409 panfrost_fs_required(
410 struct panfrost_shader_state *fs,
411 struct panfrost_blend_final *blend,
412 unsigned rt_count)
413 {
414 /* If we generally have side effects */
415 if (fs->fs_sidefx)
416 return true;
417
418 /* If colour is written we need to execute */
419 for (unsigned i = 0; i < rt_count; ++i) {
420 if (!blend[i].no_colour)
421 return true;
422 }
423
424 /* If depth is written and not implied we need to execute.
425 * TODO: Predicate on Z/S writes being enabled */
426 return (fs->writes_depth || fs->writes_stencil);
427 }
428
429 static void
430 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
431 struct panfrost_blend_final *blend)
432 {
433 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
434 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
435 unsigned rt_count = batch->key.nr_cbufs;
436
437 struct bifrost_blend_rt *brts = rts;
438
439 /* Disable blending for depth-only on Bifrost */
440
441 if (rt_count == 0 && dev->quirks & IS_BIFROST)
442 brts[0].unk2 = 0x3;
443
444 for (unsigned i = 0; i < rt_count; ++i) {
445 struct mali_blend_flags_packed flags = {};
446
447 pan_pack(&flags, BLEND_FLAGS, cfg) {
448 if (blend[i].no_colour) {
449 cfg.enable = false;
450 break;
451 }
452
453 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
454
455 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
456 cfg.load_destination = blend[i].load_dest;
457 cfg.dither_disable = !batch->ctx->blend->base.dither;
458
459 if (!(dev->quirks & IS_BIFROST))
460 cfg.midgard_blend_shader = blend[i].is_shader;
461 }
462
463 if (dev->quirks & IS_BIFROST) {
464 brts[i].flags = flags.opaque[0];
465
466 if (blend[i].is_shader) {
467 /* The blend shader's address needs to be at
468 * the same top 32 bits as the fragment shader.
469 * TODO: Ensure that's always the case.
470 */
471 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
472 (fs->bo->gpu & (0xffffffffull << 32)));
473 brts[i].shader = blend[i].shader.gpu;
474 brts[i].unk2 = 0x0;
475 } else {
476 enum pipe_format format = batch->key.cbufs[i]->format;
477 const struct util_format_description *format_desc;
478 format_desc = util_format_description(format);
479
480 brts[i].equation = blend[i].equation.equation;
481
482 /* TODO: this is a bit more complicated */
483 brts[i].constant = blend[i].equation.constant;
484
485 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
486
487 /* 0x19 disables blending and forces REPLACE
488 * mode (equivalent to rgb_mode = alpha_mode =
489 * 0x122, colour mask = 0xF). 0x1a allows
490 * blending. */
491 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
492
493 brts[i].shader_type = fs->blend_types[i];
494 }
495 } else {
496 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
497 cfg.flags = flags;
498
499 if (blend[i].is_shader) {
500 cfg.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
501 } else {
502 cfg.equation = blend[i].equation.equation.opaque[0];
503 cfg.constant = blend[i].equation.constant;
504 }
505 }
506
507 rts += MALI_MIDGARD_BLEND_LENGTH;
508 }
509 }
510 }
511
512 static void
513 panfrost_emit_frag_shader(struct panfrost_context *ctx,
514 struct mali_state_packed *fragmeta,
515 struct panfrost_blend_final *blend)
516 {
517 const struct panfrost_device *dev = pan_device(ctx->base.screen);
518 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
519 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
520 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
521 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
522 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
523
524 /* Built up here */
525 struct mali_shader_packed shader = fs->shader;
526 struct mali_preload_packed preload = fs->preload;
527 uint32_t properties;
528 struct mali_multisample_misc_packed multisample_misc;
529 struct mali_stencil_mask_misc_packed stencil_mask_misc;
530 union midgard_blend sfbd_blend = { 0 };
531
532 if (!panfrost_fs_required(fs, blend, rt_count)) {
533 if (dev->quirks & IS_BIFROST) {
534 pan_pack(&shader, SHADER, cfg) {}
535
536 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
537 cfg.unknown = 0x950020; /* XXX */
538 cfg.early_z_enable = true;
539 }
540
541 preload.opaque[0] = 0;
542 } else {
543 pan_pack(&shader, SHADER, cfg) {
544 cfg.shader = 0x1;
545 }
546
547 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
548 cfg.work_register_count = 1;
549 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
550 cfg.early_z_enable = true;
551 }
552 }
553 } else if (dev->quirks & IS_BIFROST) {
554 bool no_blend = true;
555
556 for (unsigned i = 0; i < rt_count; ++i)
557 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
558
559 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
560 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
561 }
562
563 /* Combine with prepacked properties */
564 properties |= fs->properties.opaque[0];
565 } else {
566 /* Reasons to disable early-Z from a shader perspective */
567 bool late_z = fs->can_discard || fs->writes_global ||
568 fs->writes_depth || fs->writes_stencil;
569
570 /* If either depth or stencil is enabled, discard matters */
571 bool zs_enabled =
572 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
573 zsa->base.stencil[0].enabled;
574
575 bool has_blend_shader = false;
576
577 for (unsigned c = 0; c < rt_count; ++c)
578 has_blend_shader |= blend[c].is_shader;
579
580 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
581 /* TODO: Reduce this limit? */
582 if (has_blend_shader)
583 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
584 else
585 cfg.work_register_count = fs->work_reg_count;
586
587 cfg.early_z_enable = !(late_z || alpha_to_coverage);
588 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
589 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
590 }
591
592 properties |= fs->properties.opaque[0];
593 }
594
595 pan_pack(&multisample_misc, MULTISAMPLE_MISC, cfg) {
596 bool msaa = rast->multisample;
597 cfg.multisample_enable = msaa;
598 cfg.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF;
599
600 /* EXT_shader_framebuffer_fetch requires per-sample */
601 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
602 cfg.evaluate_per_sample = msaa && per_sample;
603
604 if (dev->quirks & MIDGARD_SFBD) {
605 cfg.sfbd_load_destination = blend[0].load_dest;
606 cfg.sfbd_blend_shader = blend[0].is_shader;
607 }
608
609 cfg.depth_function = zsa->base.depth.enabled ?
610 panfrost_translate_compare_func(zsa->base.depth.func) :
611 MALI_FUNC_ALWAYS;
612
613 cfg.depth_write_mask = zsa->base.depth.writemask;
614 cfg.near_discard = rast->depth_clip_near;
615 cfg.far_discard = rast->depth_clip_far;
616 cfg.unknown_2 = true;
617 }
618
619 pan_pack(&stencil_mask_misc, STENCIL_MASK_MISC, cfg) {
620 cfg.stencil_mask_front = zsa->stencil_mask_front;
621 cfg.stencil_mask_back = zsa->stencil_mask_back;
622 cfg.stencil_enable = zsa->base.stencil[0].enabled;
623 cfg.alpha_to_coverage = alpha_to_coverage;
624
625 if (dev->quirks & MIDGARD_SFBD) {
626 cfg.sfbd_write_enable = !blend[0].no_colour;
627 cfg.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
628 cfg.sfbd_dither_disable = !ctx->blend->base.dither;
629 }
630
631 cfg.unknown_1 = 0x7;
632 cfg.depth_range_1 = cfg.depth_range_2 = rast->offset_tri;
633 cfg.single_sampled_lines = !rast->multisample;
634 }
635
636 if (dev->quirks & MIDGARD_SFBD) {
637 if (blend[0].is_shader) {
638 sfbd_blend.shader = blend[0].shader.gpu |
639 blend[0].shader.first_tag;
640 } else {
641 sfbd_blend.equation = blend[0].equation.equation;
642 sfbd_blend.constant = blend[0].equation.constant;
643 }
644 } else if (!(dev->quirks & IS_BIFROST)) {
645 /* Bug where MRT-capable hw apparently reads the last blend
646 * shader from here instead of the usual location? */
647
648 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
649 if (!blend[rt].is_shader)
650 continue;
651
652 sfbd_blend.shader = blend[rt].shader.gpu |
653 blend[rt].shader.first_tag;
654 break;
655 }
656 }
657
658 pan_pack(fragmeta, STATE_OPAQUE, cfg) {
659 cfg.shader = fs->shader;
660 cfg.properties = properties;
661 cfg.depth_units = rast->offset_units * 2.0f;
662 cfg.depth_factor = rast->offset_scale;
663 cfg.multisample_misc = multisample_misc;
664 cfg.stencil_mask_misc = stencil_mask_misc;
665
666 cfg.stencil_front = zsa->stencil_front;
667 cfg.stencil_back = zsa->stencil_back;
668
669 /* Bottom bits for stencil ref, exactly one word */
670 bool back_enab = zsa->base.stencil[1].enabled;
671 cfg.stencil_front.opaque[0] |= ctx->stencil_ref.ref_value[0];
672 cfg.stencil_back.opaque[0] |= ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
673
674 if (dev->quirks & IS_BIFROST)
675 cfg.preload = preload;
676 else
677 memcpy(&cfg.sfbd_blend, &sfbd_blend, sizeof(sfbd_blend));
678 }
679 }
680
681 mali_ptr
682 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
683 {
684 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
685
686 panfrost_batch_add_bo(batch, ss->bo,
687 PAN_BO_ACCESS_PRIVATE |
688 PAN_BO_ACCESS_READ |
689 PAN_BO_ACCESS_VERTEX_TILER);
690
691 panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
692 PAN_BO_ACCESS_PRIVATE |
693 PAN_BO_ACCESS_READ |
694 PAN_BO_ACCESS_VERTEX_TILER);
695
696 return pan_resource(ss->upload.rsrc)->bo->gpu + ss->upload.offset;
697 }
698
699 mali_ptr
700 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
701 {
702 struct panfrost_context *ctx = batch->ctx;
703 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
704
705 /* Add the shader BO to the batch. */
706 panfrost_batch_add_bo(batch, ss->bo,
707 PAN_BO_ACCESS_PRIVATE |
708 PAN_BO_ACCESS_READ |
709 PAN_BO_ACCESS_FRAGMENT);
710
711 struct panfrost_device *dev = pan_device(ctx->base.screen);
712 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
713 void *rts = NULL;
714 struct panfrost_transfer xfer;
715 unsigned rt_size;
716
717 if (dev->quirks & MIDGARD_SFBD)
718 rt_size = 0;
719 else if (dev->quirks & IS_BIFROST)
720 rt_size = sizeof(struct bifrost_blend_rt);
721 else
722 rt_size = sizeof(struct midgard_blend_rt);
723
724 unsigned desc_size = MALI_STATE_LENGTH + rt_size * rt_count;
725
726 if (rt_size)
727 rts = rzalloc_size(ctx, rt_size * rt_count);
728
729 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
730
731 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
732 blend[c] = panfrost_get_blend_for_context(ctx, c);
733
734 if (!(dev->quirks & MIDGARD_SFBD))
735 panfrost_emit_blend(batch, rts, blend);
736 else
737 batch->draws |= PIPE_CLEAR_COLOR0;
738
739 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, MALI_STATE_LENGTH);
740
741 panfrost_emit_frag_shader(ctx, (struct mali_state_packed *) xfer.cpu, blend);
742
743 memcpy(xfer.cpu + MALI_STATE_LENGTH, rts, rt_size * rt_count);
744
745 if (rt_size)
746 ralloc_free(rts);
747
748 return xfer.gpu;
749 }
750
751 void
752 panfrost_emit_viewport(struct panfrost_batch *batch,
753 struct mali_vertex_tiler_postfix *tiler_postfix)
754 {
755 struct panfrost_context *ctx = batch->ctx;
756 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
757 const struct pipe_scissor_state *ss = &ctx->scissor;
758 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
759 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
760
761 /* Derive min/max from translate/scale. Note since |x| >= 0 by
762 * definition, we have that -|x| <= |x| hence translate - |scale| <=
763 * translate + |scale|, so the ordering is correct here. */
764 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
765 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
766 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
767 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
768 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
769 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
770
771 /* Scissor to the intersection of the viewport and the scissor, clamped
772 * to the framebuffer */
773
774 unsigned minx = MIN2(fb->width, vp_minx);
775 unsigned maxx = MIN2(fb->width, vp_maxx);
776 unsigned miny = MIN2(fb->height, vp_miny);
777 unsigned maxy = MIN2(fb->height, vp_maxy);
778
779 if (ss && rast->scissor) {
780 minx = MAX2(ss->minx, minx);
781 miny = MAX2(ss->miny, miny);
782 maxx = MIN2(ss->maxx, maxx);
783 maxy = MIN2(ss->maxy, maxy);
784 }
785
786 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
787
788 pan_pack(T.cpu, VIEWPORT, cfg) {
789 cfg.scissor_minimum_x = minx;
790 cfg.scissor_minimum_y = miny;
791 cfg.scissor_maximum_x = maxx - 1;
792 cfg.scissor_maximum_y = maxy - 1;
793
794 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
795 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
796 }
797
798 tiler_postfix->viewport = T.gpu;
799 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
800 }
801
802 static mali_ptr
803 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
804 enum pipe_shader_type st,
805 struct panfrost_constant_buffer *buf,
806 unsigned index)
807 {
808 struct pipe_constant_buffer *cb = &buf->cb[index];
809 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
810
811 if (rsrc) {
812 panfrost_batch_add_bo(batch, rsrc->bo,
813 PAN_BO_ACCESS_SHARED |
814 PAN_BO_ACCESS_READ |
815 panfrost_bo_access_for_stage(st));
816
817 /* Alignment guaranteed by
818 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
819 return rsrc->bo->gpu + cb->buffer_offset;
820 } else if (cb->user_buffer) {
821 return panfrost_pool_upload_aligned(&batch->pool,
822 cb->user_buffer +
823 cb->buffer_offset,
824 cb->buffer_size, 16);
825 } else {
826 unreachable("No constant buffer");
827 }
828 }
829
830 struct sysval_uniform {
831 union {
832 float f[4];
833 int32_t i[4];
834 uint32_t u[4];
835 uint64_t du[2];
836 };
837 };
838
839 static void
840 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
841 struct sysval_uniform *uniform)
842 {
843 struct panfrost_context *ctx = batch->ctx;
844 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
845
846 uniform->f[0] = vp->scale[0];
847 uniform->f[1] = vp->scale[1];
848 uniform->f[2] = vp->scale[2];
849 }
850
851 static void
852 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
853 struct sysval_uniform *uniform)
854 {
855 struct panfrost_context *ctx = batch->ctx;
856 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
857
858 uniform->f[0] = vp->translate[0];
859 uniform->f[1] = vp->translate[1];
860 uniform->f[2] = vp->translate[2];
861 }
862
863 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
864 enum pipe_shader_type st,
865 unsigned int sysvalid,
866 struct sysval_uniform *uniform)
867 {
868 struct panfrost_context *ctx = batch->ctx;
869 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
870 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
871 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
872 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
873
874 assert(dim);
875 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
876
877 if (dim > 1)
878 uniform->i[1] = u_minify(tex->texture->height0,
879 tex->u.tex.first_level);
880
881 if (dim > 2)
882 uniform->i[2] = u_minify(tex->texture->depth0,
883 tex->u.tex.first_level);
884
885 if (is_array)
886 uniform->i[dim] = tex->texture->array_size;
887 }
888
889 static void
890 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
891 enum pipe_shader_type st,
892 unsigned ssbo_id,
893 struct sysval_uniform *uniform)
894 {
895 struct panfrost_context *ctx = batch->ctx;
896
897 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
898 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
899
900 /* Compute address */
901 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
902
903 panfrost_batch_add_bo(batch, bo,
904 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
905 panfrost_bo_access_for_stage(st));
906
907 /* Upload address and size as sysval */
908 uniform->du[0] = bo->gpu + sb.buffer_offset;
909 uniform->u[2] = sb.buffer_size;
910 }
911
912 static void
913 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
914 enum pipe_shader_type st,
915 unsigned samp_idx,
916 struct sysval_uniform *uniform)
917 {
918 struct panfrost_context *ctx = batch->ctx;
919 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
920
921 uniform->f[0] = sampl->min_lod;
922 uniform->f[1] = sampl->max_lod;
923 uniform->f[2] = sampl->lod_bias;
924
925 /* Even without any errata, Midgard represents "no mipmapping" as
926 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
927 * panfrost_create_sampler_state which also explains our choice of
928 * epsilon value (again to keep behaviour consistent) */
929
930 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
931 uniform->f[1] = uniform->f[0] + (1.0/256.0);
932 }
933
934 static void
935 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
936 struct sysval_uniform *uniform)
937 {
938 struct panfrost_context *ctx = batch->ctx;
939
940 uniform->u[0] = ctx->compute_grid->grid[0];
941 uniform->u[1] = ctx->compute_grid->grid[1];
942 uniform->u[2] = ctx->compute_grid->grid[2];
943 }
944
945 static void
946 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
947 struct panfrost_shader_state *ss,
948 enum pipe_shader_type st)
949 {
950 struct sysval_uniform *uniforms = (void *)buf;
951
952 for (unsigned i = 0; i < ss->sysval_count; ++i) {
953 int sysval = ss->sysval[i];
954
955 switch (PAN_SYSVAL_TYPE(sysval)) {
956 case PAN_SYSVAL_VIEWPORT_SCALE:
957 panfrost_upload_viewport_scale_sysval(batch,
958 &uniforms[i]);
959 break;
960 case PAN_SYSVAL_VIEWPORT_OFFSET:
961 panfrost_upload_viewport_offset_sysval(batch,
962 &uniforms[i]);
963 break;
964 case PAN_SYSVAL_TEXTURE_SIZE:
965 panfrost_upload_txs_sysval(batch, st,
966 PAN_SYSVAL_ID(sysval),
967 &uniforms[i]);
968 break;
969 case PAN_SYSVAL_SSBO:
970 panfrost_upload_ssbo_sysval(batch, st,
971 PAN_SYSVAL_ID(sysval),
972 &uniforms[i]);
973 break;
974 case PAN_SYSVAL_NUM_WORK_GROUPS:
975 panfrost_upload_num_work_groups_sysval(batch,
976 &uniforms[i]);
977 break;
978 case PAN_SYSVAL_SAMPLER:
979 panfrost_upload_sampler_sysval(batch, st,
980 PAN_SYSVAL_ID(sysval),
981 &uniforms[i]);
982 break;
983 default:
984 assert(0);
985 }
986 }
987 }
988
989 static const void *
990 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
991 unsigned index)
992 {
993 struct pipe_constant_buffer *cb = &buf->cb[index];
994 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
995
996 if (rsrc)
997 return rsrc->bo->cpu;
998 else if (cb->user_buffer)
999 return cb->user_buffer;
1000 else
1001 unreachable("No constant buffer");
1002 }
1003
1004 void
1005 panfrost_emit_const_buf(struct panfrost_batch *batch,
1006 enum pipe_shader_type stage,
1007 struct mali_vertex_tiler_postfix *postfix)
1008 {
1009 struct panfrost_context *ctx = batch->ctx;
1010 struct panfrost_shader_variants *all = ctx->shader[stage];
1011
1012 if (!all)
1013 return;
1014
1015 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1016
1017 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1018
1019 /* Uniforms are implicitly UBO #0 */
1020 bool has_uniforms = buf->enabled_mask & (1 << 0);
1021
1022 /* Allocate room for the sysval and the uniforms */
1023 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1024 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1025 size_t size = sys_size + uniform_size;
1026 struct panfrost_transfer transfer =
1027 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1028
1029 /* Upload sysvals requested by the shader */
1030 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1031
1032 /* Upload uniforms */
1033 if (has_uniforms && uniform_size) {
1034 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1035 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1036 }
1037
1038 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1039 * uploaded, so it's always included. The count is the highest UBO
1040 * addressable -- gaps are included. */
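/* e.g. enabled_mask = 0b1001 gives ubo_count = 32 - clz(0b1001 | 1) = 4,
* covering UBOs 0-3 even though UBOs 1 and 2 are disabled gaps. */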
1041
1042 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
1043
1044 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1045 struct panfrost_transfer ubos =
1046 panfrost_pool_alloc_aligned(&batch->pool, sz,
1047 MALI_UNIFORM_BUFFER_LENGTH);
1048
1049 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1050
1051 /* Upload uniforms as a UBO */
1052
1053 if (size) {
1054 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1055 cfg.entries = DIV_ROUND_UP(size, 16);
1056 cfg.pointer = transfer.gpu;
1057 }
1058 } else {
1059 *ubo_ptr = 0;
1060 }
1061
1062 /* The rest are honest-to-goodness UBOs */
1063
1064 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1065 size_t usz = buf->cb[ubo].buffer_size;
1066 bool enabled = buf->enabled_mask & (1 << ubo);
1067 bool empty = usz == 0;
1068
1069 if (!enabled || empty) {
1070 ubo_ptr[ubo] = 0;
1071 continue;
1072 }
1073
1074 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1075 cfg.entries = DIV_ROUND_UP(usz, 16);
1076 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1077 stage, buf, ubo);
1078 }
1079 }
1080
1081 postfix->uniforms = transfer.gpu;
1082 postfix->uniform_buffers = ubos.gpu;
1083
1084 buf->dirty_mask = 0;
1085 }
1086
1087 void
1088 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1089 const struct pipe_grid_info *info,
1090 struct midgard_payload_vertex_tiler *vtp)
1091 {
1092 struct panfrost_context *ctx = batch->ctx;
1093 struct panfrost_device *dev = pan_device(ctx->base.screen);
1094 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1095 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1096 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1097 128));
1098
1099 unsigned log2_instances =
1100 util_logbase2_ceil(info->grid[0]) +
1101 util_logbase2_ceil(info->grid[1]) +
1102 util_logbase2_ceil(info->grid[2]);
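/* log2_instances rounds the workgroup count up to a power of two, e.g. a
* (3, 4, 5) grid gives 2 + 2 + 3 = 7, i.e. room for 128 concurrent
* workgroups per core even though only 60 are launched. */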
1103
1104 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1105 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1106 shared_size,
1107 1);
1108
1109 struct mali_shared_memory shared = {
1110 .shared_memory = bo->gpu,
1111 .shared_workgroup_count = log2_instances,
1112 .shared_shift = util_logbase2(single_size) + 1
1113 };
1114
1115 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1116 sizeof(shared), 64);
1117 }
1118
1119 static mali_ptr
1120 panfrost_get_tex_desc(struct panfrost_batch *batch,
1121 enum pipe_shader_type st,
1122 struct panfrost_sampler_view *view)
1123 {
1124 if (!view)
1125 return (mali_ptr) 0;
1126
1127 struct pipe_sampler_view *pview = &view->base;
1128 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1129
1130 /* Add the BO to the job so it's retained until the job is done. */
1131
1132 panfrost_batch_add_bo(batch, rsrc->bo,
1133 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1134 panfrost_bo_access_for_stage(st));
1135
1136 panfrost_batch_add_bo(batch, view->bo,
1137 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1138 panfrost_bo_access_for_stage(st));
1139
1140 return view->bo->gpu;
1141 }
1142
1143 static void
1144 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1145 struct pipe_context *pctx)
1146 {
1147 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1148 if (view->texture_bo != rsrc->bo->gpu ||
1149 view->modifier != rsrc->modifier) {
1150 panfrost_bo_unreference(view->bo);
1151 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1152 }
1153 }
1154
1155 void
1156 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1157 enum pipe_shader_type stage,
1158 struct mali_vertex_tiler_postfix *postfix)
1159 {
1160 struct panfrost_context *ctx = batch->ctx;
1161 struct panfrost_device *device = pan_device(ctx->base.screen);
1162
1163 if (!ctx->sampler_view_count[stage])
1164 return;
1165
1166 if (device->quirks & IS_BIFROST) {
1167 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1168 MALI_BIFROST_TEXTURE_LENGTH *
1169 ctx->sampler_view_count[stage],
1170 MALI_BIFROST_TEXTURE_LENGTH);
1171
1172 struct mali_bifrost_texture_packed *out =
1173 (struct mali_bifrost_texture_packed *) T.cpu;
1174
1175 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1176 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1177 struct pipe_sampler_view *pview = &view->base;
1178 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1179
1180 panfrost_update_sampler_view(view, &ctx->base);
1181 out[i] = view->bifrost_descriptor;
1182
1183 /* Add the BOs to the job so they are retained until the job is done. */
1184
1185 panfrost_batch_add_bo(batch, rsrc->bo,
1186 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1187 panfrost_bo_access_for_stage(stage));
1188
1189 panfrost_batch_add_bo(batch, view->bo,
1190 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1191 panfrost_bo_access_for_stage(stage));
1192 }
1193
1194 postfix->textures = T.gpu;
1195 } else {
1196 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1197
1198 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1199 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1200
1201 panfrost_update_sampler_view(view, &ctx->base);
1202
1203 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1204 }
1205
1206 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1207 trampolines,
1208 sizeof(uint64_t) *
1209 ctx->sampler_view_count[stage],
1210 sizeof(uint64_t));
1211 }
1212 }
1213
1214 void
1215 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1216 enum pipe_shader_type stage,
1217 struct mali_vertex_tiler_postfix *postfix)
1218 {
1219 struct panfrost_context *ctx = batch->ctx;
1220
1221 if (!ctx->sampler_count[stage])
1222 return;
1223
1224 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1225 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1226
1227 size_t sz = desc_size * ctx->sampler_count[stage];
1228 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1229 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1230
1231 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1232 out[i] = ctx->samplers[stage][i]->hw;
1233
1234 postfix->sampler_descriptor = T.gpu;
1235 }
1236
1237 void
1238 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1239 struct mali_vertex_tiler_postfix *vertex_postfix)
1240 {
1241 struct panfrost_context *ctx = batch->ctx;
1242 struct panfrost_vertex_state *so = ctx->vertex;
1243 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1244
1245 unsigned instance_shift = vertex_postfix->instance_shift;
1246 unsigned instance_odd = vertex_postfix->instance_odd;
1247
1248 /* Worst case: everything is NPOT, which is only possible if instancing
1249 * is enabled. Otherwise a single record is guaranteed */
1250 bool could_npot = instance_shift || instance_odd;
1251
1252 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1253 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1254 (could_npot ? 2 : 1),
1255 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1256
1257 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1258 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1259 MALI_ATTRIBUTE_LENGTH);
1260
1261 struct mali_attribute_buffer_packed *bufs =
1262 (struct mali_attribute_buffer_packed *) S.cpu;
1263
1264 struct mali_attribute_packed *out =
1265 (struct mali_attribute_packed *) T.cpu;
1266
1267 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1268 unsigned k = 0;
1269
1270 for (unsigned i = 0; i < so->num_elements; ++i) {
1271 /* We map buffers 1:1 with the attributes, which
1272 * means duplicating some vertex buffers (who cares? aside from
1273 * maybe some caching implications but I somehow doubt that
1274 * matters) */
1275
1276 struct pipe_vertex_element *elem = &so->pipe[i];
1277 unsigned vbi = elem->vertex_buffer_index;
1278 attrib_to_buffer[i] = k;
1279
1280 if (!(ctx->vb_mask & (1 << vbi)))
1281 continue;
1282
1283 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1284 struct panfrost_resource *rsrc;
1285
1286 rsrc = pan_resource(buf->buffer.resource);
1287 if (!rsrc)
1288 continue;
1289
1290 /* Add a dependency of the batch on the vertex buffer */
1291 panfrost_batch_add_bo(batch, rsrc->bo,
1292 PAN_BO_ACCESS_SHARED |
1293 PAN_BO_ACCESS_READ |
1294 PAN_BO_ACCESS_VERTEX_TILER);
1295
1296 /* Mask off lower bits, see offset fixup below */
1297 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1298 mali_ptr addr = raw_addr & ~63;
1299
1300 /* Since we advanced the base pointer, we shrink the buffer
1301 * size, but add the offset we subtracted */
1302 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1303 - buf->buffer_offset;
1304
1305 /* When there is a divisor, the hardware-level divisor is
1306 * the product of the instance divisor and the padded count */
1307 unsigned divisor = elem->instance_divisor;
1308 unsigned hw_divisor = ctx->padded_count * divisor;
1309 unsigned stride = buf->stride;
1310
1311 /* If there's a divisor(=1) but no instancing, we want every
1312 * attribute to be the same */
1313
1314 if (divisor && ctx->instance_count == 1)
1315 stride = 0;
1316
1317 if (!divisor || ctx->instance_count <= 1) {
1318 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1319 if (ctx->instance_count > 1)
1320 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1321
1322 cfg.pointer = addr;
1323 cfg.stride = stride;
1324 cfg.size = size;
1325 cfg.divisor_r = instance_shift;
1326 cfg.divisor_p = instance_odd;
1327 }
1328 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1329 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1330 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1331 cfg.pointer = addr;
1332 cfg.stride = stride;
1333 cfg.size = size;
1334 cfg.divisor_r = __builtin_ctz(hw_divisor);
1335 }
1336
1337 } else {
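/* Non-power-of-two divisors use a fixed-point reciprocal (multiply-high
* plus shift), presumably the usual division-by-constant trick;
* panfrost_compute_magic_divisor supplies the magic numerator, shift and
* extra flag consumed below. */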
1338 unsigned shift = 0, extra_flags = 0;
1339
1340 unsigned magic_divisor =
1341 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1342
1343 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1344 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1345 cfg.pointer = addr;
1346 cfg.stride = stride;
1347 cfg.size = size;
1348
1349 cfg.divisor_r = shift;
1350 cfg.divisor_e = extra_flags;
1351 }
1352
1353 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1354 cfg.divisor_numerator = magic_divisor;
1355 cfg.divisor = divisor;
1356 }
1357
1358 ++k;
1359 }
1360
1361 ++k;
1362 }
1363
1364 /* Add special gl_VertexID/gl_InstanceID buffers */
1365
1366 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1367 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1368
1369 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1370 cfg.buffer_index = k++;
1371 cfg.format = so->formats[PAN_VERTEX_ID];
1372 }
1373
1374 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1375
1376 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1377 cfg.buffer_index = k++;
1378 cfg.format = so->formats[PAN_INSTANCE_ID];
1379 }
1380 }
1381
1382 /* Attribute addresses require 64-byte alignment, so let:
1383 *
1384 * base' = base & ~63 = base - (base & 63)
1385 * offset' = offset + (base & 63)
1386 *
1387 * Since base' + offset' = base + offset, these are equivalent
1388 * addressing modes and now base' is 64-byte aligned.
1389 */
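/* e.g. base = 0x1234 gives base' = 0x1200 and offset' = offset + 0x34. */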
1390
1391 unsigned start = vertex_postfix->offset_start;
1392
1393 for (unsigned i = 0; i < so->num_elements; ++i) {
1394 unsigned vbi = so->pipe[i].vertex_buffer_index;
1395 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1396
1397 /* Adjust by the masked off bits of the offset. Make sure we
1398 * read src_offset from the CPU-side vertex element state (so->pipe,
1399 * not GPU visible) rather than the GPU-visible descriptor, due to caching effects */
1400
1401 unsigned src_offset = so->pipe[i].src_offset;
1402
1403 /* BOs aligned to 4k so guaranteed aligned to 64 */
1404 src_offset += (buf->buffer_offset & 63);
1405
1406 /* Also, somewhat obscurely, per-instance data needs to be
1407 * offset in response to a delayed start in an indexed draw */
1408
1409 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1410 src_offset -= buf->stride * start;
1411
1412 pan_pack(out + i, ATTRIBUTE, cfg) {
1413 cfg.buffer_index = attrib_to_buffer[i];
1414 cfg.format = so->formats[i];
1415 cfg.offset = src_offset;
1416 }
1417 }
1418
1419 vertex_postfix->attributes = S.gpu;
1420 vertex_postfix->attribute_meta = T.gpu;
1421 }
1422
1423 static mali_ptr
1424 panfrost_emit_varyings(struct panfrost_batch *batch,
1425 struct mali_attribute_buffer_packed *slot,
1426 unsigned stride, unsigned count)
1427 {
1428 unsigned size = stride * count;
1429 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1430
1431 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1432 cfg.stride = stride;
1433 cfg.size = size;
1434 cfg.pointer = ptr;
1435 }
1436
1437 return ptr;
1438 }
1439
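/* Remainder of a streamout target's address below 64-byte alignment; the
* buffer record pointer gets rounded down to 64 bytes and this remainder is
* added back into each captured varying's record offset. */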
1440 static unsigned
1441 panfrost_streamout_offset(unsigned stride, unsigned offset,
1442 struct pipe_stream_output_target *target)
1443 {
1444 return (target->buffer_offset + (offset * stride * 4)) & 63;
1445 }
1446
1447 static void
1448 panfrost_emit_streamout(struct panfrost_batch *batch,
1449 struct mali_attribute_buffer_packed *slot,
1450 unsigned stride_words, unsigned offset, unsigned count,
1451 struct pipe_stream_output_target *target)
1452 {
1453 unsigned stride = stride_words * 4;
1454 unsigned max_size = target->buffer_size;
1455 unsigned expected_size = stride * count;
1456
1457 /* Grab the BO and bind it to the batch */
1458 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1459
1460 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1461 * the perspective of the TILER and FRAGMENT.
1462 */
1463 panfrost_batch_add_bo(batch, bo,
1464 PAN_BO_ACCESS_SHARED |
1465 PAN_BO_ACCESS_RW |
1466 PAN_BO_ACCESS_VERTEX_TILER |
1467 PAN_BO_ACCESS_FRAGMENT);
1468
1469 /* We will have an offset applied to get alignment */
1470 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1471
1472 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1473 cfg.pointer = (addr & ~63);
1474 cfg.stride = stride;
1475 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1476 }
1477 }
1478
1479 static bool
1480 has_point_coord(unsigned mask, gl_varying_slot loc)
1481 {
1482 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1483 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1484 else if (loc == VARYING_SLOT_PNTC)
1485 return (mask & (1 << 8));
1486 else
1487 return false;
1488 }
1489
1490 /* Helpers for manipulating stream out information so we can pack varyings
1491 * accordingly. Compute the src_offset for a given captured varying */
1492
1493 static struct pipe_stream_output *
1494 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1495 {
1496 for (unsigned i = 0; i < info->num_outputs; ++i) {
1497 if (info->output[i].register_index == loc)
1498 return &info->output[i];
1499 }
1500
1501 unreachable("Varying not captured");
1502 }
1503
1504 static unsigned
1505 pan_varying_size(enum mali_format fmt)
1506 {
1507 unsigned type = MALI_EXTRACT_TYPE(fmt);
1508 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1509 unsigned bits = MALI_EXTRACT_BITS(fmt);
1510 unsigned bpc = 0;
1511
1512 if (bits == MALI_CHANNEL_FLOAT) {
1513 /* No doubles */
1514 bool fp16 = (type == MALI_FORMAT_SINT);
1515 assert(fp16 || (type == MALI_FORMAT_UNORM));
1516
1517 bpc = fp16 ? 2 : 4;
1518 } else {
1519 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1520
1521 /* See the enums */
1522 bits = 1 << bits;
1523 assert(bits >= 8);
1524 bpc = bits / 8;
1525 }
1526
1527 return bpc * chan;
1528 }
1529
1530 /* Indices for named (non-XFB) varyings that are present. These are packed
1531 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1532 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1533 * of a given special field given a shift S by:
1534 *
1535 * idx = popcount(P & ((1 << S) - 1))
1536 *
1537 * That is... look at all of the varyings that come earlier and count them;
1538 * that count is the index of this one. Likewise, the total number of special
1539 * buffers required is simply popcount(P)
1540 */
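/* e.g. with P = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_PSIZ) = 0b101, the
* buffer index for PSIZ (S = 2) is popcount(0b101 & 0b011) = 1, right after
* the general buffer at index 0. */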
1541
1542 enum pan_special_varying {
1543 PAN_VARY_GENERAL = 0,
1544 PAN_VARY_POSITION = 1,
1545 PAN_VARY_PSIZ = 2,
1546 PAN_VARY_PNTCOORD = 3,
1547 PAN_VARY_FACE = 4,
1548 PAN_VARY_FRAGCOORD = 5,
1549
1550 /* Keep last */
1551 PAN_VARY_MAX,
1552 };
1553
1554 /* Given a varying, figure out which index it corresponds to */
1555
1556 static inline unsigned
1557 pan_varying_index(unsigned present, enum pan_special_varying v)
1558 {
1559 unsigned mask = (1 << v) - 1;
1560 return util_bitcount(present & mask);
1561 }
1562
1563 /* Get the base offset for XFB buffers, which by convention come after
1564 * everything else. Wrapper function for semantic reasons; by construction this
1565 * is just popcount. */
1566
1567 static inline unsigned
1568 pan_xfb_base(unsigned present)
1569 {
1570 return util_bitcount(present);
1571 }
1572
1573 /* Computes the present mask for varyings so we can start emitting varying records */
1574
1575 static inline unsigned
1576 pan_varying_present(
1577 struct panfrost_shader_state *vs,
1578 struct panfrost_shader_state *fs,
1579 unsigned quirks)
1580 {
1581 /* At the moment we always emit general and position buffers. Not
1582 * strictly necessary but usually harmless */
1583
1584 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1585
1586 /* Enable special buffers by the shader info */
1587
1588 if (vs->writes_point_size)
1589 present |= (1 << PAN_VARY_PSIZ);
1590
1591 if (fs->reads_point_coord)
1592 present |= (1 << PAN_VARY_PNTCOORD);
1593
1594 if (fs->reads_face)
1595 present |= (1 << PAN_VARY_FACE);
1596
1597 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1598 present |= (1 << PAN_VARY_FRAGCOORD);
1599
1600 /* Also, if we have a point sprite, we need a point coord buffer */
1601
1602 for (unsigned i = 0; i < fs->varying_count; i++) {
1603 gl_varying_slot loc = fs->varyings_loc[i];
1604
1605 if (has_point_coord(fs->point_sprite_mask, loc))
1606 present |= (1 << PAN_VARY_PNTCOORD);
1607 }
1608
1609 return present;
1610 }
1611
1612 /* Emitters for varying records */
1613
1614 static void
1615 pan_emit_vary(struct mali_attribute_packed *out,
1616 unsigned present, enum pan_special_varying buf,
1617 unsigned quirks, enum mali_format format,
1618 unsigned offset)
1619 {
1620 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1621 unsigned swizzle = quirks & HAS_SWIZZLES ?
1622 panfrost_get_default_swizzle(nr_channels) :
1623 panfrost_bifrost_swizzle(nr_channels);
1624
1625 pan_pack(out, ATTRIBUTE, cfg) {
1626 cfg.buffer_index = pan_varying_index(present, buf);
1627 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1628 cfg.format = (format << 12) | swizzle;
1629 cfg.offset = offset;
1630 }
1631 }
1632
1633 /* General varying that is unused */
1634
1635 static void
1636 pan_emit_vary_only(struct mali_attribute_packed *out,
1637 unsigned present, unsigned quirks)
1638 {
1639 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1640 }
1641
1642 /* Special records */
1643
1644 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1645 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1646 [PAN_VARY_PSIZ] = MALI_R16F,
1647 [PAN_VARY_PNTCOORD] = MALI_R16F,
1648 [PAN_VARY_FACE] = MALI_R32I,
1649 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1650 };
1651
1652 static void
1653 pan_emit_vary_special(struct mali_attribute_packed *out,
1654 unsigned present, enum pan_special_varying buf,
1655 unsigned quirks)
1656 {
1657 assert(buf < PAN_VARY_MAX);
1658 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1659 }
1660
1661 static enum mali_format
1662 pan_xfb_format(enum mali_format format, unsigned nr)
1663 {
1664 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1665 return MALI_R32F | MALI_NR_CHANNELS(nr);
1666 else
1667 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1668 }
1669
1670 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1671 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1672 * value. */
1673
1674 static void
1675 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1676 unsigned present,
1677 unsigned max_xfb,
1678 unsigned *streamout_offsets,
1679 unsigned quirks,
1680 enum mali_format format,
1681 struct pipe_stream_output o)
1682 {
1683 unsigned swizzle = quirks & HAS_SWIZZLES ?
1684 panfrost_get_default_swizzle(o.num_components) :
1685 panfrost_bifrost_swizzle(o.num_components);
1686
1687 pan_pack(out, ATTRIBUTE, cfg) {
1688 /* XFB buffers come after everything else */
1689 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1690 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1691
1692 /* Override number of channels and precision to highp */
1693 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1694
1695 /* Apply given offsets together */
1696 cfg.offset = (o.dst_offset * 4) /* dwords */
1697 + streamout_offsets[o.output_buffer];
1698 }
1699 }
1700
1701 /* Determine if we should capture a varying for XFB. This requires actually
1702 * having a buffer for it. If we don't capture it, we'll fall back to a general
1703 * varying path (linked or unlinked, possibly discarding the write) */
1704
1705 static bool
1706 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1707 unsigned loc, unsigned max_xfb)
1708 {
1709 if (!(xfb->so_mask & (1ll << loc)))
1710 return false;
1711
1712 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1713 return o->output_buffer < max_xfb;
1714 }
1715
1716 static void
1717 pan_emit_general_varying(struct mali_attribute_packed *out,
1718 struct panfrost_shader_state *other,
1719 struct panfrost_shader_state *xfb,
1720 gl_varying_slot loc,
1721 enum mali_format format,
1722 unsigned present,
1723 unsigned quirks,
1724 unsigned *gen_offsets,
1725 enum mali_format *gen_formats,
1726 unsigned *gen_stride,
1727 unsigned idx,
1728 bool should_alloc)
1729 {
1730 /* Check if we're linked */
1731 signed other_idx = -1;
1732
1733 for (unsigned j = 0; j < other->varying_count; ++j) {
1734 if (other->varyings_loc[j] == loc) {
1735 other_idx = j;
1736 break;
1737 }
1738 }
1739
1740 if (other_idx < 0) {
1741 pan_emit_vary_only(out, present, quirks);
1742 return;
1743 }
1744
1745 unsigned offset = gen_offsets[other_idx];
1746
1747 if (should_alloc) {
1748                 /* We're linked, so allocate space via a watermark allocation */
1749 enum mali_format alt = other->varyings[other_idx];
1750
1751 /* Do interpolation at minimum precision */
1752 unsigned size_main = pan_varying_size(format);
1753 unsigned size_alt = pan_varying_size(alt);
1754 unsigned size = MIN2(size_main, size_alt);
1755
1756 /* If a varying is marked for XFB but not actually captured, we
1757 * should match the format to the format that would otherwise
1758 * be used for XFB, since dEQP checks for invariance here. It's
1759 * unclear if this is required by the spec. */
1760
1761 if (xfb->so_mask & (1ull << loc)) {
1762 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1763 format = pan_xfb_format(format, o->num_components);
1764 size = pan_varying_size(format);
1765 } else if (size == size_alt) {
1766 format = alt;
1767 }
1768
1769 gen_offsets[idx] = *gen_stride;
1770 gen_formats[other_idx] = format;
1771 offset = *gen_stride;
1772 *gen_stride += size;
1773 }
1774
1775 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1776 }
1777
1778 /* Higher-level wrapper around all of the above, classifying a varying into one
1779 * of the above types */
1780
1781 static void
1782 panfrost_emit_varying(
1783 struct mali_attribute_packed *out,
1784 struct panfrost_shader_state *stage,
1785 struct panfrost_shader_state *other,
1786 struct panfrost_shader_state *xfb,
1787 unsigned present,
1788 unsigned max_xfb,
1789 unsigned *streamout_offsets,
1790 unsigned quirks,
1791 unsigned *gen_offsets,
1792 enum mali_format *gen_formats,
1793 unsigned *gen_stride,
1794 unsigned idx,
1795 bool should_alloc,
1796 bool is_fragment)
1797 {
1798 gl_varying_slot loc = stage->varyings_loc[idx];
1799 enum mali_format format = stage->varyings[idx];
1800
1801 /* Override format to match linkage */
1802 if (!should_alloc && gen_formats[idx])
1803 format = gen_formats[idx];
1804
1805 if (has_point_coord(stage->point_sprite_mask, loc)) {
1806 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1807 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1808 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1809 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1810 } else if (loc == VARYING_SLOT_POS) {
1811 if (is_fragment)
1812 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1813 else
1814 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1815 } else if (loc == VARYING_SLOT_PSIZ) {
1816 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1817 } else if (loc == VARYING_SLOT_PNTC) {
1818 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1819 } else if (loc == VARYING_SLOT_FACE) {
1820 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1821 } else {
1822 pan_emit_general_varying(out, other, xfb, loc, format, present,
1823 quirks, gen_offsets, gen_formats, gen_stride,
1824 idx, should_alloc);
1825 }
1826 }
1827
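/* Emit an attribute buffer record for a special input (point coordinate,
 * front facing, gl_FragCoord), but only if the corresponding special varying
 * was marked present; otherwise the slot does not exist and nothing is
 * written. */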
1828 static void
1829 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1830 unsigned present,
1831 enum pan_special_varying v,
1832 unsigned special)
1833 {
1834 if (present & (1 << v)) {
1835 unsigned idx = pan_varying_index(present, v);
1836
1837 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1838 cfg.special = special;
1839 cfg.type = 0;
1840 }
1841 }
1842 }
1843
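/* Emit the varying descriptors for a draw: one attribute record per varying
 * of the vertex and fragment shaders, followed by the attribute buffers that
 * back them (the general varying buffer, gl_Position, point size, the
 * streamout targets and the special inputs). */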
1844 void
1845 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1846 unsigned vertex_count,
1847 struct mali_vertex_tiler_postfix *vertex_postfix,
1848 struct mali_vertex_tiler_postfix *tiler_postfix,
1849 union midgard_primitive_size *primitive_size)
1850 {
1851 /* Load the shaders */
1852 struct panfrost_context *ctx = batch->ctx;
1853 struct panfrost_device *dev = pan_device(ctx->base.screen);
1854 struct panfrost_shader_state *vs, *fs;
1855 size_t vs_size, fs_size;
1856
1857 /* Allocate the varying descriptor */
1858
1859 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1860 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1861 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1862 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1863
1864 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1865 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1866
1867 struct pipe_stream_output_info *so = &vs->stream_output;
1868 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1869
1870 /* Check if this varying is linked by us. This is the case for
1871 * general-purpose, non-captured varyings. If it is, link it. If it's
1872 * not, use the provided stream out information to determine the
1873 * offset, since it was already linked for us. */
1874
1875 unsigned gen_offsets[32];
1876 enum mali_format gen_formats[32];
1877 memset(gen_offsets, 0, sizeof(gen_offsets));
1878 memset(gen_formats, 0, sizeof(gen_formats));
1879
1880 unsigned gen_stride = 0;
1881 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1882 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1883
1884 unsigned streamout_offsets[32];
1885
1886 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1887 streamout_offsets[i] = panfrost_streamout_offset(
1888 so->stride[i],
1889 ctx->streamout.offsets[i],
1890 ctx->streamout.targets[i]);
1891 }
1892
1893 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1894 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1895
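/* Emit the records: the vertex stage allocates offsets in the general
 * varying buffer (should_alloc = true), then the fragment stage links
 * against those allocations (should_alloc = false, is_fragment = true). */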
1896 for (unsigned i = 0; i < vs->varying_count; i++) {
1897 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1898 ctx->streamout.num_targets, streamout_offsets,
1899 dev->quirks,
1900 gen_offsets, gen_formats, &gen_stride, i, true, false);
1901 }
1902
1903 for (unsigned i = 0; i < fs->varying_count; i++) {
1904 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1905 ctx->streamout.num_targets, streamout_offsets,
1906 dev->quirks,
1907 gen_offsets, gen_formats, &gen_stride, i, false, true);
1908 }
1909
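/* Allocate the attribute buffer records: xfb_base slots for the non-XFB
 * buffers implied by the present mask, plus one per bound streamout
 * target. */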
1910 unsigned xfb_base = pan_xfb_base(present);
1911 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1912 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1913 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1914 struct mali_attribute_buffer_packed *varyings =
1915 (struct mali_attribute_buffer_packed *) T.cpu;
1916
1917 /* Emit the stream out buffers */
1918
1919 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1920 ctx->vertex_count);
1921
1922 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1923 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1924 so->stride[i],
1925 ctx->streamout.offsets[i],
1926 out_count,
1927 ctx->streamout.targets[i]);
1928 }
1929
1930 panfrost_emit_varyings(batch,
1931 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1932 gen_stride, vertex_count);
1933
1934 /* fp32 vec4 gl_Position */
1935 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1936 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1937 sizeof(float) * 4, vertex_count);
1938
1939 if (present & (1 << PAN_VARY_PSIZ)) {
1940 primitive_size->pointer = panfrost_emit_varyings(batch,
1941 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1942 2, vertex_count);
1943 }
1944
1945 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1946 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1947 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
1948
1949 vertex_postfix->varyings = T.gpu;
1950 tiler_postfix->varyings = T.gpu;
1951
1952 vertex_postfix->varying_meta = trans.gpu;
1953 tiler_postfix->varying_meta = trans.gpu + vs_size;
1954 }
1955
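/* Emit the vertex and tiler jobs for a draw. The payload layout differs
 * between Midgard and Bifrost, so build the appropriate structure before
 * handing it to the scoreboard. The tiler job depends on the vertex job and
 * is skipped entirely when rasterizer discard is enabled. */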
1956 void
1957 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1958 struct mali_vertex_tiler_prefix *vertex_prefix,
1959 struct mali_vertex_tiler_postfix *vertex_postfix,
1960 struct mali_vertex_tiler_prefix *tiler_prefix,
1961 struct mali_vertex_tiler_postfix *tiler_postfix,
1962 union midgard_primitive_size *primitive_size)
1963 {
1964 struct panfrost_context *ctx = batch->ctx;
1965 struct panfrost_device *device = pan_device(ctx->base.screen);
1966 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
1967 struct bifrost_payload_vertex bifrost_vertex = {0,};
1968 struct bifrost_payload_tiler bifrost_tiler = {0,};
1969 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1970 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1971 void *vp, *tp;
1972 size_t vp_size, tp_size;
1973
1974 if (device->quirks & IS_BIFROST) {
1975 bifrost_vertex.prefix = *vertex_prefix;
1976 bifrost_vertex.postfix = *vertex_postfix;
1977 vp = &bifrost_vertex;
1978 vp_size = sizeof(bifrost_vertex);
1979
1980 bifrost_tiler.prefix = *tiler_prefix;
1981 bifrost_tiler.tiler.primitive_size = *primitive_size;
1982 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1983 bifrost_tiler.postfix = *tiler_postfix;
1984 tp = &bifrost_tiler;
1985 tp_size = sizeof(bifrost_tiler);
1986 } else {
1987 midgard_vertex.prefix = *vertex_prefix;
1988 midgard_vertex.postfix = *vertex_postfix;
1989 vp = &midgard_vertex;
1990 vp_size = sizeof(midgard_vertex);
1991
1992 midgard_tiler.prefix = *tiler_prefix;
1993 midgard_tiler.postfix = *tiler_postfix;
1994 midgard_tiler.primitive_size = *primitive_size;
1995 tp = &midgard_tiler;
1996 tp_size = sizeof(midgard_tiler);
1997 }
1998
1999 if (wallpapering) {
2000 /* Inject in reverse order, with "predicted" job indices.
2001 * THIS IS A HACK XXX */
2002 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2003 batch->scoreboard.job_index + 2, tp, tp_size, true);
2004 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2005 vp, vp_size, true);
2006 return;
2007 }
2008
2009         /* If rasterizer discard is enabled, only submit the vertex job */
2010
2011 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2012 vp, vp_size, false);
2013
2014 if (ctx->rasterizer->base.rasterizer_discard)
2015 return;
2016
2017 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2018 false);
2019 }
2020
2021 /* TODO: stop hardcoding this */
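/* The table below is 48 (x, y) pairs of 16-bit values, i.e. the 96 uint16_t
 * uploaded at the end. Presumably the coordinates are in 1/256ths of a pixel
 * with (128, 128) the pixel centre, but the exact layout the hardware expects
 * has not been pinned down yet. */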
2022 mali_ptr
2023 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2024 {
2025 uint16_t locations[] = {
2026 128, 128,
2027 0, 256,
2028 0, 256,
2029 0, 256,
2030 0, 256,
2031 0, 256,
2032 0, 256,
2033 0, 256,
2034 0, 256,
2035 0, 256,
2036 0, 256,
2037 0, 256,
2038 0, 256,
2039 0, 256,
2040 0, 256,
2041 0, 256,
2042 0, 256,
2043 0, 256,
2044 0, 256,
2045 0, 256,
2046 0, 256,
2047 0, 256,
2048 0, 256,
2049 0, 256,
2050 0, 256,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 128, 128,
2059 0, 0,
2060 0, 0,
2061 0, 0,
2062 0, 0,
2063 0, 0,
2064 0, 0,
2065 0, 0,
2066 0, 0,
2067 0, 0,
2068 0, 0,
2069 0, 0,
2070 0, 0,
2071 0, 0,
2072 0, 0,
2073 0, 0,
2074 };
2075
2076 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2077 }