panfrost: Specialize compute vs frag shader init
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
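/* Note: the shift/odd pair below decomposes the padded count as
 * (2k + 1) << shift, i.e. an odd factor times a power of two, which appears
 * to be how the hardware wants the instancing divisor expressed. */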
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_compute_shader_meta_init(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 meta->shader = ss->shader;
319 meta->attribute_count = ss->attribute_count;
320 meta->varying_count = ss->varying_count;
321 meta->texture_count = ctx->sampler_view_count[st];
322 meta->sampler_count = ctx->sampler_count[st];
323
324 if (dev->quirks & IS_BIFROST) {
325 meta->bifrost1.unk1 = 0x800000;
326 meta->bifrost2.preload_regs = 0xC0;
327 meta->bifrost2.uniform_count = ss->uniform_count;
328 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
329 } else {
330 meta->midgard1.uniform_count = ss->uniform_count;
331 meta->midgard1.work_count = ss->work_reg_count;
332
333 /* TODO: This is not conformant on ES3 */
334 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
335
336 meta->midgard1.flags_lo = 0x20;
337 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
338
339 SET_BIT(meta->midgard1.flags_lo, MALI_WRITES_GLOBAL, ss->writes_global);
340 }
341 }
342
343 static unsigned
344 translate_tex_wrap(enum pipe_tex_wrap w)
345 {
346 switch (w) {
347 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
348 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
349 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
350 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
351 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
352 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
353 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
354 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
355 default: unreachable("Invalid wrap");
356 }
357 }
358
359 /* The hardware compares in the wrong order, so we have to flip before
360 * encoding. Yes, really. */
361
362 static enum mali_func
363 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
364 {
365 if (!cso->compare_mode)
366 return MALI_FUNC_NEVER;
367
368 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
369 return panfrost_flip_compare_func(f);
370 }
371
372 static enum mali_mipmap_mode
373 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
374 {
375 switch (f) {
376 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
377 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
378 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
379 default: unreachable("Invalid");
380 }
381 }
382
383 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
384 struct mali_midgard_sampler_packed *hw)
385 {
386 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
387 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
388 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
389 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
390 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
391 cfg.normalized_coordinates = cso->normalized_coords;
392
393 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
394
395 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
396
397 /* If necessary, we disable mipmapping in the sampler descriptor by
398 * clamping the LOD as tight as possible (from 0 to epsilon,
399 * essentially -- remember these are fixed point numbers, so
400 * epsilon=1/256) */
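/* e.g. (assuming the 8.8 fixed-point encoding implied by the 1/256 epsilon
 * above) min_lod = 0 yields a clamp range of [0, 1/256], which effectively
 * pins sampling to level 0. */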
401
402 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
403 cfg.minimum_lod + 1 :
404 FIXED_16(cso->max_lod, false);
405
406 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
407 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
408 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
409
410 cfg.compare_function = panfrost_sampler_compare_func(cso);
411 cfg.seamless_cube_map = cso->seamless_cube_map;
412
413 cfg.border_color_r = cso->border_color.f[0];
414 cfg.border_color_g = cso->border_color.f[1];
415 cfg.border_color_b = cso->border_color.f[2];
416 cfg.border_color_a = cso->border_color.f[3];
417 }
418 }
419
420 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
421 struct mali_bifrost_sampler_packed *hw)
422 {
423 pan_pack(hw, BIFROST_SAMPLER, cfg) {
424 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
425 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
426 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
427 cfg.normalized_coordinates = cso->normalized_coords;
428
429 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
430 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
431 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
432
433 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
434 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
435 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
436
437 cfg.compare_function = panfrost_sampler_compare_func(cso);
438 cfg.seamless_cube_map = cso->seamless_cube_map;
439 }
440 }
441
442 static bool
443 panfrost_fs_required(
444 struct panfrost_shader_state *fs,
445 struct panfrost_blend_final *blend,
446 unsigned rt_count)
447 {
448 /* If we generally have side effects */
449 if (fs->fs_sidefx)
450 return true;
451
452 /* If colour is written we need to execute */
453 for (unsigned i = 0; i < rt_count; ++i) {
454 if (!blend[i].no_colour)
455 return true;
456 }
457
458 /* If depth is written and not implied we need to execute.
459 * TODO: Predicate on Z/S writes being enabled */
460 return (fs->writes_depth || fs->writes_stencil);
461 }
462
463 static void
464 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
465 struct panfrost_blend_final *blend)
466 {
467 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
468 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
469 unsigned rt_count = batch->key.nr_cbufs;
470
471 struct bifrost_blend_rt *brts = rts;
472 struct midgard_blend_rt *mrts = rts;
473
474 /* Disable blending for depth-only on Bifrost */
475
476 if (rt_count == 0 && dev->quirks & IS_BIFROST)
477 brts[0].unk2 = 0x3;
478
479 for (unsigned i = 0; i < rt_count; ++i) {
480 unsigned flags = 0;
481
482 pan_pack(&flags, BLEND_FLAGS, cfg) {
483 if (blend[i].no_colour) {
484 cfg.enable = false;
485 break;
486 }
487
488 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
489
490 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
491 cfg.load_destination = blend[i].load_dest;
492 cfg.dither_disable = !batch->ctx->blend->base.dither;
493
494 if (!(dev->quirks & IS_BIFROST))
495 cfg.midgard_blend_shader = blend[i].is_shader;
496 }
497
498 if (dev->quirks & IS_BIFROST) {
499 brts[i].flags = flags;
500
501 if (blend[i].is_shader) {
502 /* The blend shader's address needs to have
503 * the same top 32 bits as the fragment shader's.
504 * TODO: Ensure that's always the case.
505 */
506 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
507 (fs->bo->gpu & (0xffffffffull << 32)));
508 brts[i].shader = blend[i].shader.gpu;
509 brts[i].unk2 = 0x0;
510 } else {
511 enum pipe_format format = batch->key.cbufs[i]->format;
512 const struct util_format_description *format_desc;
513 format_desc = util_format_description(format);
514
515 brts[i].equation = blend[i].equation.equation;
516
517 /* TODO: this is a bit more complicated */
518 brts[i].constant = blend[i].equation.constant;
519
520 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
521
522 /* 0x19 disables blending and forces REPLACE
523 * mode (equivalent to rgb_mode = alpha_mode =
524 * x122, colour mask = 0xF). 0x1a allows
525 * blending. */
526 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
527
528 brts[i].shader_type = fs->blend_types[i];
529 }
530 } else {
531 memcpy(&mrts[i].flags, &flags, sizeof(flags));
532
533 if (blend[i].is_shader) {
534 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
535 } else {
536 mrts[i].blend.equation = blend[i].equation.equation;
537 mrts[i].blend.constant = blend[i].equation.constant;
538 }
539 }
540 }
541 }
542
543 static void
544 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
545 struct mali_shader_meta *fragmeta,
546 struct panfrost_blend_final *blend)
547 {
548 const struct panfrost_device *dev = pan_device(ctx->base.screen);
549 struct panfrost_shader_state *fs;
550
551 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
552
553 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
554 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
555
556 memset(fragmeta, 0, sizeof(*fragmeta));
557
558 fragmeta->shader = fs->shader;
559 fragmeta->attribute_count = fs->attribute_count;
560 fragmeta->varying_count = fs->varying_count;
561 fragmeta->texture_count = ctx->sampler_view_count[PIPE_SHADER_FRAGMENT];
562 fragmeta->sampler_count = ctx->sampler_count[PIPE_SHADER_FRAGMENT];
563
564 if (dev->quirks & IS_BIFROST) {
565 /* First clause ATEST |= 0x4000000.
566 * Less than 32 regs |= 0x200 */
567 fragmeta->bifrost1.unk1 = 0x950020;
568
569 fragmeta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
570 fragmeta->bifrost2.preload_regs = 0x1;
571 SET_BIT(fragmeta->bifrost2.preload_regs, 0x10, fs->reads_frag_coord);
572
573 fragmeta->bifrost2.uniform_count = fs->uniform_count;
574 } else {
575 fragmeta->midgard1.uniform_count = fs->uniform_count;
576 fragmeta->midgard1.work_count = fs->work_reg_count;
577
578 /* TODO: This is not conformant on ES3 */
579 fragmeta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
580
581 fragmeta->midgard1.flags_lo = 0x20;
582 fragmeta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
583
584 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_GLOBAL, fs->writes_global);
585 }
586
587 bool msaa = rast->multisample;
588 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
589
590 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
591 fragmeta->unknown2_4 = 0x4e0;
592
593 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
594 * is required (independent of 32-bit/64-bit descriptors), or why it's
595 * not used on later GPU revisions. Otherwise, all shader jobs fault on
596 * these earlier chips (perhaps this is a chicken bit of some kind).
597 * More investigation is needed. */
598
599 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
600
601 if (dev->quirks & IS_BIFROST) {
602 /* TODO */
603 } else {
604 /* Depending on whether it's legal in the given shader, we try to
605 * enable early-z testing. TODO: respect e-z force */
606
607 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
608 !fs->can_discard && !fs->writes_global &&
609 !fs->writes_depth && !fs->writes_stencil &&
610 !ctx->blend->base.alpha_to_coverage);
611
612 /* Add the writes Z/S flags if needed. */
613 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
614 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
615
616 /* Any time texturing is used, derivatives are implicitly calculated,
617 * so we need to enable helper invocations */
618
619 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
620 fs->helper_invocations);
621
622 /* If discard is enabled, which bit we set to convey this
623 * depends on if depth/stencil is used for the draw or not.
624 * Just one of depth OR stencil is enough to trigger this. */
625
626 bool zs_enabled =
627 fs->writes_depth || fs->writes_stencil ||
628 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
629 zsa->base.stencil[0].enabled;
630
631 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
632 fs->outputs_read || (!zs_enabled && fs->can_discard));
633 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
634 }
635
636 /* TODO: Sample size */
637 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
638 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
639
640 /* EXT_shader_framebuffer_fetch requires the shader to be run
641 * per-sample when outputs are read. */
642 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
643 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
644
645 fragmeta->depth_units = rast->offset_units * 2.0f;
646 fragmeta->depth_factor = rast->offset_scale;
647
648 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
649
650 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
651 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
652
653 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
654 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
655
656 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
657 zsa->base.stencil[0].enabled);
658
659 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
660 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
661
662 /* Bottom bits for stencil ref, exactly one word */
663 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
664
665 /* If back-stencil is not enabled, use the front values */
666
667 if (zsa->base.stencil[1].enabled)
668 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
669 else
670 fragmeta->stencil_back = fragmeta->stencil_front;
671
672 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
673 zsa->base.depth.writemask);
674
675 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
676 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
677 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
678
679 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
680 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
681 !ctx->blend->base.dither);
682
683 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
684 ctx->blend->base.alpha_to_coverage);
685
686 /* Get blending setup */
687 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
688
689 /* Disable shader execution if we can */
690 if (dev->quirks & MIDGARD_SHADERLESS
691 && !panfrost_fs_required(fs, blend, rt_count)) {
692 fragmeta->shader = 0;
693 fragmeta->attribute_count = 0;
694 fragmeta->varying_count = 0;
695 fragmeta->texture_count = 0;
696 fragmeta->sampler_count = 0;
697
698 /* This feature is not known to work on Bifrost */
699 fragmeta->midgard1.work_count = 1;
700 fragmeta->midgard1.uniform_count = 0;
701 fragmeta->midgard1.uniform_buffer_count = 0;
702 }
703
704 /* If there is a blend shader, work registers are shared. We impose 8
705 * work registers as a limit for blend shaders. Should be lower XXX */
706
707 if (!(dev->quirks & IS_BIFROST)) {
708 for (unsigned c = 0; c < rt_count; ++c) {
709 if (blend[c].is_shader) {
710 fragmeta->midgard1.work_count =
711 MAX2(fragmeta->midgard1.work_count, 8);
712 }
713 }
714 }
715
716 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
717 * copied to the blend_meta appended (by convention), but this is the
718 * field actually read by the hardware. (Or maybe both are read...?).
719 * Specify the last RTi with a blend shader. */
720
721 fragmeta->blend.shader = 0;
722
723 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
724 if (!blend[rt].is_shader)
725 continue;
726
727 fragmeta->blend.shader = blend[rt].shader.gpu |
728 blend[rt].shader.first_tag;
729 break;
730 }
731
732 if (dev->quirks & MIDGARD_SFBD) {
733 /* On single render target (SFBD) platforms, the blend
734 * information is inside the shader meta itself. We additionally
735 * need to signal CAN_DISCARD for nontrivial blend modes (so
736 * we're able to read back the destination buffer) */
737
738 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
739 blend[0].is_shader);
740
741 if (!blend[0].is_shader) {
742 fragmeta->blend.equation = blend[0].equation.equation;
743 fragmeta->blend.constant = blend[0].equation.constant;
744 }
745
746 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
747 blend[0].load_dest);
748 }
749
750 if (dev->quirks & IS_BIFROST) {
751 bool no_blend = true;
752
753 for (unsigned i = 0; i < rt_count; ++i)
754 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
755
756 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
757 !fs->can_discard && !fs->writes_depth && no_blend);
758 }
759 }
760
761 void
762 panfrost_emit_shader_meta(struct panfrost_batch *batch,
763 enum pipe_shader_type st,
764 struct mali_vertex_tiler_postfix *postfix)
765 {
766 struct panfrost_context *ctx = batch->ctx;
767 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
768
769 if (!ss) {
770 postfix->shader = 0;
771 return;
772 }
773
774 struct mali_shader_meta meta;
775
776 /* Add the shader BO to the batch. */
777 panfrost_batch_add_bo(batch, ss->bo,
778 PAN_BO_ACCESS_PRIVATE |
779 PAN_BO_ACCESS_READ |
780 panfrost_bo_access_for_stage(st));
781
782 mali_ptr shader_ptr;
783
784 if (st == PIPE_SHADER_FRAGMENT) {
785 struct panfrost_device *dev = pan_device(ctx->base.screen);
786 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
787 size_t desc_size = sizeof(meta);
788 void *rts = NULL;
789 struct panfrost_transfer xfer;
790 unsigned rt_size;
791
792 if (dev->quirks & MIDGARD_SFBD)
793 rt_size = 0;
794 else if (dev->quirks & IS_BIFROST)
795 rt_size = sizeof(struct bifrost_blend_rt);
796 else
797 rt_size = sizeof(struct midgard_blend_rt);
798
799 desc_size += rt_size * rt_count;
800
801 if (rt_size)
802 rts = rzalloc_size(ctx, rt_size * rt_count);
803
804 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
805
806 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
807 blend[c] = panfrost_get_blend_for_context(ctx, c);
808
809 panfrost_frag_shader_meta_init(ctx, &meta, blend);
810
811 if (!(dev->quirks & MIDGARD_SFBD))
812 panfrost_emit_blend(batch, rts, blend);
813 else
814 batch->draws |= PIPE_CLEAR_COLOR0;
815
816 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
817
818 memcpy(xfer.cpu, &meta, sizeof(meta));
819 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
820
821 if (rt_size)
822 ralloc_free(rts);
823
824 shader_ptr = xfer.gpu;
825 } else {
826 panfrost_compute_shader_meta_init(ctx, st, &meta);
827
828 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
829 sizeof(meta));
830 }
831
832 postfix->shader = shader_ptr;
833 }
834
835 void
836 panfrost_emit_viewport(struct panfrost_batch *batch,
837 struct mali_vertex_tiler_postfix *tiler_postfix)
838 {
839 struct panfrost_context *ctx = batch->ctx;
840 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
841 const struct pipe_scissor_state *ss = &ctx->scissor;
842 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
843 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
844
845 /* Derive min/max from translate/scale. Note since |x| >= 0 by
846 * definition, we have that -|x| <= |x| hence translate - |scale| <=
847 * translate + |scale|, so the ordering is correct here. */
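/* For example, a viewport covering x in [0, 256] arrives as
 * translate[0] = 128 and scale[0] = 128, giving vp_minx = 0 and
 * vp_maxx = 256. */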
848 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
849 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
850 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
851 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
852 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
853 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
854
855 /* Scissor to the intersection of viewport and to the scissor, clamped
856 * to the framebuffer */
857
858 unsigned minx = MIN2(fb->width, vp_minx);
859 unsigned maxx = MIN2(fb->width, vp_maxx);
860 unsigned miny = MIN2(fb->height, vp_miny);
861 unsigned maxy = MIN2(fb->height, vp_maxy);
862
863 if (ss && rast->scissor) {
864 minx = MAX2(ss->minx, minx);
865 miny = MAX2(ss->miny, miny);
866 maxx = MIN2(ss->maxx, maxx);
867 maxy = MIN2(ss->maxy, maxy);
868 }
869
870 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
871
872 pan_pack(T.cpu, VIEWPORT, cfg) {
873 cfg.scissor_minimum_x = minx;
874 cfg.scissor_minimum_y = miny;
875 cfg.scissor_maximum_x = maxx - 1;
876 cfg.scissor_maximum_y = maxy - 1;
877
878 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
879 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
880 }
881
882 tiler_postfix->viewport = T.gpu;
883 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
884 }
885
886 static mali_ptr
887 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
888 enum pipe_shader_type st,
889 struct panfrost_constant_buffer *buf,
890 unsigned index)
891 {
892 struct pipe_constant_buffer *cb = &buf->cb[index];
893 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
894
895 if (rsrc) {
896 panfrost_batch_add_bo(batch, rsrc->bo,
897 PAN_BO_ACCESS_SHARED |
898 PAN_BO_ACCESS_READ |
899 panfrost_bo_access_for_stage(st));
900
901 /* Alignment guaranteed by
902 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
903 return rsrc->bo->gpu + cb->buffer_offset;
904 } else if (cb->user_buffer) {
905 return panfrost_pool_upload_aligned(&batch->pool,
906 cb->user_buffer +
907 cb->buffer_offset,
908 cb->buffer_size, 16);
909 } else {
910 unreachable("No constant buffer");
911 }
912 }
913
914 struct sysval_uniform {
915 union {
916 float f[4];
917 int32_t i[4];
918 uint32_t u[4];
919 uint64_t du[2];
920 };
921 };
922
923 static void
924 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
925 struct sysval_uniform *uniform)
926 {
927 struct panfrost_context *ctx = batch->ctx;
928 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
929
930 uniform->f[0] = vp->scale[0];
931 uniform->f[1] = vp->scale[1];
932 uniform->f[2] = vp->scale[2];
933 }
934
935 static void
936 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
937 struct sysval_uniform *uniform)
938 {
939 struct panfrost_context *ctx = batch->ctx;
940 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
941
942 uniform->f[0] = vp->translate[0];
943 uniform->f[1] = vp->translate[1];
944 uniform->f[2] = vp->translate[2];
945 }
946
947 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
948 enum pipe_shader_type st,
949 unsigned int sysvalid,
950 struct sysval_uniform *uniform)
951 {
952 struct panfrost_context *ctx = batch->ctx;
953 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
954 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
955 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
956 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
957
958 assert(dim);
959 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
960
961 if (dim > 1)
962 uniform->i[1] = u_minify(tex->texture->height0,
963 tex->u.tex.first_level);
964
965 if (dim > 2)
966 uniform->i[2] = u_minify(tex->texture->depth0,
967 tex->u.tex.first_level);
968
969 if (is_array)
970 uniform->i[dim] = tex->texture->array_size;
971 }
972
973 static void
974 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
975 enum pipe_shader_type st,
976 unsigned ssbo_id,
977 struct sysval_uniform *uniform)
978 {
979 struct panfrost_context *ctx = batch->ctx;
980
981 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
982 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
983
984 /* Compute address */
985 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
986
987 panfrost_batch_add_bo(batch, bo,
988 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
989 panfrost_bo_access_for_stage(st));
990
991 /* Upload address and size as sysval */
992 uniform->du[0] = bo->gpu + sb.buffer_offset;
993 uniform->u[2] = sb.buffer_size;
994 }
995
996 static void
997 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
998 enum pipe_shader_type st,
999 unsigned samp_idx,
1000 struct sysval_uniform *uniform)
1001 {
1002 struct panfrost_context *ctx = batch->ctx;
1003 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1004
1005 uniform->f[0] = sampl->min_lod;
1006 uniform->f[1] = sampl->max_lod;
1007 uniform->f[2] = sampl->lod_bias;
1008
1009 /* Even without any errata, Midgard represents "no mipmapping" as
1010 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1011 * panfrost_create_sampler_state which also explains our choice of
1012 * epsilon value (again to keep behaviour consistent) */
1013
1014 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1015 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1016 }
1017
1018 static void
1019 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1020 struct sysval_uniform *uniform)
1021 {
1022 struct panfrost_context *ctx = batch->ctx;
1023
1024 uniform->u[0] = ctx->compute_grid->grid[0];
1025 uniform->u[1] = ctx->compute_grid->grid[1];
1026 uniform->u[2] = ctx->compute_grid->grid[2];
1027 }
1028
1029 static void
1030 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1031 struct panfrost_shader_state *ss,
1032 enum pipe_shader_type st)
1033 {
1034 struct sysval_uniform *uniforms = (void *)buf;
1035
1036 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1037 int sysval = ss->sysval[i];
1038
1039 switch (PAN_SYSVAL_TYPE(sysval)) {
1040 case PAN_SYSVAL_VIEWPORT_SCALE:
1041 panfrost_upload_viewport_scale_sysval(batch,
1042 &uniforms[i]);
1043 break;
1044 case PAN_SYSVAL_VIEWPORT_OFFSET:
1045 panfrost_upload_viewport_offset_sysval(batch,
1046 &uniforms[i]);
1047 break;
1048 case PAN_SYSVAL_TEXTURE_SIZE:
1049 panfrost_upload_txs_sysval(batch, st,
1050 PAN_SYSVAL_ID(sysval),
1051 &uniforms[i]);
1052 break;
1053 case PAN_SYSVAL_SSBO:
1054 panfrost_upload_ssbo_sysval(batch, st,
1055 PAN_SYSVAL_ID(sysval),
1056 &uniforms[i]);
1057 break;
1058 case PAN_SYSVAL_NUM_WORK_GROUPS:
1059 panfrost_upload_num_work_groups_sysval(batch,
1060 &uniforms[i]);
1061 break;
1062 case PAN_SYSVAL_SAMPLER:
1063 panfrost_upload_sampler_sysval(batch, st,
1064 PAN_SYSVAL_ID(sysval),
1065 &uniforms[i]);
1066 break;
1067 default:
1068 assert(0);
1069 }
1070 }
1071 }
1072
1073 static const void *
1074 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1075 unsigned index)
1076 {
1077 struct pipe_constant_buffer *cb = &buf->cb[index];
1078 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1079
1080 if (rsrc)
1081 return rsrc->bo->cpu;
1082 else if (cb->user_buffer)
1083 return cb->user_buffer;
1084 else
1085 unreachable("No constant buffer");
1086 }
1087
1088 void
1089 panfrost_emit_const_buf(struct panfrost_batch *batch,
1090 enum pipe_shader_type stage,
1091 struct mali_vertex_tiler_postfix *postfix)
1092 {
1093 struct panfrost_context *ctx = batch->ctx;
1094 struct panfrost_shader_variants *all = ctx->shader[stage];
1095
1096 if (!all)
1097 return;
1098
1099 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1100
1101 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1102
1103 /* Uniforms are implicitly UBO #0 */
1104 bool has_uniforms = buf->enabled_mask & (1 << 0);
1105
1106 /* Allocate room for the sysval and the uniforms */
1107 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1108 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1109 size_t size = sys_size + uniform_size;
1110 struct panfrost_transfer transfer =
1111 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1112
1113 /* Upload sysvals requested by the shader */
1114 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1115
1116 /* Upload uniforms */
1117 if (has_uniforms && uniform_size) {
1118 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1119 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1120 }
1121
1122 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1123 * uploaded */
1124
1125 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1126 assert(ubo_count >= 1);
1127
1128 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1129 struct panfrost_transfer ubos =
1130 panfrost_pool_alloc_aligned(&batch->pool, sz,
1131 MALI_UNIFORM_BUFFER_LENGTH);
1132
1133 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1134
1135 /* Upload uniforms as a UBO */
1136
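/* The entries field appears to count 16-byte (vec4-sized) slots, hence the
 * DIV_ROUND_UP(size, 16) below (an inference from the packing, not a
 * documented fact). */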
1137 if (size) {
1138 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1139 cfg.entries = DIV_ROUND_UP(size, 16);
1140 cfg.pointer = transfer.gpu;
1141 }
1142 } else {
1143 *ubo_ptr = 0;
1144 }
1145
1146 /* The rest are honest-to-goodness UBOs */
1147
1148 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1149 size_t usz = buf->cb[ubo].buffer_size;
1150 bool enabled = buf->enabled_mask & (1 << ubo);
1151 bool empty = usz == 0;
1152
1153 if (!enabled || empty) {
1154 ubo_ptr[ubo] = 0;
1155 continue;
1156 }
1157
1158 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1159 cfg.entries = DIV_ROUND_UP(usz, 16);
1160 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1161 stage, buf, ubo);
1162 }
1163 }
1164
1165 postfix->uniforms = transfer.gpu;
1166 postfix->uniform_buffers = ubos.gpu;
1167
1168 buf->dirty_mask = 0;
1169 }
1170
1171 void
1172 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1173 const struct pipe_grid_info *info,
1174 struct midgard_payload_vertex_tiler *vtp)
1175 {
1176 struct panfrost_context *ctx = batch->ctx;
1177 struct panfrost_device *dev = pan_device(ctx->base.screen);
1178 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1179 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1180 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1181 128));
1182
1183 unsigned log2_instances =
1184 util_logbase2_ceil(info->grid[0]) +
1185 util_logbase2_ceil(info->grid[1]) +
1186 util_logbase2_ceil(info->grid[2]);
1187
1188 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1189 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1190 shared_size,
1191 1);
1192
1193 struct mali_shared_memory shared = {
1194 .shared_memory = bo->gpu,
1195 .shared_workgroup_count = log2_instances,
1196 .shared_shift = util_logbase2(single_size) + 1
1197 };
1198
1199 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1200 sizeof(shared), 64);
1201 }
1202
1203 static mali_ptr
1204 panfrost_get_tex_desc(struct panfrost_batch *batch,
1205 enum pipe_shader_type st,
1206 struct panfrost_sampler_view *view)
1207 {
1208 if (!view)
1209 return (mali_ptr) 0;
1210
1211 struct pipe_sampler_view *pview = &view->base;
1212 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1213
1214 /* Add the BO to the job so it's retained until the job is done. */
1215
1216 panfrost_batch_add_bo(batch, rsrc->bo,
1217 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1218 panfrost_bo_access_for_stage(st));
1219
1220 panfrost_batch_add_bo(batch, view->bo,
1221 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1222 panfrost_bo_access_for_stage(st));
1223
1224 return view->bo->gpu;
1225 }
1226
1227 static void
1228 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1229 struct pipe_context *pctx)
1230 {
1231 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1232 if (view->texture_bo != rsrc->bo->gpu ||
1233 view->modifier != rsrc->modifier) {
1234 panfrost_bo_unreference(view->bo);
1235 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1236 }
1237 }
1238
1239 void
1240 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1241 enum pipe_shader_type stage,
1242 struct mali_vertex_tiler_postfix *postfix)
1243 {
1244 struct panfrost_context *ctx = batch->ctx;
1245 struct panfrost_device *device = pan_device(ctx->base.screen);
1246
1247 if (!ctx->sampler_view_count[stage])
1248 return;
1249
1250 if (device->quirks & IS_BIFROST) {
1251 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1252 MALI_BIFROST_TEXTURE_LENGTH *
1253 ctx->sampler_view_count[stage],
1254 MALI_BIFROST_TEXTURE_LENGTH);
1255
1256 struct mali_bifrost_texture_packed *out =
1257 (struct mali_bifrost_texture_packed *) T.cpu;
1258
1259 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1260 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1261 struct pipe_sampler_view *pview = &view->base;
1262 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1263
1264 panfrost_update_sampler_view(view, &ctx->base);
1265 out[i] = view->bifrost_descriptor;
1266
1267 /* Add the BOs to the job so they are retained until the job is done. */
1268
1269 panfrost_batch_add_bo(batch, rsrc->bo,
1270 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1271 panfrost_bo_access_for_stage(stage));
1272
1273 panfrost_batch_add_bo(batch, view->bo,
1274 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1275 panfrost_bo_access_for_stage(stage));
1276 }
1277
1278 postfix->textures = T.gpu;
1279 } else {
1280 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1281
1282 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1283 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1284
1285 panfrost_update_sampler_view(view, &ctx->base);
1286
1287 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1288 }
1289
1290 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1291 trampolines,
1292 sizeof(uint64_t) *
1293 ctx->sampler_view_count[stage],
1294 sizeof(uint64_t));
1295 }
1296 }
1297
1298 void
1299 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1300 enum pipe_shader_type stage,
1301 struct mali_vertex_tiler_postfix *postfix)
1302 {
1303 struct panfrost_context *ctx = batch->ctx;
1304
1305 if (!ctx->sampler_count[stage])
1306 return;
1307
1308 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1309 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1310
1311 size_t sz = desc_size * ctx->sampler_count[stage];
1312 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1313 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1314
1315 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1316 out[i] = ctx->samplers[stage][i]->hw;
1317
1318 postfix->sampler_descriptor = T.gpu;
1319 }
1320
1321 void
1322 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1323 struct mali_vertex_tiler_postfix *vertex_postfix)
1324 {
1325 struct panfrost_context *ctx = batch->ctx;
1326 struct panfrost_vertex_state *so = ctx->vertex;
1327 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1328
1329 unsigned instance_shift = vertex_postfix->instance_shift;
1330 unsigned instance_odd = vertex_postfix->instance_odd;
1331
1332 /* Worst case: everything is NPOT, which is only possible if instancing
1333 * is enabled. Otherwise a single record is guaranteed */
1334 bool could_npot = instance_shift || instance_odd;
1335
1336 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1337 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1338 (could_npot ? 2 : 1),
1339 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1340
1341 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1342 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1343 MALI_ATTRIBUTE_LENGTH);
1344
1345 struct mali_attribute_buffer_packed *bufs =
1346 (struct mali_attribute_buffer_packed *) S.cpu;
1347
1348 struct mali_attribute_packed *out =
1349 (struct mali_attribute_packed *) T.cpu;
1350
1351 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1352 unsigned k = 0;
1353
1354 for (unsigned i = 0; i < so->num_elements; ++i) {
1355 /* We map buffers 1:1 with the attributes, which
1356 * means duplicating some vertex buffers (who cares? aside from
1357 * maybe some caching implications but I somehow doubt that
1358 * matters) */
1359
1360 struct pipe_vertex_element *elem = &so->pipe[i];
1361 unsigned vbi = elem->vertex_buffer_index;
1362 attrib_to_buffer[i] = k;
1363
1364 if (!(ctx->vb_mask & (1 << vbi)))
1365 continue;
1366
1367 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1368 struct panfrost_resource *rsrc;
1369
1370 rsrc = pan_resource(buf->buffer.resource);
1371 if (!rsrc)
1372 continue;
1373
1374 /* Add a dependency of the batch on the vertex buffer */
1375 panfrost_batch_add_bo(batch, rsrc->bo,
1376 PAN_BO_ACCESS_SHARED |
1377 PAN_BO_ACCESS_READ |
1378 PAN_BO_ACCESS_VERTEX_TILER);
1379
1380 /* Mask off lower bits, see offset fixup below */
1381 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1382 mali_ptr addr = raw_addr & ~63;
1383
1384 /* Since we advanced the base pointer, we shrink the buffer
1385 * size, but add the offset we subtracted */
1386 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1387 - buf->buffer_offset;
1388
1389 /* When there is a divisor, the hardware-level divisor is
1390 * the product of the instance divisor and the padded count */
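/* e.g. (made-up numbers) instance_divisor = 3 with padded_count = 4 gives
 * hw_divisor = 12, which is not a power of two, so the magic-divisor path
 * below would be taken. */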
1391 unsigned divisor = elem->instance_divisor;
1392 unsigned hw_divisor = ctx->padded_count * divisor;
1393 unsigned stride = buf->stride;
1394
1395 /* If there's a divisor(=1) but no instancing, we want every
1396 * attribute to be the same */
1397
1398 if (divisor && ctx->instance_count == 1)
1399 stride = 0;
1400
1401 if (!divisor || ctx->instance_count <= 1) {
1402 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1403 if (ctx->instance_count > 1)
1404 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1405
1406 cfg.pointer = addr;
1407 cfg.stride = stride;
1408 cfg.size = size;
1409 cfg.divisor_r = instance_shift;
1410 cfg.divisor_p = instance_odd;
1411 }
1412 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1413 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1414 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1415 cfg.pointer = addr;
1416 cfg.stride = stride;
1417 cfg.size = size;
1418 cfg.divisor_r = __builtin_ctz(hw_divisor);
1419 }
1420
1421 } else {
1422 unsigned shift = 0, extra_flags = 0;
1423
1424 unsigned magic_divisor =
1425 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1426
1427 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1428 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1429 cfg.pointer = addr;
1430 cfg.stride = stride;
1431 cfg.size = size;
1432
1433 cfg.divisor_r = shift;
1434 cfg.divisor_e = extra_flags;
1435 }
1436
1437 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1438 cfg.divisor_numerator = magic_divisor;
1439 cfg.divisor = divisor;
1440 }
1441
1442 ++k;
1443 }
1444
1445 ++k;
1446 }
1447
1448 /* Add special gl_VertexID/gl_InstanceID buffers */
1449
1450 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1451 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1452
1453 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1454 cfg.buffer_index = k++;
1455 cfg.format = so->formats[PAN_VERTEX_ID];
1456 }
1457
1458 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1459
1460 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1461 cfg.buffer_index = k++;
1462 cfg.format = so->formats[PAN_INSTANCE_ID];
1463 }
1464 }
1465
1466 /* Attribute addresses require 64-byte alignment, so let:
1467 *
1468 * base' = base & ~63 = base - (base & 63)
1469 * offset' = offset + (base & 63)
1470 *
1471 * Since base' + offset' = base + offset, these are equivalent
1472 * addressing modes and now base is 64 aligned.
1473 */
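/* Worked example: base = 0x1046 has (base & 63) = 6, so base' = 0x1040 and
 * the 6 bytes are added back into the element's src_offset below. */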
1474
1475 unsigned start = vertex_postfix->offset_start;
1476
1477 for (unsigned i = 0; i < so->num_elements; ++i) {
1478 unsigned vbi = so->pipe[i].vertex_buffer_index;
1479 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1480
1481 /* Adjust by the masked off bits of the offset. Make sure we
1482 * read src_offset from so->hw (which is not GPU visible)
1483 * rather than target (which is) due to caching effects */
1484
1485 unsigned src_offset = so->pipe[i].src_offset;
1486
1487 /* BOs aligned to 4k so guaranteed aligned to 64 */
1488 src_offset += (buf->buffer_offset & 63);
1489
1490 /* Also, somewhat obscurely per-instance data needs to be
1491 * offset in response to a delayed start in an indexed draw */
1492
1493 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1494 src_offset -= buf->stride * start;
1495
1496 pan_pack(out + i, ATTRIBUTE, cfg) {
1497 cfg.buffer_index = attrib_to_buffer[i];
1498 cfg.format = so->formats[i];
1499 cfg.offset = src_offset;
1500 }
1501 }
1502
1503 vertex_postfix->attributes = S.gpu;
1504 vertex_postfix->attribute_meta = T.gpu;
1505 }
1506
1507 static mali_ptr
1508 panfrost_emit_varyings(struct panfrost_batch *batch,
1509 struct mali_attribute_buffer_packed *slot,
1510 unsigned stride, unsigned count)
1511 {
1512 unsigned size = stride * count;
1513 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1514
1515 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1516 cfg.stride = stride;
1517 cfg.size = size;
1518 cfg.pointer = ptr;
1519 }
1520
1521 return ptr;
1522 }
1523
1524 static unsigned
1525 panfrost_streamout_offset(unsigned stride, unsigned offset,
1526 struct pipe_stream_output_target *target)
1527 {
1528 return (target->buffer_offset + (offset * stride * 4)) & 63;
1529 }
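/* This returns the sub-64-byte remainder that panfrost_emit_streamout masks
 * off the buffer address; callers fold it back into the varying record's
 * offset (see pan_emit_vary_xfb). */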
1530
1531 static void
1532 panfrost_emit_streamout(struct panfrost_batch *batch,
1533 struct mali_attribute_buffer_packed *slot,
1534 unsigned stride_words, unsigned offset, unsigned count,
1535 struct pipe_stream_output_target *target)
1536 {
1537 unsigned stride = stride_words * 4;
1538 unsigned max_size = target->buffer_size;
1539 unsigned expected_size = stride * count;
1540
1541 /* Grab the BO and bind it to the batch */
1542 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1543
1544 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1545 * the perspective of the TILER and FRAGMENT.
1546 */
1547 panfrost_batch_add_bo(batch, bo,
1548 PAN_BO_ACCESS_SHARED |
1549 PAN_BO_ACCESS_RW |
1550 PAN_BO_ACCESS_VERTEX_TILER |
1551 PAN_BO_ACCESS_FRAGMENT);
1552
1553 /* We will have an offset applied to get alignment */
1554 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1555
1556 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1557 cfg.pointer = (addr & ~63);
1558 cfg.stride = stride;
1559 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1560 }
1561 }
1562
1563 static bool
1564 has_point_coord(unsigned mask, gl_varying_slot loc)
1565 {
1566 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1567 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1568 else if (loc == VARYING_SLOT_PNTC)
1569 return (mask & (1 << 8));
1570 else
1571 return false;
1572 }
1573
1574 /* Helpers for manipulating stream out information so we can pack varyings
1575 * accordingly. Compute the src_offset for a given captured varying */
1576
1577 static struct pipe_stream_output *
1578 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1579 {
1580 for (unsigned i = 0; i < info->num_outputs; ++i) {
1581 if (info->output[i].register_index == loc)
1582 return &info->output[i];
1583 }
1584
1585 unreachable("Varying not captured");
1586 }
1587
1588 static unsigned
1589 pan_varying_size(enum mali_format fmt)
1590 {
1591 unsigned type = MALI_EXTRACT_TYPE(fmt);
1592 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1593 unsigned bits = MALI_EXTRACT_BITS(fmt);
1594 unsigned bpc = 0;
1595
1596 if (bits == MALI_CHANNEL_FLOAT) {
1597 /* No doubles */
1598 bool fp16 = (type == MALI_FORMAT_SINT);
1599 assert(fp16 || (type == MALI_FORMAT_UNORM));
1600
1601 bpc = fp16 ? 2 : 4;
1602 } else {
1603 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1604
1605 /* See the enums */
1606 bits = 1 << bits;
1607 assert(bits >= 8);
1608 bpc = bits / 8;
1609 }
1610
1611 return bpc * chan;
1612 }
1613
1614 /* Indices for named (non-XFB) varyings that are present. These are packed
1615 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1616 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1617 * of a given special field given a shift S by:
1618 *
1619 * idx = popcount(P & ((1 << S) - 1))
1620 *
1621 * That is... look at all of the varyings that come earlier and count how many
1622 * are present; that count is the index of this one. Likewise, the total number of special
1623 * buffers required is simply popcount(P)
1624 */
1625
1626 enum pan_special_varying {
1627 PAN_VARY_GENERAL = 0,
1628 PAN_VARY_POSITION = 1,
1629 PAN_VARY_PSIZ = 2,
1630 PAN_VARY_PNTCOORD = 3,
1631 PAN_VARY_FACE = 4,
1632 PAN_VARY_FRAGCOORD = 5,
1633
1634 /* Keep last */
1635 PAN_VARY_MAX,
1636 };
1637
1638 /* Given a varying, figure out which index it corresponds to */
1639
1640 static inline unsigned
1641 pan_varying_index(unsigned present, enum pan_special_varying v)
1642 {
1643 unsigned mask = (1 << v) - 1;
1644 return util_bitcount(present & mask);
1645 }
1646
1647 /* Get the base offset for XFB buffers, which by convention come after
1648 * everything else. Wrapper function for semantic reasons; by construction this
1649 * is just popcount. */
1650
1651 static inline unsigned
1652 pan_xfb_base(unsigned present)
1653 {
1654 return util_bitcount(present);
1655 }
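/* Worked example with made-up values: present = GENERAL | POSITION | PSIZ
 * (bits 0-2) gives pan_varying_index(present, PAN_VARY_PSIZ) =
 * popcount(0b011) = 2, so point size is the third record, and XFB buffers
 * start at pan_xfb_base(present) = 3. */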
1656
1657 /* Computes the present mask for varyings so we can start emitting varying records */
1658
1659 static inline unsigned
1660 pan_varying_present(
1661 struct panfrost_shader_state *vs,
1662 struct panfrost_shader_state *fs,
1663 unsigned quirks)
1664 {
1665 /* At the moment we always emit general and position buffers. Not
1666 * strictly necessary but usually harmless */
1667
1668 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1669
1670 /* Enable special buffers by the shader info */
1671
1672 if (vs->writes_point_size)
1673 present |= (1 << PAN_VARY_PSIZ);
1674
1675 if (fs->reads_point_coord)
1676 present |= (1 << PAN_VARY_PNTCOORD);
1677
1678 if (fs->reads_face)
1679 present |= (1 << PAN_VARY_FACE);
1680
1681 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1682 present |= (1 << PAN_VARY_FRAGCOORD);
1683
1684 /* Also, if we have a point sprite, we need a point coord buffer */
1685
1686 for (unsigned i = 0; i < fs->varying_count; i++) {
1687 gl_varying_slot loc = fs->varyings_loc[i];
1688
1689 if (has_point_coord(fs->point_sprite_mask, loc))
1690 present |= (1 << PAN_VARY_PNTCOORD);
1691 }
1692
1693 return present;
1694 }
1695
1696 /* Emitters for varying records */
1697
1698 static void
1699 pan_emit_vary(struct mali_attribute_packed *out,
1700 unsigned present, enum pan_special_varying buf,
1701 unsigned quirks, enum mali_format format,
1702 unsigned offset)
1703 {
1704 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1705 unsigned swizzle = quirks & HAS_SWIZZLES ?
1706 panfrost_get_default_swizzle(nr_channels) :
1707 panfrost_bifrost_swizzle(nr_channels);
1708
1709 pan_pack(out, ATTRIBUTE, cfg) {
1710 cfg.buffer_index = pan_varying_index(present, buf);
1711 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1712 cfg.format = (format << 12) | swizzle;
1713 cfg.offset = offset;
1714 }
1715 }
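
/* Note on the format word above: the mali_format enum sits in the upper bits
 * with the channel swizzle in the low 12 bits, which is why the format is
 * shifted left by 12 and OR'd with the per-channel swizzle. */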
1716
1717 /* General varying that is unused */
1718
1719 static void
1720 pan_emit_vary_only(struct mali_attribute_packed *out,
1721 unsigned present, unsigned quirks)
1722 {
1723 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1724 }
1725
1726 /* Special records */
1727
1728 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1729 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1730 [PAN_VARY_PSIZ] = MALI_R16F,
1731 [PAN_VARY_PNTCOORD] = MALI_R16F,
1732 [PAN_VARY_FACE] = MALI_R32I,
1733 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1734 };
1735
1736 static void
1737 pan_emit_vary_special(struct mali_attribute_packed *out,
1738 unsigned present, enum pan_special_varying buf,
1739 unsigned quirks)
1740 {
1741 assert(buf < PAN_VARY_MAX);
1742 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1743 }
1744
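/* Pick the format used when a varying is captured to a transform feedback
 * buffer: the channel count is overridden to the number of captured
 * components and every channel is widened to 32 bits, matching the "highp"
 * override in pan_emit_vary_xfb below. */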
1745 static enum mali_format
1746 pan_xfb_format(enum mali_format format, unsigned nr)
1747 {
1748 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1749 return MALI_R32F | MALI_NR_CHANNELS(nr);
1750 else
1751 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1752 }
1753
1754 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1755 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1756 * value. */
1757
1758 static void
1759 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1760 unsigned present,
1761 unsigned max_xfb,
1762 unsigned *streamout_offsets,
1763 unsigned quirks,
1764 enum mali_format format,
1765 struct pipe_stream_output o)
1766 {
1767 unsigned swizzle = quirks & HAS_SWIZZLES ?
1768 panfrost_get_default_swizzle(o.num_components) :
1769 panfrost_bifrost_swizzle(o.num_components);
1770
1771 pan_pack(out, ATTRIBUTE, cfg) {
1772 /* XFB buffers come after everything else */
1773 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1774 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1775
1776 /* Override number of channels and precision to highp */
1777 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1778
1779                 /* Combine the output's dword offset with the buffer's streamout offset */
1780 cfg.offset = (o.dst_offset * 4) /* dwords */
1781 + streamout_offsets[o.output_buffer];
1782 }
1783 }
1784
1785 /* Determine if we should capture a varying for XFB. This requires actually
1786  * having a buffer for it. If we don't capture it, we'll fall back to a general
1787 * varying path (linked or unlinked, possibly discarding the write) */
1788
1789 static bool
1790 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1791 unsigned loc, unsigned max_xfb)
1792 {
1793 if (!(xfb->so_mask & (1ll << loc)))
1794 return false;
1795
1796 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1797 return o->output_buffer < max_xfb;
1798 }
1799
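/* Emit a general-purpose varying. If the other shader stage does not consume
 * this slot (we are unlinked), the write is simply discarded. Otherwise, when
 * should_alloc is set, space is carved out of the general varying buffer with
 * a watermark allocator (gen_stride), and the chosen offset and format are
 * recorded so the second (fragment) pass can reuse them. */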
1800 static void
1801 pan_emit_general_varying(struct mali_attribute_packed *out,
1802 struct panfrost_shader_state *other,
1803 struct panfrost_shader_state *xfb,
1804 gl_varying_slot loc,
1805 enum mali_format format,
1806 unsigned present,
1807 unsigned quirks,
1808 unsigned *gen_offsets,
1809 enum mali_format *gen_formats,
1810 unsigned *gen_stride,
1811 unsigned idx,
1812 bool should_alloc)
1813 {
1814 /* Check if we're linked */
1815 signed other_idx = -1;
1816
1817 for (unsigned j = 0; j < other->varying_count; ++j) {
1818 if (other->varyings_loc[j] == loc) {
1819 other_idx = j;
1820 break;
1821 }
1822 }
1823
1824 if (other_idx < 0) {
1825 pan_emit_vary_only(out, present, quirks);
1826 return;
1827 }
1828
1829 unsigned offset = gen_offsets[other_idx];
1830
1831 if (should_alloc) {
1832                 /* We're linked, so allocate space via a watermark allocation */
1833 enum mali_format alt = other->varyings[other_idx];
1834
1835 /* Do interpolation at minimum precision */
1836 unsigned size_main = pan_varying_size(format);
1837 unsigned size_alt = pan_varying_size(alt);
1838 unsigned size = MIN2(size_main, size_alt);
1839
1840 /* If a varying is marked for XFB but not actually captured, we
1841 * should match the format to the format that would otherwise
1842 * be used for XFB, since dEQP checks for invariance here. It's
1843 * unclear if this is required by the spec. */
1844
1845 if (xfb->so_mask & (1ull << loc)) {
1846 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1847 format = pan_xfb_format(format, o->num_components);
1848 size = pan_varying_size(format);
1849 } else if (size == size_alt) {
1850 format = alt;
1851 }
1852
1853 gen_offsets[idx] = *gen_stride;
1854 gen_formats[other_idx] = format;
1855 offset = *gen_stride;
1856 *gen_stride += size;
1857 }
1858
1859 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1860 }
1861
1862 /* Higher-level wrapper around all of the above, classifying a varying into one
1863 * of the above types */
1864
1865 static void
1866 panfrost_emit_varying(
1867 struct mali_attribute_packed *out,
1868 struct panfrost_shader_state *stage,
1869 struct panfrost_shader_state *other,
1870 struct panfrost_shader_state *xfb,
1871 unsigned present,
1872 unsigned max_xfb,
1873 unsigned *streamout_offsets,
1874 unsigned quirks,
1875 unsigned *gen_offsets,
1876 enum mali_format *gen_formats,
1877 unsigned *gen_stride,
1878 unsigned idx,
1879 bool should_alloc,
1880 bool is_fragment)
1881 {
1882 gl_varying_slot loc = stage->varyings_loc[idx];
1883 enum mali_format format = stage->varyings[idx];
1884
1885 /* Override format to match linkage */
1886 if (!should_alloc && gen_formats[idx])
1887 format = gen_formats[idx];
1888
1889 if (has_point_coord(stage->point_sprite_mask, loc)) {
1890 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1891 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1892 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1893 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1894 } else if (loc == VARYING_SLOT_POS) {
1895 if (is_fragment)
1896 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1897 else
1898 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1899 } else if (loc == VARYING_SLOT_PSIZ) {
1900 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1901 } else if (loc == VARYING_SLOT_PNTC) {
1902 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1903 } else if (loc == VARYING_SLOT_FACE) {
1904 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1905 } else {
1906 pan_emit_general_varying(out, other, xfb, loc, format, present,
1907 quirks, gen_offsets, gen_formats, gen_stride,
1908 idx, should_alloc);
1909 }
1910 }
1911
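/* If the given special varying is present, emit an attribute buffer record
 * for it flagged as a special input (point coordinate, front-facing bit or
 * fragment coordinate) rather than pointing at a buffer in memory. */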
1912 static void
1913 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1914 unsigned present,
1915 enum pan_special_varying v,
1916 unsigned special)
1917 {
1918 if (present & (1 << v)) {
1919 unsigned idx = pan_varying_index(present, v);
1920
1921 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1922 cfg.special = special;
1923 cfg.type = 0;
1924 }
1925 }
1926 }
1927
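/* Build the varying descriptors for a draw: ATTRIBUTE records for every
 * vertex and fragment shader varying (linking general varyings between the
 * two stages), followed by ATTRIBUTE_BUFFER records for the general,
 * position and point size buffers, any streamout buffers, and the special
 * inputs. */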
1928 void
1929 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1930 unsigned vertex_count,
1931 struct mali_vertex_tiler_postfix *vertex_postfix,
1932 struct mali_vertex_tiler_postfix *tiler_postfix,
1933 union midgard_primitive_size *primitive_size)
1934 {
1935 /* Load the shaders */
1936 struct panfrost_context *ctx = batch->ctx;
1937 struct panfrost_device *dev = pan_device(ctx->base.screen);
1938 struct panfrost_shader_state *vs, *fs;
1939 size_t vs_size, fs_size;
1940
1941 /* Allocate the varying descriptor */
1942
1943 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1944 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1945 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1946 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1947
1948 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1949 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1950
1951 struct pipe_stream_output_info *so = &vs->stream_output;
1952 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1953
1954 /* Check if this varying is linked by us. This is the case for
1955 * general-purpose, non-captured varyings. If it is, link it. If it's
1956 * not, use the provided stream out information to determine the
1957 * offset, since it was already linked for us. */
1958
1959 unsigned gen_offsets[32];
1960 enum mali_format gen_formats[32];
1961 memset(gen_offsets, 0, sizeof(gen_offsets));
1962 memset(gen_formats, 0, sizeof(gen_formats));
1963
1964 unsigned gen_stride = 0;
1965 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1966 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1967
1968 unsigned streamout_offsets[32];
1969
1970 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1971 streamout_offsets[i] = panfrost_streamout_offset(
1972 so->stride[i],
1973 ctx->streamout.offsets[i],
1974 ctx->streamout.targets[i]);
1975 }
1976
1977 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1978 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1979
1980 for (unsigned i = 0; i < vs->varying_count; i++) {
1981 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1982 ctx->streamout.num_targets, streamout_offsets,
1983 dev->quirks,
1984 gen_offsets, gen_formats, &gen_stride, i, true, false);
1985 }
1986
1987 for (unsigned i = 0; i < fs->varying_count; i++) {
1988 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1989 ctx->streamout.num_targets, streamout_offsets,
1990 dev->quirks,
1991 gen_offsets, gen_formats, &gen_stride, i, false, true);
1992 }
1993
1994 unsigned xfb_base = pan_xfb_base(present);
1995 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1996 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1997 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1998 struct mali_attribute_buffer_packed *varyings =
1999 (struct mali_attribute_buffer_packed *) T.cpu;
2000
2001 /* Emit the stream out buffers */
2002
2003 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2004 ctx->vertex_count);
2005
2006 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2007 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2008 so->stride[i],
2009 ctx->streamout.offsets[i],
2010 out_count,
2011 ctx->streamout.targets[i]);
2012 }
2013
2014 panfrost_emit_varyings(batch,
2015 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2016 gen_stride, vertex_count);
2017
2018 /* fp32 vec4 gl_Position */
2019 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2020 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2021 sizeof(float) * 4, vertex_count);
2022
2023 if (present & (1 << PAN_VARY_PSIZ)) {
2024 primitive_size->pointer = panfrost_emit_varyings(batch,
2025 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2026 2, vertex_count);
2027 }
2028
2029 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2030 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2031 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2032
2033 vertex_postfix->varyings = T.gpu;
2034 tiler_postfix->varyings = T.gpu;
2035
2036 vertex_postfix->varying_meta = trans.gpu;
2037 tiler_postfix->varying_meta = trans.gpu + vs_size;
2038 }
2039
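/* Wrap the assembled prefix/postfix pairs into the Bifrost or Midgard
 * payload layout and submit the vertex job, followed by a tiler job that
 * depends on it. Wallpaper draws are injected in reverse order with
 * predicted job indices, and the tiler job is skipped entirely when
 * rasterizer discard is enabled. */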
2040 void
2041 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2042 struct mali_vertex_tiler_prefix *vertex_prefix,
2043 struct mali_vertex_tiler_postfix *vertex_postfix,
2044 struct mali_vertex_tiler_prefix *tiler_prefix,
2045 struct mali_vertex_tiler_postfix *tiler_postfix,
2046 union midgard_primitive_size *primitive_size)
2047 {
2048 struct panfrost_context *ctx = batch->ctx;
2049 struct panfrost_device *device = pan_device(ctx->base.screen);
2050 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2051 struct bifrost_payload_vertex bifrost_vertex = {0,};
2052 struct bifrost_payload_tiler bifrost_tiler = {0,};
2053 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2054 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2055 void *vp, *tp;
2056 size_t vp_size, tp_size;
2057
2058 if (device->quirks & IS_BIFROST) {
2059 bifrost_vertex.prefix = *vertex_prefix;
2060 bifrost_vertex.postfix = *vertex_postfix;
2061 vp = &bifrost_vertex;
2062 vp_size = sizeof(bifrost_vertex);
2063
2064 bifrost_tiler.prefix = *tiler_prefix;
2065 bifrost_tiler.tiler.primitive_size = *primitive_size;
2066 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2067 bifrost_tiler.postfix = *tiler_postfix;
2068 tp = &bifrost_tiler;
2069 tp_size = sizeof(bifrost_tiler);
2070 } else {
2071 midgard_vertex.prefix = *vertex_prefix;
2072 midgard_vertex.postfix = *vertex_postfix;
2073 vp = &midgard_vertex;
2074 vp_size = sizeof(midgard_vertex);
2075
2076 midgard_tiler.prefix = *tiler_prefix;
2077 midgard_tiler.postfix = *tiler_postfix;
2078 midgard_tiler.primitive_size = *primitive_size;
2079 tp = &midgard_tiler;
2080 tp_size = sizeof(midgard_tiler);
2081 }
2082
2083 if (wallpapering) {
2084 /* Inject in reverse order, with "predicted" job indices.
2085 * THIS IS A HACK XXX */
2086 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2087 batch->scoreboard.job_index + 2, tp, tp_size, true);
2088 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2089 vp, vp_size, true);
2090 return;
2091 }
2092
2093         /* If rasterizer discard is enabled, only submit the vertex job */
2094
2095 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2096 vp, vp_size, false);
2097
2098 if (ctx->rasterizer->base.rasterizer_discard)
2099 return;
2100
2101 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2102 false);
2103 }
2104
2105 /* TODO: stop hardcoding this */
2106 mali_ptr
2107 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2108 {
2109 uint16_t locations[] = {
2110 128, 128,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 0, 256,
2136 0, 256,
2137 0, 256,
2138 0, 256,
2139 0, 256,
2140 0, 256,
2141 0, 256,
2142 128, 128,
2143 0, 0,
2144 0, 0,
2145 0, 0,
2146 0, 0,
2147 0, 0,
2148 0, 0,
2149 0, 0,
2150 0, 0,
2151 0, 0,
2152 0, 0,
2153 0, 0,
2154 0, 0,
2155 0, 0,
2156 0, 0,
2157 0, 0,
2158 };
2159
2160 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2161 }