panfrost: XMLify beginning of shader descriptor
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
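/* (Downstream, panfrost_vt_set_draw_info() uses max_index - min_index + 1 as
 * the effective vertex count and rebases the draw on min_index, which is why
 * the bounds and the index upload want to be computed in one place.) */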
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
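/* Sketch of the encoding used below (not taken from hardware documentation):
 * the padded count is stored as a (shift, odd) pair with
 * padded = ((odd << 1) | 1) << shift. E.g. a padded count of 20 = 5 << 2
 * gives shift = __builtin_ctz(20) = 2 and odd = 20 >> (2 + 1) = 2, and
 * ((2 << 1) | 1) << 2 round-trips back to 20. */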
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 memcpy(&meta->shader, &ss->shader, sizeof(ss->shader));
319
320 if (dev->quirks & IS_BIFROST) {
321 struct mali_bifrost_properties_packed prop;
322 struct mali_preload_vertex_packed preload;
323
324 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
325 cfg.unknown = 0x800000; /* XXX */
326 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
327 }
328
329 /* TODO: True compute shaders */
330 pan_pack(&preload, PRELOAD_VERTEX, cfg) {
331 cfg.uniform_count = ss->uniform_count;
332 cfg.vertex_id = true;
333 cfg.instance_id = true;
334 }
335
336 memcpy(&meta->bifrost_props, &prop, sizeof(prop));
337 memcpy(&meta->bifrost_preload, &preload, sizeof(preload));
338 } else {
339 struct mali_midgard_properties_packed prop;
340
341 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
342 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
343 cfg.uniform_count = ss->uniform_count;
344 cfg.work_register_count = ss->work_reg_count;
345 cfg.writes_globals = ss->writes_global;
346 cfg.suppress_inf_nan = true; /* XXX */
347 }
348
349 memcpy(&meta->midgard_props, &prop, sizeof(prop));
350 }
351 }
352
353 static unsigned
354 translate_tex_wrap(enum pipe_tex_wrap w)
355 {
356 switch (w) {
357 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
358 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
359 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
360 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
361 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
362 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
363 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
364 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
365 default: unreachable("Invalid wrap");
366 }
367 }
368
369 /* The hardware compares in the wrong order, so we have to flip before
370 * encoding. Yes, really. */
371
372 static enum mali_func
373 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
374 {
375 if (!cso->compare_mode)
376 return MALI_FUNC_NEVER;
377
378 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
379 return panfrost_flip_compare_func(f);
380 }
381
382 static enum mali_mipmap_mode
383 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
384 {
385 switch (f) {
386 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
387 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
388 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
389 default: unreachable("Invalid");
390 }
391 }
392
393 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
394 struct mali_midgard_sampler_packed *hw)
395 {
396 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
397 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
398 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
399 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
400 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
401 cfg.normalized_coordinates = cso->normalized_coords;
402
403 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
404
405 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
406
407 /* If necessary, we disable mipmapping in the sampler descriptor by
408 * clamping the LOD as tight as possible (from 0 to epsilon,
409 * essentially -- remember these are fixed point numbers, so
410 * epsilon=1/256) */
411
412 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
413 cfg.minimum_lod + 1 :
414 FIXED_16(cso->max_lod, false);
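/* Concretely (assuming the 8 fractional bits implied by epsilon = 1/256):
 * with min_lod = 0 the clamp becomes [0, 1/256], so only level 0 is ever
 * sampled even though mipmap_mode above still reads MALI_MIPMAP_MODE_NEAREST. */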
415
416 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
417 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
418 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
419
420 cfg.compare_function = panfrost_sampler_compare_func(cso);
421 cfg.seamless_cube_map = cso->seamless_cube_map;
422
423 cfg.border_color_r = cso->border_color.f[0];
424 cfg.border_color_g = cso->border_color.f[1];
425 cfg.border_color_b = cso->border_color.f[2];
426 cfg.border_color_a = cso->border_color.f[3];
427 }
428 }
429
430 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
431 struct mali_bifrost_sampler_packed *hw)
432 {
433 pan_pack(hw, BIFROST_SAMPLER, cfg) {
434 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
435 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
436 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
437 cfg.normalized_coordinates = cso->normalized_coords;
438
439 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
440 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
441 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
442
443 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
444 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
445 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
446
447 cfg.compare_function = panfrost_sampler_compare_func(cso);
448 cfg.seamless_cube_map = cso->seamless_cube_map;
449 }
450 }
451
452 static bool
453 panfrost_fs_required(
454 struct panfrost_shader_state *fs,
455 struct panfrost_blend_final *blend,
456 unsigned rt_count)
457 {
458 /* If we generally have side effects */
459 if (fs->fs_sidefx)
460 return true;
461
462 /* If colour is written we need to execute */
463 for (unsigned i = 0; i < rt_count; ++i) {
464 if (!blend[i].no_colour)
465 return true;
466 }
467
468 /* If depth is written and not implied we need to execute.
469 * TODO: Predicate on Z/S writes being enabled */
470 return (fs->writes_depth || fs->writes_stencil);
471 }
472
473 static void
474 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
475 struct panfrost_blend_final *blend)
476 {
477 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
478 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
479 unsigned rt_count = batch->key.nr_cbufs;
480
481 struct bifrost_blend_rt *brts = rts;
482 struct midgard_blend_rt *mrts = rts;
483
484 /* Disable blending for depth-only on Bifrost */
485
486 if (rt_count == 0 && dev->quirks & IS_BIFROST)
487 brts[0].unk2 = 0x3;
488
489 for (unsigned i = 0; i < rt_count; ++i) {
490 unsigned flags = 0;
491
492 pan_pack(&flags, BLEND_FLAGS, cfg) {
493 if (blend[i].no_colour) {
494 cfg.enable = false;
495 break;
496 }
497
498 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
499
500 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
501 cfg.load_destination = blend[i].load_dest;
502 cfg.dither_disable = !batch->ctx->blend->base.dither;
503
504 if (!(dev->quirks & IS_BIFROST))
505 cfg.midgard_blend_shader = blend[i].is_shader;
506 }
507
508 if (dev->quirks & IS_BIFROST) {
509 brts[i].flags = flags;
510
511 if (blend[i].is_shader) {
512 /* The blend shader's address needs to be at
513 * the same top 32 bit as the fragment shader.
514 * TODO: Ensure that's always the case.
515 */
516 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
517 (fs->bo->gpu & (0xffffffffull << 32)));
518 brts[i].shader = blend[i].shader.gpu;
519 brts[i].unk2 = 0x0;
520 } else {
521 enum pipe_format format = batch->key.cbufs[i]->format;
522 const struct util_format_description *format_desc;
523 format_desc = util_format_description(format);
524
525 brts[i].equation = blend[i].equation.equation;
526
527 /* TODO: this is a bit more complicated */
528 brts[i].constant = blend[i].equation.constant;
529
530 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
531
532 /* 0x19 disables blending and forces REPLACE
533 * mode (equivalent to rgb_mode = alpha_mode =
534 * x122, colour mask = 0xF). 0x1a allows
535 * blending. */
536 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
537
538 brts[i].shader_type = fs->blend_types[i];
539 }
540 } else {
541 memcpy(&mrts[i].flags, &flags, sizeof(flags));
542
543 if (blend[i].is_shader) {
544 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
545 } else {
546 mrts[i].blend.equation = blend[i].equation.equation;
547 mrts[i].blend.constant = blend[i].equation.constant;
548 }
549 }
550 }
551 }
552
553 static struct mali_shader_packed
554 panfrost_pack_shaderless(bool midgard)
555 {
556 struct mali_shader_packed pack;
557
558 pan_pack(&pack, SHADER, cfg) {
559 cfg.shader = midgard ? 0x1 : 0x0;
560 }
561
562 return pack;
563 }
564
565 static void
566 panfrost_emit_frag_shader(struct panfrost_context *ctx,
567 struct mali_shader_meta *fragmeta,
568 struct panfrost_blend_final *blend)
569 {
570 const struct panfrost_device *dev = pan_device(ctx->base.screen);
571 struct panfrost_shader_state *fs;
572
573 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
574
575 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
576 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
577 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
578
579 memset(fragmeta, 0, sizeof(*fragmeta));
580 memcpy(&fragmeta->shader, &fs->shader, sizeof(fs->shader));
581
582 if (dev->quirks & IS_BIFROST) {
583 struct mali_bifrost_properties_packed prop;
584 struct mali_preload_fragment_packed preload;
585
586 bool no_blend = true;
587
588 for (unsigned i = 0; i < rt_count; ++i)
589 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
590
591 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
592 cfg.unknown = 0x950020; /* XXX */
593 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
594 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
595 }
596
597 pan_pack(&preload, PRELOAD_FRAGMENT, cfg) {
598 cfg.uniform_count = fs->uniform_count;
599 cfg.fragment_position = fs->reads_frag_coord;
600 }
601
602 memcpy(&fragmeta->bifrost_props, &prop, sizeof(prop));
603 memcpy(&fragmeta->bifrost_preload, &preload, sizeof(preload));
604 } else {
605 struct mali_midgard_properties_packed prop;
606
607 /* Reasons to disable early-Z from a shader perspective */
608 bool late_z = fs->can_discard || fs->writes_global ||
609 fs->writes_depth || fs->writes_stencil;
610
611 /* Reasons to disable early-Z from a CSO perspective */
612 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
613
614 /* If either depth or stencil is enabled, discard matters */
615 bool zs_enabled =
616 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
617 zsa->base.stencil[0].enabled;
618
619 bool has_blend_shader = false;
620
621 for (unsigned c = 0; c < rt_count; ++c)
622 has_blend_shader |= blend[c].is_shader;
623
624 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
625 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
626 cfg.uniform_count = fs->uniform_count;
627 cfg.work_register_count = fs->work_reg_count;
628 cfg.writes_globals = fs->writes_global;
629 cfg.suppress_inf_nan = true; /* XXX */
630
631 /* TODO: Reduce this limit? */
632 if (has_blend_shader)
633 cfg.work_register_count = MAX2(cfg.work_register_count, 8);
634
635 cfg.stencil_from_shader = fs->writes_stencil;
636 cfg.helper_invocation_enable = fs->helper_invocations;
637 cfg.depth_source = fs->writes_depth ?
638 MALI_DEPTH_SOURCE_SHADER :
639 MALI_DEPTH_SOURCE_FIXED_FUNCTION;
640
641 /* Depend on other state */
642 cfg.early_z_enable = !(late_z || alpha_to_coverage);
643 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
644 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
645 }
646
647 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
648 }
649
650 bool msaa = rast->multisample;
651 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
652
653 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
654 fragmeta->unknown2_4 = 0x4e0;
655
656 /* TODO: Sample size */
657 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
658 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
659
660 /* EXT_shader_framebuffer_fetch requires the shader to be run
661 * per-sample when outputs are read. */
662 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
663 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
664
665 fragmeta->depth_units = rast->offset_units * 2.0f;
666 fragmeta->depth_factor = rast->offset_scale;
667
668 /* XXX: Which bit is which? Does this maybe allow offsetting non-triangles? */
669
670 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
671 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
672
673 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
674 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
675
676 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
677 zsa->base.stencil[0].enabled);
678
679 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
680 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
681
682 /* Bottom bits for stencil ref, exactly one word */
683 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
684
685 /* If back-stencil is not enabled, use the front values */
686
687 if (zsa->base.stencil[1].enabled)
688 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
689 else
690 fragmeta->stencil_back = fragmeta->stencil_front;
691
692 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
693 zsa->base.depth.writemask);
694
695 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
696 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
697 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
698
699 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
700 ctx->blend->base.alpha_to_coverage);
701
702 /* Disable shader execution if we can */
703 if (!panfrost_fs_required(fs, blend, rt_count)) {
704 struct mali_shader_packed shader =
705 panfrost_pack_shaderless(!(dev->quirks & IS_BIFROST));
706
707 memcpy(&fragmeta->shader, &shader, sizeof(shader));
708
709 struct mali_midgard_properties_packed prop;
710
711 if (dev->quirks & IS_BIFROST) {
712 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
713 cfg.unknown = 0x950020; /* XXX */
714 cfg.early_z_enable = true;
715 }
716 } else {
717 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
718 cfg.work_register_count = 1;
719 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
720 cfg.early_z_enable = true;
721 }
722 }
723
724 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
725 }
726
727 if (dev->quirks & MIDGARD_SFBD) {
728 /* On single render target (SFBD) platforms, the blend
729 * information is inside the shader meta itself. We additionally
730 * need to signal CAN_DISCARD for nontrivial blend modes (so
731 * we're able to read back the destination buffer) */
732
733 if (blend[0].no_colour)
734 return;
735
736 fragmeta->unknown2_4 |= MALI_SFBD_ENABLE;
737
738 SET_BIT(fragmeta->unknown2_4, MALI_SFBD_SRGB,
739 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format));
740
741 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
742 blend[0].is_shader);
743
744 if (blend[0].is_shader) {
745 fragmeta->blend.shader = blend[0].shader.gpu |
746 blend[0].shader.first_tag;
747 } else {
748 fragmeta->blend.equation = blend[0].equation.equation;
749 fragmeta->blend.constant = blend[0].equation.constant;
750 }
751
752 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
753 blend[0].load_dest);
754
755 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER, !ctx->blend->base.dither);
756 } else if (!(dev->quirks & IS_BIFROST)) {
757 /* Bug where MRT-capable hw apparently reads the last blend
758 * shader from here instead of the usual location? */
759
760 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
761 if (!blend[rt].is_shader)
762 continue;
763
764 fragmeta->blend.shader = blend[rt].shader.gpu |
765 blend[rt].shader.first_tag;
766 break;
767 }
768 }
769 }
770
771 void
772 panfrost_emit_shader_meta(struct panfrost_batch *batch,
773 enum pipe_shader_type st,
774 struct mali_vertex_tiler_postfix *postfix)
775 {
776 struct panfrost_context *ctx = batch->ctx;
777 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
778
779 if (!ss) {
780 postfix->shader = 0;
781 return;
782 }
783
784 struct mali_shader_meta meta;
785
786 /* Add the shader BO to the batch. */
787 panfrost_batch_add_bo(batch, ss->bo,
788 PAN_BO_ACCESS_PRIVATE |
789 PAN_BO_ACCESS_READ |
790 panfrost_bo_access_for_stage(st));
791
792 mali_ptr shader_ptr;
793
794 if (st == PIPE_SHADER_FRAGMENT) {
795 struct panfrost_device *dev = pan_device(ctx->base.screen);
796 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
797 size_t desc_size = sizeof(meta);
798 void *rts = NULL;
799 struct panfrost_transfer xfer;
800 unsigned rt_size;
801
802 if (dev->quirks & MIDGARD_SFBD)
803 rt_size = 0;
804 else if (dev->quirks & IS_BIFROST)
805 rt_size = sizeof(struct bifrost_blend_rt);
806 else
807 rt_size = sizeof(struct midgard_blend_rt);
808
809 desc_size += rt_size * rt_count;
810
811 if (rt_size)
812 rts = rzalloc_size(ctx, rt_size * rt_count);
813
814 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
815
816 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
817 blend[c] = panfrost_get_blend_for_context(ctx, c);
818
819 panfrost_emit_frag_shader(ctx, &meta, blend);
820
821 if (!(dev->quirks & MIDGARD_SFBD))
822 panfrost_emit_blend(batch, rts, blend);
823 else
824 batch->draws |= PIPE_CLEAR_COLOR0;
825
826 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
827
828 memcpy(xfer.cpu, &meta, sizeof(meta));
829 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
830
831 if (rt_size)
832 ralloc_free(rts);
833
834 shader_ptr = xfer.gpu;
835 } else {
836 panfrost_emit_compute_shader(ctx, st, &meta);
837
838 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
839 sizeof(meta));
840 }
841
842 postfix->shader = shader_ptr;
843 }
844
845 void
846 panfrost_emit_viewport(struct panfrost_batch *batch,
847 struct mali_vertex_tiler_postfix *tiler_postfix)
848 {
849 struct panfrost_context *ctx = batch->ctx;
850 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
851 const struct pipe_scissor_state *ss = &ctx->scissor;
852 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
853 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
854
855 /* Derive min/max from translate/scale. Note since |x| >= 0 by
856 * definition, we have that -|x| <= |x| hence translate - |scale| <=
857 * translate + |scale|, so the ordering is correct here. */
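/* E.g. (numbers purely illustrative) a standard 1920x1080 viewport has
 * scale = (960, 540, ...) and translate = (960, 540, ...), giving
 * vp_minx = 0, vp_maxx = 1920, vp_miny = 0, vp_maxy = 1080. */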
858 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
859 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
860 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
861 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
862 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
863 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
864
865 /* Scissor to the intersection of the viewport and the scissor, clamped
866 * to the framebuffer */
867
868 unsigned minx = MIN2(fb->width, vp_minx);
869 unsigned maxx = MIN2(fb->width, vp_maxx);
870 unsigned miny = MIN2(fb->height, vp_miny);
871 unsigned maxy = MIN2(fb->height, vp_maxy);
872
873 if (ss && rast->scissor) {
874 minx = MAX2(ss->minx, minx);
875 miny = MAX2(ss->miny, miny);
876 maxx = MIN2(ss->maxx, maxx);
877 maxy = MIN2(ss->maxy, maxy);
878 }
879
880 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
881
882 pan_pack(T.cpu, VIEWPORT, cfg) {
883 cfg.scissor_minimum_x = minx;
884 cfg.scissor_minimum_y = miny;
885 cfg.scissor_maximum_x = maxx - 1;
886 cfg.scissor_maximum_y = maxy - 1;
887
888 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
889 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
890 }
891
892 tiler_postfix->viewport = T.gpu;
893 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
894 }
895
896 static mali_ptr
897 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
898 enum pipe_shader_type st,
899 struct panfrost_constant_buffer *buf,
900 unsigned index)
901 {
902 struct pipe_constant_buffer *cb = &buf->cb[index];
903 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
904
905 if (rsrc) {
906 panfrost_batch_add_bo(batch, rsrc->bo,
907 PAN_BO_ACCESS_SHARED |
908 PAN_BO_ACCESS_READ |
909 panfrost_bo_access_for_stage(st));
910
911 /* Alignment guaranteed by
912 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
913 return rsrc->bo->gpu + cb->buffer_offset;
914 } else if (cb->user_buffer) {
915 return panfrost_pool_upload_aligned(&batch->pool,
916 cb->user_buffer +
917 cb->buffer_offset,
918 cb->buffer_size, 16);
919 } else {
920 unreachable("No constant buffer");
921 }
922 }
923
924 struct sysval_uniform {
925 union {
926 float f[4];
927 int32_t i[4];
928 uint32_t u[4];
929 uint64_t du[2];
930 };
931 };
932
933 static void
934 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
935 struct sysval_uniform *uniform)
936 {
937 struct panfrost_context *ctx = batch->ctx;
938 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
939
940 uniform->f[0] = vp->scale[0];
941 uniform->f[1] = vp->scale[1];
942 uniform->f[2] = vp->scale[2];
943 }
944
945 static void
946 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
947 struct sysval_uniform *uniform)
948 {
949 struct panfrost_context *ctx = batch->ctx;
950 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
951
952 uniform->f[0] = vp->translate[0];
953 uniform->f[1] = vp->translate[1];
954 uniform->f[2] = vp->translate[2];
955 }
956
957 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
958 enum pipe_shader_type st,
959 unsigned int sysvalid,
960 struct sysval_uniform *uniform)
961 {
962 struct panfrost_context *ctx = batch->ctx;
963 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
964 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
965 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
966 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
967
968 assert(dim);
969 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
970
971 if (dim > 1)
972 uniform->i[1] = u_minify(tex->texture->height0,
973 tex->u.tex.first_level);
974
975 if (dim > 2)
976 uniform->i[2] = u_minify(tex->texture->depth0,
977 tex->u.tex.first_level);
978
979 if (is_array)
980 uniform->i[dim] = tex->texture->array_size;
981 }
982
983 static void
984 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
985 enum pipe_shader_type st,
986 unsigned ssbo_id,
987 struct sysval_uniform *uniform)
988 {
989 struct panfrost_context *ctx = batch->ctx;
990
991 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
992 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
993
994 /* Compute address */
995 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
996
997 panfrost_batch_add_bo(batch, bo,
998 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
999 panfrost_bo_access_for_stage(st));
1000
1001 /* Upload address and size as sysval */
1002 uniform->du[0] = bo->gpu + sb.buffer_offset;
1003 uniform->u[2] = sb.buffer_size;
1004 }
1005
1006 static void
1007 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1008 enum pipe_shader_type st,
1009 unsigned samp_idx,
1010 struct sysval_uniform *uniform)
1011 {
1012 struct panfrost_context *ctx = batch->ctx;
1013 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1014
1015 uniform->f[0] = sampl->min_lod;
1016 uniform->f[1] = sampl->max_lod;
1017 uniform->f[2] = sampl->lod_bias;
1018
1019 /* Even without any errata, Midgard represents "no mipmapping" as
1020 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1021 * panfrost_create_sampler_state which also explains our choice of
1022 * epsilon value (again to keep behaviour consistent) */
1023
1024 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1025 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1026 }
1027
1028 static void
1029 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1030 struct sysval_uniform *uniform)
1031 {
1032 struct panfrost_context *ctx = batch->ctx;
1033
1034 uniform->u[0] = ctx->compute_grid->grid[0];
1035 uniform->u[1] = ctx->compute_grid->grid[1];
1036 uniform->u[2] = ctx->compute_grid->grid[2];
1037 }
1038
1039 static void
1040 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1041 struct panfrost_shader_state *ss,
1042 enum pipe_shader_type st)
1043 {
1044 struct sysval_uniform *uniforms = (void *)buf;
1045
1046 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1047 int sysval = ss->sysval[i];
1048
1049 switch (PAN_SYSVAL_TYPE(sysval)) {
1050 case PAN_SYSVAL_VIEWPORT_SCALE:
1051 panfrost_upload_viewport_scale_sysval(batch,
1052 &uniforms[i]);
1053 break;
1054 case PAN_SYSVAL_VIEWPORT_OFFSET:
1055 panfrost_upload_viewport_offset_sysval(batch,
1056 &uniforms[i]);
1057 break;
1058 case PAN_SYSVAL_TEXTURE_SIZE:
1059 panfrost_upload_txs_sysval(batch, st,
1060 PAN_SYSVAL_ID(sysval),
1061 &uniforms[i]);
1062 break;
1063 case PAN_SYSVAL_SSBO:
1064 panfrost_upload_ssbo_sysval(batch, st,
1065 PAN_SYSVAL_ID(sysval),
1066 &uniforms[i]);
1067 break;
1068 case PAN_SYSVAL_NUM_WORK_GROUPS:
1069 panfrost_upload_num_work_groups_sysval(batch,
1070 &uniforms[i]);
1071 break;
1072 case PAN_SYSVAL_SAMPLER:
1073 panfrost_upload_sampler_sysval(batch, st,
1074 PAN_SYSVAL_ID(sysval),
1075 &uniforms[i]);
1076 break;
1077 default:
1078 assert(0);
1079 }
1080 }
1081 }
1082
1083 static const void *
1084 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1085 unsigned index)
1086 {
1087 struct pipe_constant_buffer *cb = &buf->cb[index];
1088 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1089
1090 if (rsrc)
1091 return rsrc->bo->cpu;
1092 else if (cb->user_buffer)
1093 return cb->user_buffer;
1094 else
1095 unreachable("No constant buffer");
1096 }
1097
1098 void
1099 panfrost_emit_const_buf(struct panfrost_batch *batch,
1100 enum pipe_shader_type stage,
1101 struct mali_vertex_tiler_postfix *postfix)
1102 {
1103 struct panfrost_context *ctx = batch->ctx;
1104 struct panfrost_shader_variants *all = ctx->shader[stage];
1105
1106 if (!all)
1107 return;
1108
1109 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1110
1111 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1112
1113 /* Uniforms are implicitly UBO #0 */
1114 bool has_uniforms = buf->enabled_mask & (1 << 0);
1115
1116 /* Allocate room for the sysval and the uniforms */
1117 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1118 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1119 size_t size = sys_size + uniform_size;
1120 struct panfrost_transfer transfer =
1121 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
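/* Layout of this combined allocation, as built below: one 16-byte (vec4)
 * slot per requested sysval, immediately followed by the raw contents of
 * gallium constant buffer 0; the whole range is then re-exposed to the
 * shader as UBO #0. */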
1122
1123 /* Upload sysvals requested by the shader */
1124 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1125
1126 /* Upload uniforms */
1127 if (has_uniforms && uniform_size) {
1128 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1129 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1130 }
1131
1132 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1133 * uploaded */
1134
1135 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1136 assert(ubo_count >= 1);
1137
1138 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1139 struct panfrost_transfer ubos =
1140 panfrost_pool_alloc_aligned(&batch->pool, sz,
1141 MALI_UNIFORM_BUFFER_LENGTH);
1142
1143 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1144
1145 /* Upload uniforms as a UBO */
1146
1147 if (size) {
1148 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1149 cfg.entries = DIV_ROUND_UP(size, 16);
1150 cfg.pointer = transfer.gpu;
1151 }
1152 } else {
1153 *ubo_ptr = 0;
1154 }
1155
1156 /* The rest are honest-to-goodness UBOs */
1157
1158 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1159 size_t usz = buf->cb[ubo].buffer_size;
1160 bool enabled = buf->enabled_mask & (1 << ubo);
1161 bool empty = usz == 0;
1162
1163 if (!enabled || empty) {
1164 ubo_ptr[ubo] = 0;
1165 continue;
1166 }
1167
1168 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1169 cfg.entries = DIV_ROUND_UP(usz, 16);
1170 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1171 stage, buf, ubo);
1172 }
1173 }
1174
1175 postfix->uniforms = transfer.gpu;
1176 postfix->uniform_buffers = ubos.gpu;
1177
1178 buf->dirty_mask = 0;
1179 }
1180
1181 void
1182 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1183 const struct pipe_grid_info *info,
1184 struct midgard_payload_vertex_tiler *vtp)
1185 {
1186 struct panfrost_context *ctx = batch->ctx;
1187 struct panfrost_device *dev = pan_device(ctx->base.screen);
1188 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1189 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1190 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1191 128));
1192
1193 unsigned log2_instances =
1194 util_logbase2_ceil(info->grid[0]) +
1195 util_logbase2_ceil(info->grid[1]) +
1196 util_logbase2_ceil(info->grid[2]);
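/* Sizing example (illustrative only): a 3x4x5 grid gives log2_instances =
 * 2 + 2 + 3 = 7, so room for 128 copies of the power-of-two-rounded
 * per-workgroup allocation is reserved on every core -- a conservative
 * upper bound rather than an exact fit. */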
1197
1198 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1199 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1200 shared_size,
1201 1);
1202
1203 struct mali_shared_memory shared = {
1204 .shared_memory = bo->gpu,
1205 .shared_workgroup_count = log2_instances,
1206 .shared_shift = util_logbase2(single_size) + 1
1207 };
1208
1209 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1210 sizeof(shared), 64);
1211 }
1212
1213 static mali_ptr
1214 panfrost_get_tex_desc(struct panfrost_batch *batch,
1215 enum pipe_shader_type st,
1216 struct panfrost_sampler_view *view)
1217 {
1218 if (!view)
1219 return (mali_ptr) 0;
1220
1221 struct pipe_sampler_view *pview = &view->base;
1222 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1223
1224 /* Add the BO to the job so it's retained until the job is done. */
1225
1226 panfrost_batch_add_bo(batch, rsrc->bo,
1227 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1228 panfrost_bo_access_for_stage(st));
1229
1230 panfrost_batch_add_bo(batch, view->bo,
1231 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1232 panfrost_bo_access_for_stage(st));
1233
1234 return view->bo->gpu;
1235 }
1236
1237 static void
1238 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1239 struct pipe_context *pctx)
1240 {
1241 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1242 if (view->texture_bo != rsrc->bo->gpu ||
1243 view->modifier != rsrc->modifier) {
1244 panfrost_bo_unreference(view->bo);
1245 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1246 }
1247 }
1248
1249 void
1250 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1251 enum pipe_shader_type stage,
1252 struct mali_vertex_tiler_postfix *postfix)
1253 {
1254 struct panfrost_context *ctx = batch->ctx;
1255 struct panfrost_device *device = pan_device(ctx->base.screen);
1256
1257 if (!ctx->sampler_view_count[stage])
1258 return;
1259
1260 if (device->quirks & IS_BIFROST) {
1261 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1262 MALI_BIFROST_TEXTURE_LENGTH *
1263 ctx->sampler_view_count[stage],
1264 MALI_BIFROST_TEXTURE_LENGTH);
1265
1266 struct mali_bifrost_texture_packed *out =
1267 (struct mali_bifrost_texture_packed *) T.cpu;
1268
1269 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1270 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1271 struct pipe_sampler_view *pview = &view->base;
1272 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1273
1274 panfrost_update_sampler_view(view, &ctx->base);
1275 out[i] = view->bifrost_descriptor;
1276
1277 /* Add the BOs to the job so they are retained until the job is done. */
1278
1279 panfrost_batch_add_bo(batch, rsrc->bo,
1280 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1281 panfrost_bo_access_for_stage(stage));
1282
1283 panfrost_batch_add_bo(batch, view->bo,
1284 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1285 panfrost_bo_access_for_stage(stage));
1286 }
1287
1288 postfix->textures = T.gpu;
1289 } else {
1290 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1291
1292 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1293 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1294
1295 panfrost_update_sampler_view(view, &ctx->base);
1296
1297 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1298 }
1299
1300 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1301 trampolines,
1302 sizeof(uint64_t) *
1303 ctx->sampler_view_count[stage],
1304 sizeof(uint64_t));
1305 }
1306 }
1307
1308 void
1309 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1310 enum pipe_shader_type stage,
1311 struct mali_vertex_tiler_postfix *postfix)
1312 {
1313 struct panfrost_context *ctx = batch->ctx;
1314
1315 if (!ctx->sampler_count[stage])
1316 return;
1317
1318 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1319 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1320
1321 size_t sz = desc_size * ctx->sampler_count[stage];
1322 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1323 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1324
1325 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1326 out[i] = ctx->samplers[stage][i]->hw;
1327
1328 postfix->sampler_descriptor = T.gpu;
1329 }
1330
1331 void
1332 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1333 struct mali_vertex_tiler_postfix *vertex_postfix)
1334 {
1335 struct panfrost_context *ctx = batch->ctx;
1336 struct panfrost_vertex_state *so = ctx->vertex;
1337 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1338
1339 unsigned instance_shift = vertex_postfix->instance_shift;
1340 unsigned instance_odd = vertex_postfix->instance_odd;
1341
1342 /* Worst case: everything is NPOT, which is only possible if instancing
1343 * is enabled. Otherwise a single record is guaranteed */
1344 bool could_npot = instance_shift || instance_odd;
1345
1346 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1347 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1348 (could_npot ? 2 : 1),
1349 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1350
1351 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1352 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1353 MALI_ATTRIBUTE_LENGTH);
1354
1355 struct mali_attribute_buffer_packed *bufs =
1356 (struct mali_attribute_buffer_packed *) S.cpu;
1357
1358 struct mali_attribute_packed *out =
1359 (struct mali_attribute_packed *) T.cpu;
1360
1361 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1362 unsigned k = 0;
1363
1364 for (unsigned i = 0; i < so->num_elements; ++i) {
1365 /* We map buffers 1:1 with the attributes, which
1366 * means duplicating some vertex buffers (who cares? aside from
1367 * maybe some caching implications but I somehow doubt that
1368 * matters) */
1369
1370 struct pipe_vertex_element *elem = &so->pipe[i];
1371 unsigned vbi = elem->vertex_buffer_index;
1372 attrib_to_buffer[i] = k;
1373
1374 if (!(ctx->vb_mask & (1 << vbi)))
1375 continue;
1376
1377 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1378 struct panfrost_resource *rsrc;
1379
1380 rsrc = pan_resource(buf->buffer.resource);
1381 if (!rsrc)
1382 continue;
1383
1384 /* Add a dependency of the batch on the vertex buffer */
1385 panfrost_batch_add_bo(batch, rsrc->bo,
1386 PAN_BO_ACCESS_SHARED |
1387 PAN_BO_ACCESS_READ |
1388 PAN_BO_ACCESS_VERTEX_TILER);
1389
1390 /* Mask off lower bits, see offset fixup below */
1391 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1392 mali_ptr addr = raw_addr & ~63;
1393
1394 /* Since we advanced the base pointer, we shrink the buffer
1395 * size, but add the offset we subtracted */
1396 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1397 - buf->buffer_offset;
1398
1399 /* When there is a divisor, the hardware-level divisor is
1400 * the product of the instance divisor and the padded count */
1401 unsigned divisor = elem->instance_divisor;
1402 unsigned hw_divisor = ctx->padded_count * divisor;
1403 unsigned stride = buf->stride;
1404
1405 /* If there's a divisor(=1) but no instancing, we want every
1406 * attribute to be the same */
1407
1408 if (divisor && ctx->instance_count == 1)
1409 stride = 0;
1410
1411 if (!divisor || ctx->instance_count <= 1) {
1412 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1413 if (ctx->instance_count > 1)
1414 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1415
1416 cfg.pointer = addr;
1417 cfg.stride = stride;
1418 cfg.size = size;
1419 cfg.divisor_r = instance_shift;
1420 cfg.divisor_p = instance_odd;
1421 }
1422 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1423 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1424 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1425 cfg.pointer = addr;
1426 cfg.stride = stride;
1427 cfg.size = size;
1428 cfg.divisor_r = __builtin_ctz(hw_divisor);
1429 }
1430
1431 } else {
1432 unsigned shift = 0, extra_flags = 0;
1433
1434 unsigned magic_divisor =
1435 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
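/* Presumably this implements division by the runtime divisor as a
 * multiply-by-fixed-point-reciprocal: panfrost_compute_magic_divisor()
 * returns the magic numerator plus a shift and a rounding correction
 * (extra_flags), and the continuation record below carries the magic value
 * together with the original divisor. A sketch of the idea only, not the
 * exact hardware formula. */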
1436
1437 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1438 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1439 cfg.pointer = addr;
1440 cfg.stride = stride;
1441 cfg.size = size;
1442
1443 cfg.divisor_r = shift;
1444 cfg.divisor_e = extra_flags;
1445 }
1446
1447 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1448 cfg.divisor_numerator = magic_divisor;
1449 cfg.divisor = divisor;
1450 }
1451
1452 ++k;
1453 }
1454
1455 ++k;
1456 }
1457
1458 /* Add special gl_VertexID/gl_InstanceID buffers */
1459
1460 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1461 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1462
1463 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1464 cfg.buffer_index = k++;
1465 cfg.format = so->formats[PAN_VERTEX_ID];
1466 }
1467
1468 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1469
1470 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1471 cfg.buffer_index = k++;
1472 cfg.format = so->formats[PAN_INSTANCE_ID];
1473 }
1474 }
1475
1476 /* Attribute addresses require 64-byte alignment, so let:
1477 *
1478 * base' = base & ~63 = base - (base & 63)
1479 * offset' = offset + (base & 63)
1480 *
1481 * Since base' + offset' = base + offset, these are equivalent
1482 * addressing modes and now base is 64 aligned.
1483 */
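/* Worked example (addresses made up): base = 0x10043 has (base & 63) = 3, so
 * base' = 0x10040 is 64-byte aligned and the element's src_offset below grows
 * by 3; base' + offset' still names the same byte as base + offset. */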
1484
1485 unsigned start = vertex_postfix->offset_start;
1486
1487 for (unsigned i = 0; i < so->num_elements; ++i) {
1488 unsigned vbi = so->pipe[i].vertex_buffer_index;
1489 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1490
1491 /* Adjust by the masked off bits of the offset. Make sure we
1492 * read src_offset from so->hw (which is not GPU visible)
1493 * rather than target (which is) due to caching effects */
1494
1495 unsigned src_offset = so->pipe[i].src_offset;
1496
1497 /* BOs aligned to 4k so guaranteed aligned to 64 */
1498 src_offset += (buf->buffer_offset & 63);
1499
1500 /* Also, somewhat obscurely per-instance data needs to be
1501 * offset in response to a delayed start in an indexed draw */
1502
1503 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1504 src_offset -= buf->stride * start;
1505
1506 pan_pack(out + i, ATTRIBUTE, cfg) {
1507 cfg.buffer_index = attrib_to_buffer[i];
1508 cfg.format = so->formats[i];
1509 cfg.offset = src_offset;
1510 }
1511 }
1512
1513 vertex_postfix->attributes = S.gpu;
1514 vertex_postfix->attribute_meta = T.gpu;
1515 }
1516
1517 static mali_ptr
1518 panfrost_emit_varyings(struct panfrost_batch *batch,
1519 struct mali_attribute_buffer_packed *slot,
1520 unsigned stride, unsigned count)
1521 {
1522 unsigned size = stride * count;
1523 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1524
1525 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1526 cfg.stride = stride;
1527 cfg.size = size;
1528 cfg.pointer = ptr;
1529 }
1530
1531 return ptr;
1532 }
1533
1534 static unsigned
1535 panfrost_streamout_offset(unsigned stride, unsigned offset,
1536 struct pipe_stream_output_target *target)
1537 {
1538 return (target->buffer_offset + (offset * stride * 4)) & 63;
1539 }
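/* This is the same 64-byte alignment trick used for vertex attribute buffers
 * above: panfrost_emit_streamout() rounds the pointer down to 64 bytes and
 * grows the size by the remainder, while the low bits returned here are added
 * back into the varying record's offset (see pan_emit_vary_xfb), so no data
 * is skipped. */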
1540
1541 static void
1542 panfrost_emit_streamout(struct panfrost_batch *batch,
1543 struct mali_attribute_buffer_packed *slot,
1544 unsigned stride_words, unsigned offset, unsigned count,
1545 struct pipe_stream_output_target *target)
1546 {
1547 unsigned stride = stride_words * 4;
1548 unsigned max_size = target->buffer_size;
1549 unsigned expected_size = stride * count;
1550
1551 /* Grab the BO and bind it to the batch */
1552 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1553
1554 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1555 * the perspective of the TILER and FRAGMENT.
1556 */
1557 panfrost_batch_add_bo(batch, bo,
1558 PAN_BO_ACCESS_SHARED |
1559 PAN_BO_ACCESS_RW |
1560 PAN_BO_ACCESS_VERTEX_TILER |
1561 PAN_BO_ACCESS_FRAGMENT);
1562
1563 /* We will have an offset applied to get alignment */
1564 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1565
1566 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1567 cfg.pointer = (addr & ~63);
1568 cfg.stride = stride;
1569 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1570 }
1571 }
1572
1573 static bool
1574 has_point_coord(unsigned mask, gl_varying_slot loc)
1575 {
1576 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1577 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1578 else if (loc == VARYING_SLOT_PNTC)
1579 return (mask & (1 << 8));
1580 else
1581 return false;
1582 }
1583
1584 /* Helpers for manipulating stream out information so we can pack varyings
1585 * accordingly. Compute the src_offset for a given captured varying */
1586
1587 static struct pipe_stream_output *
1588 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1589 {
1590 for (unsigned i = 0; i < info->num_outputs; ++i) {
1591 if (info->output[i].register_index == loc)
1592 return &info->output[i];
1593 }
1594
1595 unreachable("Varying not captured");
1596 }
1597
1598 static unsigned
1599 pan_varying_size(enum mali_format fmt)
1600 {
1601 unsigned type = MALI_EXTRACT_TYPE(fmt);
1602 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1603 unsigned bits = MALI_EXTRACT_BITS(fmt);
1604 unsigned bpc = 0;
1605
1606 if (bits == MALI_CHANNEL_FLOAT) {
1607 /* No doubles */
1608 bool fp16 = (type == MALI_FORMAT_SINT);
1609 assert(fp16 || (type == MALI_FORMAT_UNORM));
1610
1611 bpc = fp16 ? 2 : 4;
1612 } else {
1613 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1614
1615 /* See the enums */
1616 bits = 1 << bits;
1617 assert(bits >= 8);
1618 bpc = bits / 8;
1619 }
1620
1621 return bpc * chan;
1622 }
1623
1624 /* Indices for named (non-XFB) varyings that are present. These are packed
1625 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1626 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1627 * of a given special field given a shift S by:
1628 *
1629 * idx = popcount(P & ((1 << S) - 1))
1630 *
1631 * That is... look at all of the varyings that come earlier and count them; that
1632 * count is this varying's buffer index. Likewise, the total number of special
1633 * buffers required is simply popcount(P)
1634 */
1635
1636 enum pan_special_varying {
1637 PAN_VARY_GENERAL = 0,
1638 PAN_VARY_POSITION = 1,
1639 PAN_VARY_PSIZ = 2,
1640 PAN_VARY_PNTCOORD = 3,
1641 PAN_VARY_FACE = 4,
1642 PAN_VARY_FRAGCOORD = 5,
1643
1644 /* Keep last */
1645 PAN_VARY_MAX,
1646 };
1647
1648 /* Given a varying, figure out which index it corresponds to */
1649
1650 static inline unsigned
1651 pan_varying_index(unsigned present, enum pan_special_varying v)
1652 {
1653 unsigned mask = (1 << v) - 1;
1654 return util_bitcount(present & mask);
1655 }
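/* For instance, with present = (1 << PAN_VARY_GENERAL) |
 * (1 << PAN_VARY_POSITION) | (1 << PAN_VARY_PSIZ) = 0b111,
 * pan_varying_index(present, PAN_VARY_PSIZ) = popcount(0b111 & 0b011) = 2:
 * the point size buffer is the third record, after general and position. */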
1656
1657 /* Get the base offset for XFB buffers, which by convention come after
1658 * everything else. Wrapper function for semantic reasons; by construction this
1659 * is just popcount. */
1660
1661 static inline unsigned
1662 pan_xfb_base(unsigned present)
1663 {
1664 return util_bitcount(present);
1665 }
1666
1667 /* Computes the present mask for varyings so we can start emitting varying records */
1668
1669 static inline unsigned
1670 pan_varying_present(
1671 struct panfrost_shader_state *vs,
1672 struct panfrost_shader_state *fs,
1673 unsigned quirks)
1674 {
1675 /* At the moment we always emit general and position buffers. Not
1676 * strictly necessary but usually harmless */
1677
1678 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1679
1680 /* Enable special buffers by the shader info */
1681
1682 if (vs->writes_point_size)
1683 present |= (1 << PAN_VARY_PSIZ);
1684
1685 if (fs->reads_point_coord)
1686 present |= (1 << PAN_VARY_PNTCOORD);
1687
1688 if (fs->reads_face)
1689 present |= (1 << PAN_VARY_FACE);
1690
1691 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1692 present |= (1 << PAN_VARY_FRAGCOORD);
1693
1694 /* Also, if we have a point sprite, we need a point coord buffer */
1695
1696 for (unsigned i = 0; i < fs->varying_count; i++) {
1697 gl_varying_slot loc = fs->varyings_loc[i];
1698
1699 if (has_point_coord(fs->point_sprite_mask, loc))
1700 present |= (1 << PAN_VARY_PNTCOORD);
1701 }
1702
1703 return present;
1704 }
1705
1706 /* Emitters for varying records */
1707
1708 static void
1709 pan_emit_vary(struct mali_attribute_packed *out,
1710 unsigned present, enum pan_special_varying buf,
1711 unsigned quirks, enum mali_format format,
1712 unsigned offset)
1713 {
1714 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1715 unsigned swizzle = quirks & HAS_SWIZZLES ?
1716 panfrost_get_default_swizzle(nr_channels) :
1717 panfrost_bifrost_swizzle(nr_channels);
1718
1719 pan_pack(out, ATTRIBUTE, cfg) {
1720 cfg.buffer_index = pan_varying_index(present, buf);
1721 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1722 cfg.format = (format << 12) | swizzle;
1723 cfg.offset = offset;
1724 }
1725 }
1726
1727 /* General varying that is unused */
1728
1729 static void
1730 pan_emit_vary_only(struct mali_attribute_packed *out,
1731 unsigned present, unsigned quirks)
1732 {
1733 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1734 }
1735
1736 /* Special records */
1737
1738 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1739 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1740 [PAN_VARY_PSIZ] = MALI_R16F,
1741 [PAN_VARY_PNTCOORD] = MALI_R16F,
1742 [PAN_VARY_FACE] = MALI_R32I,
1743 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1744 };
1745
1746 static void
1747 pan_emit_vary_special(struct mali_attribute_packed *out,
1748 unsigned present, enum pan_special_varying buf,
1749 unsigned quirks)
1750 {
1751 assert(buf < PAN_VARY_MAX);
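        /* Special varyings always live at offset 0 of their dedicated buffer */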
1752 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1753 }
1754
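/* XFB captures are always recorded at 32 bits per channel, with the channel
 * count taken from the stream output info, regardless of the format used for
 * shading (the "highp override" applied at the call site below) */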
1755 static enum mali_format
1756 pan_xfb_format(enum mali_format format, unsigned nr)
1757 {
1758 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1759 return MALI_R32F | MALI_NR_CHANNELS(nr);
1760 else
1761 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1762 }
1763
1764 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1765 * a bitfield) 32-bit, smaller than a 64-bit pointer, so we may as well pass
1766 * it by value. */
1767
1768 static void
1769 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1770 unsigned present,
1771 unsigned max_xfb,
1772 unsigned *streamout_offsets,
1773 unsigned quirks,
1774 enum mali_format format,
1775 struct pipe_stream_output o)
1776 {
1777 unsigned swizzle = quirks & HAS_SWIZZLES ?
1778 panfrost_get_default_swizzle(o.num_components) :
1779 panfrost_bifrost_swizzle(o.num_components);
1780
1781 pan_pack(out, ATTRIBUTE, cfg) {
1782 /* XFB buffers come after everything else */
1783 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1784 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1785
1786 /* Override number of channels and precision to highp */
1787 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1788
1789 /* Combine the destination offset with the streamout buffer offset */
1790 cfg.offset = (o.dst_offset * 4) /* dwords */
1791 + streamout_offsets[o.output_buffer];
1792 }
1793 }
1794
1795 /* Determine if we should capture a varying for XFB. This requires actually
1796 * having a buffer for it. If we don't capture it, we'll fall back to a general
1797 * varying path (linked or unlinked, possibly discarding the write) */
1798
1799 static bool
1800 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1801 unsigned loc, unsigned max_xfb)
1802 {
1803 if (!(xfb->so_mask & (1ll << loc)))
1804 return false;
1805
1806 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1807 return o->output_buffer < max_xfb;
1808 }
1809
1810 static void
1811 pan_emit_general_varying(struct mali_attribute_packed *out,
1812 struct panfrost_shader_state *other,
1813 struct panfrost_shader_state *xfb,
1814 gl_varying_slot loc,
1815 enum mali_format format,
1816 unsigned present,
1817 unsigned quirks,
1818 unsigned *gen_offsets,
1819 enum mali_format *gen_formats,
1820 unsigned *gen_stride,
1821 unsigned idx,
1822 bool should_alloc)
1823 {
1824 /* Check if we're linked */
1825 signed other_idx = -1;
1826
1827 for (unsigned j = 0; j < other->varying_count; ++j) {
1828 if (other->varyings_loc[j] == loc) {
1829 other_idx = j;
1830 break;
1831 }
1832 }
1833
1834 if (other_idx < 0) {
1835 pan_emit_vary_only(out, present, quirks);
1836 return;
1837 }
1838
1839 unsigned offset = gen_offsets[other_idx];
1840
1841 if (should_alloc) {
1842 /* We're linked, so allocate space via a watermark allocation */
1843 enum mali_format alt = other->varyings[other_idx];
1844
1845 /* Do interpolation at minimum precision */
1846 unsigned size_main = pan_varying_size(format);
1847 unsigned size_alt = pan_varying_size(alt);
1848 unsigned size = MIN2(size_main, size_alt);
1849
1850 /* If a varying is marked for XFB but not actually captured, we
1851 * should match the format to the format that would otherwise
1852 * be used for XFB, since dEQP checks for invariance here. It's
1853 * unclear if this is required by the spec. */
1854
1855 if (xfb->so_mask & (1ull << loc)) {
1856 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1857 format = pan_xfb_format(format, o->num_components);
1858 size = pan_varying_size(format);
1859 } else if (size == size_alt) {
1860 format = alt;
1861 }
1862
1863 gen_offsets[idx] = *gen_stride;
1864 gen_formats[other_idx] = format;
1865 offset = *gen_stride;
1866 *gen_stride += size;
1867 }
1868
1869 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1870 }
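/* Note the indexing above: gen_offsets[] is written at the allocating
 * (vertex) stage's slot index, while gen_formats[] is written at the
 * consuming stage's slot index ("other", i.e. the fragment shader during the
 * allocating pass). The later fragment pass reads gen_offsets[other_idx] here
 * and gen_formats[idx] in panfrost_emit_varying(), so both stages end up
 * agreeing on the linked offset and format. */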
1871
1872 /* Higher-level wrapper around all of the above, classifying a varying into one
1873 * of the above types */
1874
1875 static void
1876 panfrost_emit_varying(
1877 struct mali_attribute_packed *out,
1878 struct panfrost_shader_state *stage,
1879 struct panfrost_shader_state *other,
1880 struct panfrost_shader_state *xfb,
1881 unsigned present,
1882 unsigned max_xfb,
1883 unsigned *streamout_offsets,
1884 unsigned quirks,
1885 unsigned *gen_offsets,
1886 enum mali_format *gen_formats,
1887 unsigned *gen_stride,
1888 unsigned idx,
1889 bool should_alloc,
1890 bool is_fragment)
1891 {
1892 gl_varying_slot loc = stage->varyings_loc[idx];
1893 enum mali_format format = stage->varyings[idx];
1894
1895 /* Override format to match linkage */
1896 if (!should_alloc && gen_formats[idx])
1897 format = gen_formats[idx];
1898
1899 if (has_point_coord(stage->point_sprite_mask, loc)) {
1900 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1901 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1902 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1903 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1904 } else if (loc == VARYING_SLOT_POS) {
1905 if (is_fragment)
1906 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1907 else
1908 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1909 } else if (loc == VARYING_SLOT_PSIZ) {
1910 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1911 } else if (loc == VARYING_SLOT_PNTC) {
1912 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1913 } else if (loc == VARYING_SLOT_FACE) {
1914 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1915 } else {
1916 pan_emit_general_varying(out, other, xfb, loc, format, present,
1917 quirks, gen_offsets, gen_formats, gen_stride,
1918 idx, should_alloc);
1919 }
1920 }
1921
1922 static void
1923 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1924 unsigned present,
1925 enum pan_special_varying v,
1926 unsigned special)
1927 {
1928 if (present & (1 << v)) {
1929 unsigned idx = pan_varying_index(present, v);
1930
1931 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1932 cfg.special = special;
1933 cfg.type = 0;
1934 }
1935 }
1936 }
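/* No pointer or size is written for a special input; the record just selects
 * which special value (point coordinate, front-facing, fragment coordinate)
 * feeds the varying at this buffer index. */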
1937
1938 void
1939 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1940 unsigned vertex_count,
1941 struct mali_vertex_tiler_postfix *vertex_postfix,
1942 struct mali_vertex_tiler_postfix *tiler_postfix,
1943 union midgard_primitive_size *primitive_size)
1944 {
1945 /* Load the shaders */
1946 struct panfrost_context *ctx = batch->ctx;
1947 struct panfrost_device *dev = pan_device(ctx->base.screen);
1948 struct panfrost_shader_state *vs, *fs;
1949 size_t vs_size, fs_size;
1950
1951 /* Allocate the varying descriptor */
1952
1953 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1954 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1955 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1956 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1957
1958 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1959 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1960
1961 struct pipe_stream_output_info *so = &vs->stream_output;
1962 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1963
1964 /* Check if this varying is linked by us. This is the case for
1965 * general-purpose, non-captured varyings. If it is, link it. If it's
1966 * not, use the provided stream out information to determine the
1967 * offset, since it was already linked for us. */
1968
1969 unsigned gen_offsets[32];
1970 enum mali_format gen_formats[32];
1971 memset(gen_offsets, 0, sizeof(gen_offsets));
1972 memset(gen_formats, 0, sizeof(gen_formats));
1973
1974 unsigned gen_stride = 0;
1975 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1976 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1977
1978 unsigned streamout_offsets[32];
1979
1980 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1981 streamout_offsets[i] = panfrost_streamout_offset(
1982 so->stride[i],
1983 ctx->streamout.offsets[i],
1984 ctx->streamout.targets[i]);
1985 }
1986
1987 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1988 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1989
1990 for (unsigned i = 0; i < vs->varying_count; i++) {
1991 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1992 ctx->streamout.num_targets, streamout_offsets,
1993 dev->quirks,
1994 gen_offsets, gen_formats, &gen_stride, i, true, false);
1995 }
1996
1997 for (unsigned i = 0; i < fs->varying_count; i++) {
1998 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1999 ctx->streamout.num_targets, streamout_offsets,
2000 dev->quirks,
2001 gen_offsets, gen_formats, &gen_stride, i, false, true);
2002 }
2003
2004 unsigned xfb_base = pan_xfb_base(present);
2005 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
2006 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
2007 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
2008 struct mali_attribute_buffer_packed *varyings =
2009 (struct mali_attribute_buffer_packed *) T.cpu;
2010
2011 /* Emit the stream out buffers */
2012
2013 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2014 ctx->vertex_count);
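/* (u_stream_outputs_for_vertices converts the draw's vertex count into the
 * number of vertices stream output will record, with strips and fans
 * decomposed into independent primitives) */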
2015
2016 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2017 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2018 so->stride[i],
2019 ctx->streamout.offsets[i],
2020 out_count,
2021 ctx->streamout.targets[i]);
2022 }
2023
2024 panfrost_emit_varyings(batch,
2025 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2026 gen_stride, vertex_count);
2027
2028 /* fp32 vec4 gl_Position */
2029 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2030 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2031 sizeof(float) * 4, vertex_count);
2032
2033 if (present & (1 << PAN_VARY_PSIZ)) {
2034 primitive_size->pointer = panfrost_emit_varyings(batch,
2035 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2036 2, vertex_count);
2037 }
2038
2039 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2040 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2041 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2042
2043 vertex_postfix->varyings = T.gpu;
2044 tiler_postfix->varyings = T.gpu;
2045
2046 vertex_postfix->varying_meta = trans.gpu;
2047 tiler_postfix->varying_meta = trans.gpu + vs_size;
2048 }
2049
2050 void
2051 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2052 struct mali_vertex_tiler_prefix *vertex_prefix,
2053 struct mali_vertex_tiler_postfix *vertex_postfix,
2054 struct mali_vertex_tiler_prefix *tiler_prefix,
2055 struct mali_vertex_tiler_postfix *tiler_postfix,
2056 union midgard_primitive_size *primitive_size)
2057 {
2058 struct panfrost_context *ctx = batch->ctx;
2059 struct panfrost_device *device = pan_device(ctx->base.screen);
2060 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2061 struct bifrost_payload_vertex bifrost_vertex = {0,};
2062 struct bifrost_payload_tiler bifrost_tiler = {0,};
2063 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2064 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2065 void *vp, *tp;
2066 size_t vp_size, tp_size;
2067
2068 if (device->quirks & IS_BIFROST) {
2069 bifrost_vertex.prefix = *vertex_prefix;
2070 bifrost_vertex.postfix = *vertex_postfix;
2071 vp = &bifrost_vertex;
2072 vp_size = sizeof(bifrost_vertex);
2073
2074 bifrost_tiler.prefix = *tiler_prefix;
2075 bifrost_tiler.tiler.primitive_size = *primitive_size;
2076 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2077 bifrost_tiler.postfix = *tiler_postfix;
2078 tp = &bifrost_tiler;
2079 tp_size = sizeof(bifrost_tiler);
2080 } else {
2081 midgard_vertex.prefix = *vertex_prefix;
2082 midgard_vertex.postfix = *vertex_postfix;
2083 vp = &midgard_vertex;
2084 vp_size = sizeof(midgard_vertex);
2085
2086 midgard_tiler.prefix = *tiler_prefix;
2087 midgard_tiler.postfix = *tiler_postfix;
2088 midgard_tiler.primitive_size = *primitive_size;
2089 tp = &midgard_tiler;
2090 tp_size = sizeof(midgard_tiler);
2091 }
2092
2093 if (wallpapering) {
2094 /* Inject in reverse order, with "predicted" job indices.
2095 * THIS IS A HACK XXX */
2096 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2097 batch->scoreboard.job_index + 2, tp, tp_size, true);
2098 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2099 vp, vp_size, true);
2100 return;
2101 }
2102
2103 /* If rasterizer discard is enabled, only submit the vertex job */
2104
2105 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2106 vp, vp_size, false);
2107
2108 if (ctx->rasterizer->base.rasterizer_discard)
2109 return;
2110
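/* The tiler job depends on the vertex job: the index returned above is passed
 * as its dependency, so tiling is scoreboarded to run after vertex shading. */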
2111 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2112 false);
2113 }
2114
2115 /* TODO: stop hardcoding this */
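/* 48 (x, y) pairs, uploaded as 96 uint16_t values; (128, 128) presumably marks
 * the pixel centre in 1/256-pixel units. */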
2116 mali_ptr
2117 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2118 {
2119 uint16_t locations[] = {
2120 128, 128,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 0, 256,
2136 0, 256,
2137 0, 256,
2138 0, 256,
2139 0, 256,
2140 0, 256,
2141 0, 256,
2142 0, 256,
2143 0, 256,
2144 0, 256,
2145 0, 256,
2146 0, 256,
2147 0, 256,
2148 0, 256,
2149 0, 256,
2150 0, 256,
2151 0, 256,
2152 128, 128,
2153 0, 0,
2154 0, 0,
2155 0, 0,
2156 0, 0,
2157 0, 0,
2158 0, 0,
2159 0, 0,
2160 0, 0,
2161 0, 0,
2162 0, 0,
2163 0, 0,
2164 0, 0,
2165 0, 0,
2166 0, 0,
2167 0, 0,
2168 };
2169
2170 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2171 }