panfrost: Prepack fragment properties/preload
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it may last longer. Also gets
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These bits don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
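/* A sketch of the encoding used below (inferred from the shift/odd
 * split): the padded count factors as (2k + 1) << shift, i.e. an odd
 * number times a power of two. For example, a padded count of 12 gives
 * shift = ctz(12) = 2 and k = 12 >> 3 = 1, since 12 = (2*1 + 1) << 2. */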
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 memcpy(&meta->shader, &ss->shader, sizeof(ss->shader));
319 memcpy(&meta->midgard_props, &ss->properties, sizeof(ss->properties));
320
321 if (dev->quirks & IS_BIFROST)
322 memcpy(&meta->bifrost_preload, &ss->preload, sizeof(ss->preload));
323 }
324
325 static unsigned
326 translate_tex_wrap(enum pipe_tex_wrap w)
327 {
328 switch (w) {
329 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
330 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
331 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
332 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
333 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
334 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
335 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
336 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
337 default: unreachable("Invalid wrap");
338 }
339 }
340
341 /* The hardware compares in the wrong order, so we have to flip before
342 * encoding. Yes, really. */
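/* Illustrative example, assuming panfrost_flip_compare_func swaps the
 * operand order: PIPE_FUNC_LEQUAL would be encoded as MALI_FUNC_GEQUAL
 * and LESS as GREATER, while EQUAL, NOTEQUAL, ALWAYS and NEVER are
 * symmetric and unchanged. */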
343
344 static enum mali_func
345 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
346 {
347 if (!cso->compare_mode)
348 return MALI_FUNC_NEVER;
349
350 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
351 return panfrost_flip_compare_func(f);
352 }
353
354 static enum mali_mipmap_mode
355 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
356 {
357 switch (f) {
358 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
359 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
360 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
361 default: unreachable("Invalid");
362 }
363 }
364
365 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
366 struct mali_midgard_sampler_packed *hw)
367 {
368 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
369 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
370 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
371 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
372 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
373 cfg.normalized_coordinates = cso->normalized_coords;
374
375 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
376
377 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
378
379 /* If necessary, we disable mipmapping in the sampler descriptor by
380 * clamping the LOD as tight as possible (from 0 to epsilon,
381 * essentially -- remember these are fixed point numbers, so
382 * epsilon=1/256) */
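/* Sketch of the arithmetic, assuming FIXED_16 yields 8.8 fixed point:
 * minimum_lod + 1 is one ULP (= 1/256 of a level) above the minimum,
 * so the clamp range collapses to [min_lod, min_lod + 1/256] and
 * mipmapping is effectively disabled. */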
383
384 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
385 cfg.minimum_lod + 1 :
386 FIXED_16(cso->max_lod, false);
387
388 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
389 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
390 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
391
392 cfg.compare_function = panfrost_sampler_compare_func(cso);
393 cfg.seamless_cube_map = cso->seamless_cube_map;
394
395 cfg.border_color_r = cso->border_color.f[0];
396 cfg.border_color_g = cso->border_color.f[1];
397 cfg.border_color_b = cso->border_color.f[2];
398 cfg.border_color_a = cso->border_color.f[3];
399 }
400 }
401
402 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
403 struct mali_bifrost_sampler_packed *hw)
404 {
405 pan_pack(hw, BIFROST_SAMPLER, cfg) {
406 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
407 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
408 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
409 cfg.normalized_coordinates = cso->normalized_coords;
410
411 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
412 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
413 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
414
415 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
416 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
417 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
418
419 cfg.compare_function = panfrost_sampler_compare_func(cso);
420 cfg.seamless_cube_map = cso->seamless_cube_map;
421 }
422 }
423
424 static bool
425 panfrost_fs_required(
426 struct panfrost_shader_state *fs,
427 struct panfrost_blend_final *blend,
428 unsigned rt_count)
429 {
430 /* If we generally have side effects */
431 if (fs->fs_sidefx)
432 return true;
433
434 /* If colour is written we need to execute */
435 for (unsigned i = 0; i < rt_count; ++i) {
436 if (!blend[i].no_colour)
437 return true;
438 }
439
440 /* If depth is written and not implied we need to execute.
441 * TODO: Predicate on Z/S writes being enabled */
442 return (fs->writes_depth || fs->writes_stencil);
443 }
444
445 static void
446 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
447 struct panfrost_blend_final *blend)
448 {
449 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
450 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
451 unsigned rt_count = batch->key.nr_cbufs;
452
453 struct bifrost_blend_rt *brts = rts;
454 struct midgard_blend_rt *mrts = rts;
455
456 /* Disable blending for depth-only on Bifrost */
457
458 if (rt_count == 0 && dev->quirks & IS_BIFROST)
459 brts[0].unk2 = 0x3;
460
461 for (unsigned i = 0; i < rt_count; ++i) {
462 unsigned flags = 0;
463
464 pan_pack(&flags, BLEND_FLAGS, cfg) {
465 if (blend[i].no_colour) {
466 cfg.enable = false;
467 break;
468 }
469
470 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
471
472 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
473 cfg.load_destination = blend[i].load_dest;
474 cfg.dither_disable = !batch->ctx->blend->base.dither;
475
476 if (!(dev->quirks & IS_BIFROST))
477 cfg.midgard_blend_shader = blend[i].is_shader;
478 }
479
480 if (dev->quirks & IS_BIFROST) {
481 brts[i].flags = flags;
482
483 if (blend[i].is_shader) {
484 /* The blend shader's address needs to share
485 * the same top 32 bits as the fragment shader's.
486 * TODO: Ensure that's always the case.
487 */
488 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
489 (fs->bo->gpu & (0xffffffffull << 32)));
490 brts[i].shader = blend[i].shader.gpu;
491 brts[i].unk2 = 0x0;
492 } else {
493 enum pipe_format format = batch->key.cbufs[i]->format;
494 const struct util_format_description *format_desc;
495 format_desc = util_format_description(format);
496
497 brts[i].equation = blend[i].equation.equation;
498
499 /* TODO: this is a bit more complicated */
500 brts[i].constant = blend[i].equation.constant;
501
502 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
503
504 /* 0x19 disables blending and forces REPLACE
505 * mode (equivalent to rgb_mode = alpha_mode =
506 * x122, colour mask = 0xF). 0x1a allows
507 * blending. */
508 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
509
510 brts[i].shader_type = fs->blend_types[i];
511 }
512 } else {
513 memcpy(&mrts[i].flags, &flags, sizeof(flags));
514
515 if (blend[i].is_shader) {
516 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
517 } else {
518 mrts[i].blend.equation = blend[i].equation.equation;
519 mrts[i].blend.constant = blend[i].equation.constant;
520 }
521 }
522 }
523 }
524
525 static struct mali_shader_packed
526 panfrost_pack_shaderless(bool midgard)
527 {
528 struct mali_shader_packed pack;
529
530 pan_pack(&pack, SHADER, cfg) {
531 cfg.shader = midgard ? 0x1 : 0x0;
532 }
533
534 return pack;
535 }
536
537 static void
538 panfrost_emit_frag_shader(struct panfrost_context *ctx,
539 struct mali_shader_meta *fragmeta,
540 struct panfrost_blend_final *blend)
541 {
542 const struct panfrost_device *dev = pan_device(ctx->base.screen);
543 struct panfrost_shader_state *fs;
544
545 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
546
547 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
548 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
549 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
550
551 memset(fragmeta, 0, sizeof(*fragmeta));
552 memcpy(&fragmeta->shader, &fs->shader, sizeof(fs->shader));
553
554 if (dev->quirks & IS_BIFROST) {
555 struct mali_bifrost_properties_packed prop;
556
557 bool no_blend = true;
558
559 for (unsigned i = 0; i < rt_count; ++i)
560 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
561
562 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
563 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
564 }
565
566 /* Combine with prepacked properties */
567 prop.opaque[0] |= fs->properties.opaque[0];
568
569 memcpy(&fragmeta->bifrost_props, &prop, sizeof(prop));
570 memcpy(&fragmeta->bifrost_preload, &fs->preload, sizeof(fs->preload));
571 } else {
572 struct mali_midgard_properties_packed prop;
573
574 /* Reasons to disable early-Z from a shader perspective */
575 bool late_z = fs->can_discard || fs->writes_global ||
576 fs->writes_depth || fs->writes_stencil;
577
578 /* Reasons to disable early-Z from a CSO perspective */
579 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
580
581 /* If either depth or stencil is enabled, discard matters */
582 bool zs_enabled =
583 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
584 zsa->base.stencil[0].enabled;
585
586 bool has_blend_shader = false;
587
588 for (unsigned c = 0; c < rt_count; ++c)
589 has_blend_shader |= blend[c].is_shader;
590
591 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
592 /* TODO: Reduce this limit? */
593 if (has_blend_shader)
594 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
595 else
596 cfg.work_register_count = fs->work_reg_count;
597
598 cfg.early_z_enable = !(late_z || alpha_to_coverage);
599 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
600 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
601 }
602
603 /* Combine with prepacked properties */
604 prop.opaque[0] |= fs->properties.opaque[0];
605 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
606 }
607
608 bool msaa = rast->multisample;
609 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
610
611 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
612 fragmeta->unknown2_4 = 0x4e0;
613
614 /* TODO: Sample size */
615 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
616 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
617
618 /* EXT_shader_framebuffer_fetch requires the shader to be run
619 * per-sample when outputs are read. */
620 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
621 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
622
623 fragmeta->depth_units = rast->offset_units * 2.0f;
624 fragmeta->depth_factor = rast->offset_scale;
625
626 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
627
628 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
629 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
630
631 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
632 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
633
634 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
635 zsa->base.stencil[0].enabled);
636
637 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
638 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
639
640 /* Bottom bits for stencil ref, exactly one word */
641 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
642
643 /* If back-stencil is not enabled, use the front values */
644
645 if (zsa->base.stencil[1].enabled)
646 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
647 else
648 fragmeta->stencil_back = fragmeta->stencil_front;
649
650 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
651 zsa->base.depth.writemask);
652
653 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
654 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
655 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
656
657 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
658 ctx->blend->base.alpha_to_coverage);
659
660 /* Disable shader execution if we can */
661 if (!panfrost_fs_required(fs, blend, rt_count)) {
662 struct mali_shader_packed shader =
663 panfrost_pack_shaderless(!(dev->quirks & IS_BIFROST));
664
665 memcpy(&fragmeta->shader, &shader, sizeof(shader));
666
667 struct mali_midgard_properties_packed prop;
668
669 if (dev->quirks & IS_BIFROST) {
670 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
671 cfg.unknown = 0x950020; /* XXX */
672 cfg.early_z_enable = true;
673 }
674 } else {
675 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
676 cfg.work_register_count = 1;
677 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
678 cfg.early_z_enable = true;
679 }
680 }
681
682 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
683 }
684
685 if (dev->quirks & MIDGARD_SFBD) {
686 /* On single render target (SFBD) hardware, the blend
687 * information lives inside the shader meta itself. We additionally
688 * need to signal CAN_DISCARD for nontrivial blend modes (so
689 * we're able to read back the destination buffer) */
690
691 if (blend[0].no_colour)
692 return;
693
694 fragmeta->unknown2_4 |= MALI_SFBD_ENABLE;
695
696 SET_BIT(fragmeta->unknown2_4, MALI_SFBD_SRGB,
697 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format));
698
699 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
700 blend[0].is_shader);
701
702 if (blend[0].is_shader) {
703 fragmeta->blend.shader = blend[0].shader.gpu |
704 blend[0].shader.first_tag;
705 } else {
706 fragmeta->blend.equation = blend[0].equation.equation;
707 fragmeta->blend.constant = blend[0].equation.constant;
708 }
709
710 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
711 blend[0].load_dest);
712
713 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER, !ctx->blend->base.dither);
714 } else if (!(dev->quirks & IS_BIFROST)) {
715 /* Bug where MRT-capable hw apparently reads the last blend
716 * shader from here instead of the usual location? */
717
718 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
719 if (!blend[rt].is_shader)
720 continue;
721
722 fragmeta->blend.shader = blend[rt].shader.gpu |
723 blend[rt].shader.first_tag;
724 break;
725 }
726 }
727 }
728
729 void
730 panfrost_emit_shader_meta(struct panfrost_batch *batch,
731 enum pipe_shader_type st,
732 struct mali_vertex_tiler_postfix *postfix)
733 {
734 struct panfrost_context *ctx = batch->ctx;
735 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
736
737 if (!ss) {
738 postfix->shader = 0;
739 return;
740 }
741
742 struct mali_shader_meta meta;
743
744 /* Add the shader BO to the batch. */
745 panfrost_batch_add_bo(batch, ss->bo,
746 PAN_BO_ACCESS_PRIVATE |
747 PAN_BO_ACCESS_READ |
748 panfrost_bo_access_for_stage(st));
749
750 mali_ptr shader_ptr;
751
752 if (st == PIPE_SHADER_FRAGMENT) {
753 struct panfrost_device *dev = pan_device(ctx->base.screen);
754 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
755 size_t desc_size = sizeof(meta);
756 void *rts = NULL;
757 struct panfrost_transfer xfer;
758 unsigned rt_size;
759
760 if (dev->quirks & MIDGARD_SFBD)
761 rt_size = 0;
762 else if (dev->quirks & IS_BIFROST)
763 rt_size = sizeof(struct bifrost_blend_rt);
764 else
765 rt_size = sizeof(struct midgard_blend_rt);
766
767 desc_size += rt_size * rt_count;
768
769 if (rt_size)
770 rts = rzalloc_size(ctx, rt_size * rt_count);
771
772 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
773
774 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
775 blend[c] = panfrost_get_blend_for_context(ctx, c);
776
777 panfrost_emit_frag_shader(ctx, &meta, blend);
778
779 if (!(dev->quirks & MIDGARD_SFBD))
780 panfrost_emit_blend(batch, rts, blend);
781 else
782 batch->draws |= PIPE_CLEAR_COLOR0;
783
784 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
785
786 memcpy(xfer.cpu, &meta, sizeof(meta));
787 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
788
789 if (rt_size)
790 ralloc_free(rts);
791
792 shader_ptr = xfer.gpu;
793 } else {
794 panfrost_emit_compute_shader(ctx, st, &meta);
795
796 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
797 sizeof(meta));
798 }
799
800 postfix->shader = shader_ptr;
801 }
802
803 void
804 panfrost_emit_viewport(struct panfrost_batch *batch,
805 struct mali_vertex_tiler_postfix *tiler_postfix)
806 {
807 struct panfrost_context *ctx = batch->ctx;
808 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
809 const struct pipe_scissor_state *ss = &ctx->scissor;
810 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
811 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
812
813 /* Derive min/max from translate/scale. Note since |x| >= 0 by
814 * definition, we have that -|x| <= |x| hence translate - |scale| <=
815 * translate + |scale|, so the ordering is correct here. */
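/* For a typical GL-style viewport of width W anchored at x0, the state
 * tracker sets scale[0] = W/2 and translate[0] = x0 + W/2, so the
 * formulas below recover vp_minx = x0 and vp_maxx = x0 + W (and
 * likewise for y). */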
816 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
817 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
818 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
819 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
820 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
821 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
822
823 /* Scissor to the intersection of viewport and to the scissor, clamped
824 * to the framebuffer */
825
826 unsigned minx = MIN2(fb->width, vp_minx);
827 unsigned maxx = MIN2(fb->width, vp_maxx);
828 unsigned miny = MIN2(fb->height, vp_miny);
829 unsigned maxy = MIN2(fb->height, vp_maxy);
830
831 if (ss && rast->scissor) {
832 minx = MAX2(ss->minx, minx);
833 miny = MAX2(ss->miny, miny);
834 maxx = MIN2(ss->maxx, maxx);
835 maxy = MIN2(ss->maxy, maxy);
836 }
837
838 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
839
840 pan_pack(T.cpu, VIEWPORT, cfg) {
841 cfg.scissor_minimum_x = minx;
842 cfg.scissor_minimum_y = miny;
843 cfg.scissor_maximum_x = maxx - 1;
844 cfg.scissor_maximum_y = maxy - 1;
845
846 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
847 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
848 }
849
850 tiler_postfix->viewport = T.gpu;
851 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
852 }
853
854 static mali_ptr
855 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
856 enum pipe_shader_type st,
857 struct panfrost_constant_buffer *buf,
858 unsigned index)
859 {
860 struct pipe_constant_buffer *cb = &buf->cb[index];
861 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
862
863 if (rsrc) {
864 panfrost_batch_add_bo(batch, rsrc->bo,
865 PAN_BO_ACCESS_SHARED |
866 PAN_BO_ACCESS_READ |
867 panfrost_bo_access_for_stage(st));
868
869 /* Alignment guaranteed by
870 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
871 return rsrc->bo->gpu + cb->buffer_offset;
872 } else if (cb->user_buffer) {
873 return panfrost_pool_upload_aligned(&batch->pool,
874 cb->user_buffer +
875 cb->buffer_offset,
876 cb->buffer_size, 16);
877 } else {
878 unreachable("No constant buffer");
879 }
880 }
881
882 struct sysval_uniform {
883 union {
884 float f[4];
885 int32_t i[4];
886 uint32_t u[4];
887 uint64_t du[2];
888 };
889 };
890
891 static void
892 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
893 struct sysval_uniform *uniform)
894 {
895 struct panfrost_context *ctx = batch->ctx;
896 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
897
898 uniform->f[0] = vp->scale[0];
899 uniform->f[1] = vp->scale[1];
900 uniform->f[2] = vp->scale[2];
901 }
902
903 static void
904 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
905 struct sysval_uniform *uniform)
906 {
907 struct panfrost_context *ctx = batch->ctx;
908 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
909
910 uniform->f[0] = vp->translate[0];
911 uniform->f[1] = vp->translate[1];
912 uniform->f[2] = vp->translate[2];
913 }
914
915 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
916 enum pipe_shader_type st,
917 unsigned int sysvalid,
918 struct sysval_uniform *uniform)
919 {
920 struct panfrost_context *ctx = batch->ctx;
921 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
922 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
923 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
924 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
925
926 assert(dim);
927 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
928
929 if (dim > 1)
930 uniform->i[1] = u_minify(tex->texture->height0,
931 tex->u.tex.first_level);
932
933 if (dim > 2)
934 uniform->i[2] = u_minify(tex->texture->depth0,
935 tex->u.tex.first_level);
936
937 if (is_array)
938 uniform->i[dim] = tex->texture->array_size;
939 }
940
941 static void
942 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
943 enum pipe_shader_type st,
944 unsigned ssbo_id,
945 struct sysval_uniform *uniform)
946 {
947 struct panfrost_context *ctx = batch->ctx;
948
949 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
950 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
951
952 /* Compute address */
953 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
954
955 panfrost_batch_add_bo(batch, bo,
956 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
957 panfrost_bo_access_for_stage(st));
958
959 /* Upload address and size as sysval */
960 uniform->du[0] = bo->gpu + sb.buffer_offset;
961 uniform->u[2] = sb.buffer_size;
962 }
963
964 static void
965 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
966 enum pipe_shader_type st,
967 unsigned samp_idx,
968 struct sysval_uniform *uniform)
969 {
970 struct panfrost_context *ctx = batch->ctx;
971 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
972
973 uniform->f[0] = sampl->min_lod;
974 uniform->f[1] = sampl->max_lod;
975 uniform->f[2] = sampl->lod_bias;
976
977 /* Even without any errata, Midgard represents "no mipmapping" as
978 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
979 * panfrost_create_sampler_state which also explains our choice of
980 * epsilon value (again to keep behaviour consistent) */
981
982 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
983 uniform->f[1] = uniform->f[0] + (1.0/256.0);
984 }
985
986 static void
987 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
988 struct sysval_uniform *uniform)
989 {
990 struct panfrost_context *ctx = batch->ctx;
991
992 uniform->u[0] = ctx->compute_grid->grid[0];
993 uniform->u[1] = ctx->compute_grid->grid[1];
994 uniform->u[2] = ctx->compute_grid->grid[2];
995 }
996
997 static void
998 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
999 struct panfrost_shader_state *ss,
1000 enum pipe_shader_type st)
1001 {
1002 struct sysval_uniform *uniforms = (void *)buf;
1003
1004 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1005 int sysval = ss->sysval[i];
1006
1007 switch (PAN_SYSVAL_TYPE(sysval)) {
1008 case PAN_SYSVAL_VIEWPORT_SCALE:
1009 panfrost_upload_viewport_scale_sysval(batch,
1010 &uniforms[i]);
1011 break;
1012 case PAN_SYSVAL_VIEWPORT_OFFSET:
1013 panfrost_upload_viewport_offset_sysval(batch,
1014 &uniforms[i]);
1015 break;
1016 case PAN_SYSVAL_TEXTURE_SIZE:
1017 panfrost_upload_txs_sysval(batch, st,
1018 PAN_SYSVAL_ID(sysval),
1019 &uniforms[i]);
1020 break;
1021 case PAN_SYSVAL_SSBO:
1022 panfrost_upload_ssbo_sysval(batch, st,
1023 PAN_SYSVAL_ID(sysval),
1024 &uniforms[i]);
1025 break;
1026 case PAN_SYSVAL_NUM_WORK_GROUPS:
1027 panfrost_upload_num_work_groups_sysval(batch,
1028 &uniforms[i]);
1029 break;
1030 case PAN_SYSVAL_SAMPLER:
1031 panfrost_upload_sampler_sysval(batch, st,
1032 PAN_SYSVAL_ID(sysval),
1033 &uniforms[i]);
1034 break;
1035 default:
1036 assert(0);
1037 }
1038 }
1039 }
1040
1041 static const void *
1042 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1043 unsigned index)
1044 {
1045 struct pipe_constant_buffer *cb = &buf->cb[index];
1046 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1047
1048 if (rsrc)
1049 return rsrc->bo->cpu;
1050 else if (cb->user_buffer)
1051 return cb->user_buffer;
1052 else
1053 unreachable("No constant buffer");
1054 }
1055
1056 void
1057 panfrost_emit_const_buf(struct panfrost_batch *batch,
1058 enum pipe_shader_type stage,
1059 struct mali_vertex_tiler_postfix *postfix)
1060 {
1061 struct panfrost_context *ctx = batch->ctx;
1062 struct panfrost_shader_variants *all = ctx->shader[stage];
1063
1064 if (!all)
1065 return;
1066
1067 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1068
1069 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1070
1071 /* Uniforms are implicitly UBO #0 */
1072 bool has_uniforms = buf->enabled_mask & (1 << 0);
1073
1074 /* Allocate room for the sysvals and the uniforms */
1075 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1076 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1077 size_t size = sys_size + uniform_size;
1078 struct panfrost_transfer transfer =
1079 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1080
1081 /* Upload sysvals requested by the shader */
1082 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1083
1084 /* Upload uniforms */
1085 if (has_uniforms && uniform_size) {
1086 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1087 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1088 }
1089
1090 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1091 * uploaded, so it's always included. The count is the highest UBO
1092 * addressable -- gaps are included. */
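/* Worked example: enabled_mask = 0b1001 (UBOs 0 and 3 bound) gives
 * 32 - clz(0b1001) = 4, so descriptors 0..3 are emitted; the disabled
 * UBOs 1 and 2 become null entries in the loop further down. */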
1093
1094 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
1095
1096 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1097 struct panfrost_transfer ubos =
1098 panfrost_pool_alloc_aligned(&batch->pool, sz,
1099 MALI_UNIFORM_BUFFER_LENGTH);
1100
1101 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1102
1103 /* Upload uniforms as a UBO */
1104
1105 if (size) {
1106 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1107 cfg.entries = DIV_ROUND_UP(size, 16);
1108 cfg.pointer = transfer.gpu;
1109 }
1110 } else {
1111 *ubo_ptr = 0;
1112 }
1113
1114 /* The rest are honest-to-goodness UBOs */
1115
1116 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1117 size_t usz = buf->cb[ubo].buffer_size;
1118 bool enabled = buf->enabled_mask & (1 << ubo);
1119 bool empty = usz == 0;
1120
1121 if (!enabled || empty) {
1122 ubo_ptr[ubo] = 0;
1123 continue;
1124 }
1125
1126 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1127 cfg.entries = DIV_ROUND_UP(usz, 16);
1128 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1129 stage, buf, ubo);
1130 }
1131 }
1132
1133 postfix->uniforms = transfer.gpu;
1134 postfix->uniform_buffers = ubos.gpu;
1135
1136 buf->dirty_mask = 0;
1137 }
1138
1139 void
1140 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1141 const struct pipe_grid_info *info,
1142 struct midgard_payload_vertex_tiler *vtp)
1143 {
1144 struct panfrost_context *ctx = batch->ctx;
1145 struct panfrost_device *dev = pan_device(ctx->base.screen);
1146 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1147 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1148 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1149 128));
1150
1151 unsigned log2_instances =
1152 util_logbase2_ceil(info->grid[0]) +
1153 util_logbase2_ceil(info->grid[1]) +
1154 util_logbase2_ceil(info->grid[2]);
1155
1156 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1157 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1158 shared_size,
1159 1);
1160
1161 struct mali_shared_memory shared = {
1162 .shared_memory = bo->gpu,
1163 .shared_workgroup_count = log2_instances,
1164 .shared_shift = util_logbase2(single_size) + 1
1165 };
1166
1167 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1168 sizeof(shared), 64);
1169 }
1170
1171 static mali_ptr
1172 panfrost_get_tex_desc(struct panfrost_batch *batch,
1173 enum pipe_shader_type st,
1174 struct panfrost_sampler_view *view)
1175 {
1176 if (!view)
1177 return (mali_ptr) 0;
1178
1179 struct pipe_sampler_view *pview = &view->base;
1180 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1181
1182 /* Add the BO to the job so it's retained until the job is done. */
1183
1184 panfrost_batch_add_bo(batch, rsrc->bo,
1185 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1186 panfrost_bo_access_for_stage(st));
1187
1188 panfrost_batch_add_bo(batch, view->bo,
1189 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1190 panfrost_bo_access_for_stage(st));
1191
1192 return view->bo->gpu;
1193 }
1194
1195 static void
1196 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1197 struct pipe_context *pctx)
1198 {
1199 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1200 if (view->texture_bo != rsrc->bo->gpu ||
1201 view->modifier != rsrc->modifier) {
1202 panfrost_bo_unreference(view->bo);
1203 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1204 }
1205 }
1206
1207 void
1208 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1209 enum pipe_shader_type stage,
1210 struct mali_vertex_tiler_postfix *postfix)
1211 {
1212 struct panfrost_context *ctx = batch->ctx;
1213 struct panfrost_device *device = pan_device(ctx->base.screen);
1214
1215 if (!ctx->sampler_view_count[stage])
1216 return;
1217
1218 if (device->quirks & IS_BIFROST) {
1219 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1220 MALI_BIFROST_TEXTURE_LENGTH *
1221 ctx->sampler_view_count[stage],
1222 MALI_BIFROST_TEXTURE_LENGTH);
1223
1224 struct mali_bifrost_texture_packed *out =
1225 (struct mali_bifrost_texture_packed *) T.cpu;
1226
1227 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1228 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1229 struct pipe_sampler_view *pview = &view->base;
1230 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1231
1232 panfrost_update_sampler_view(view, &ctx->base);
1233 out[i] = view->bifrost_descriptor;
1234
1235 /* Add the BOs to the job so they are retained until the job is done. */
1236
1237 panfrost_batch_add_bo(batch, rsrc->bo,
1238 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1239 panfrost_bo_access_for_stage(stage));
1240
1241 panfrost_batch_add_bo(batch, view->bo,
1242 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1243 panfrost_bo_access_for_stage(stage));
1244 }
1245
1246 postfix->textures = T.gpu;
1247 } else {
1248 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1249
1250 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1251 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1252
1253 panfrost_update_sampler_view(view, &ctx->base);
1254
1255 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1256 }
1257
1258 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1259 trampolines,
1260 sizeof(uint64_t) *
1261 ctx->sampler_view_count[stage],
1262 sizeof(uint64_t));
1263 }
1264 }
1265
1266 void
1267 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1268 enum pipe_shader_type stage,
1269 struct mali_vertex_tiler_postfix *postfix)
1270 {
1271 struct panfrost_context *ctx = batch->ctx;
1272
1273 if (!ctx->sampler_count[stage])
1274 return;
1275
1276 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1277 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1278
1279 size_t sz = desc_size * ctx->sampler_count[stage];
1280 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1281 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1282
1283 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1284 out[i] = ctx->samplers[stage][i]->hw;
1285
1286 postfix->sampler_descriptor = T.gpu;
1287 }
1288
1289 void
1290 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1291 struct mali_vertex_tiler_postfix *vertex_postfix)
1292 {
1293 struct panfrost_context *ctx = batch->ctx;
1294 struct panfrost_vertex_state *so = ctx->vertex;
1295 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1296
1297 unsigned instance_shift = vertex_postfix->instance_shift;
1298 unsigned instance_odd = vertex_postfix->instance_odd;
1299
1300 /* Worst case: everything is NPOT, which is only possible if instancing
1301 * is enabled. Otherwise a single record is guaranteed */
1302 bool could_npot = instance_shift || instance_odd;
1303
1304 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1305 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1306 (could_npot ? 2 : 1),
1307 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1308
1309 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1310 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1311 MALI_ATTRIBUTE_LENGTH);
1312
1313 struct mali_attribute_buffer_packed *bufs =
1314 (struct mali_attribute_buffer_packed *) S.cpu;
1315
1316 struct mali_attribute_packed *out =
1317 (struct mali_attribute_packed *) T.cpu;
1318
1319 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1320 unsigned k = 0;
1321
1322 for (unsigned i = 0; i < so->num_elements; ++i) {
1323 /* We map buffers 1:1 with the attributes, which
1324 * means duplicating some vertex buffers (who cares? aside from
1325 * maybe some caching implications but I somehow doubt that
1326 * matters) */
1327
1328 struct pipe_vertex_element *elem = &so->pipe[i];
1329 unsigned vbi = elem->vertex_buffer_index;
1330 attrib_to_buffer[i] = k;
1331
1332 if (!(ctx->vb_mask & (1 << vbi)))
1333 continue;
1334
1335 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1336 struct panfrost_resource *rsrc;
1337
1338 rsrc = pan_resource(buf->buffer.resource);
1339 if (!rsrc)
1340 continue;
1341
1342 /* Add a dependency of the batch on the vertex buffer */
1343 panfrost_batch_add_bo(batch, rsrc->bo,
1344 PAN_BO_ACCESS_SHARED |
1345 PAN_BO_ACCESS_READ |
1346 PAN_BO_ACCESS_VERTEX_TILER);
1347
1348 /* Mask off lower bits, see offset fixup below */
1349 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1350 mali_ptr addr = raw_addr & ~63;
1351
1352 /* Since we advanced the base pointer, we shrink the buffer
1353 * size, but add the offset we subtracted */
1354 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1355 - buf->buffer_offset;
1356
1357 /* When there is a divisor, the hardware-level divisor is
1358 * the product of the instance divisor and the padded count */
1359 unsigned divisor = elem->instance_divisor;
1360 unsigned hw_divisor = ctx->padded_count * divisor;
1361 unsigned stride = buf->stride;
1362
1363 /* If there's a divisor (even 1) but no instancing, every vertex
1364 * should see the same attribute value, so zero the stride */
1365
1366 if (divisor && ctx->instance_count == 1)
1367 stride = 0;
1368
1369 if (!divisor || ctx->instance_count <= 1) {
1370 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1371 if (ctx->instance_count > 1)
1372 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1373
1374 cfg.pointer = addr;
1375 cfg.stride = stride;
1376 cfg.size = size;
1377 cfg.divisor_r = instance_shift;
1378 cfg.divisor_p = instance_odd;
1379 }
1380 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1381 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1382 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1383 cfg.pointer = addr;
1384 cfg.stride = stride;
1385 cfg.size = size;
1386 cfg.divisor_r = __builtin_ctz(hw_divisor);
1387 }
1388
1389 } else {
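/* Non-power-of-two divisors: the hardware presumably divides by
 * multiplying with a precomputed fixed-point reciprocal (the "magic"
 * numerator) and shifting, libdivide-style; panfrost_compute_magic_divisor
 * is assumed to derive that numerator along with the shift and extra
 * flag bits consumed below. */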
1390 unsigned shift = 0, extra_flags = 0;
1391
1392 unsigned magic_divisor =
1393 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1394
1395 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1396 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1397 cfg.pointer = addr;
1398 cfg.stride = stride;
1399 cfg.size = size;
1400
1401 cfg.divisor_r = shift;
1402 cfg.divisor_e = extra_flags;
1403 }
1404
1405 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1406 cfg.divisor_numerator = magic_divisor;
1407 cfg.divisor = divisor;
1408 }
1409
1410 ++k;
1411 }
1412
1413 ++k;
1414 }
1415
1416 /* Add special gl_VertexID/gl_InstanceID buffers */
1417
1418 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1419 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1420
1421 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1422 cfg.buffer_index = k++;
1423 cfg.format = so->formats[PAN_VERTEX_ID];
1424 }
1425
1426 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1427
1428 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1429 cfg.buffer_index = k++;
1430 cfg.format = so->formats[PAN_INSTANCE_ID];
1431 }
1432 }
1433
1434 /* Attribute addresses require 64-byte alignment, so let:
1435 *
1436 * base' = base & ~63 = base - (base & 63)
1437 * offset' = offset + (base & 63)
1438 *
1439 * Since base' + offset' = base + offset, these are equivalent
1440 * addressing modes and now base is 64 aligned.
1441 */
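/* Numeric example: base = 0x10070, offset = 4. Then base & 63 = 0x30,
 * so base' = 0x10040 and offset' = 4 + 0x30 = 0x34; base' + offset' =
 * 0x10074 = base + offset, and base' is 64-byte aligned. */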
1442
1443 unsigned start = vertex_postfix->offset_start;
1444
1445 for (unsigned i = 0; i < so->num_elements; ++i) {
1446 unsigned vbi = so->pipe[i].vertex_buffer_index;
1447 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1448
1449 /* Adjust by the masked off bits of the offset. Make sure we
1450 * read src_offset from so->hw (which is not GPU visible)
1451 * rather than target (which is) due to caching effects */
1452
1453 unsigned src_offset = so->pipe[i].src_offset;
1454
1455 /* BOs aligned to 4k so guaranteed aligned to 64 */
1456 src_offset += (buf->buffer_offset & 63);
1457
1458 /* Also, somewhat obscurely, per-instance data needs to be
1459 * offset in response to a delayed start in an indexed draw */
1460
1461 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1462 src_offset -= buf->stride * start;
1463
1464 pan_pack(out + i, ATTRIBUTE, cfg) {
1465 cfg.buffer_index = attrib_to_buffer[i];
1466 cfg.format = so->formats[i];
1467 cfg.offset = src_offset;
1468 }
1469 }
1470
1471 vertex_postfix->attributes = S.gpu;
1472 vertex_postfix->attribute_meta = T.gpu;
1473 }
1474
1475 static mali_ptr
1476 panfrost_emit_varyings(struct panfrost_batch *batch,
1477 struct mali_attribute_buffer_packed *slot,
1478 unsigned stride, unsigned count)
1479 {
1480 unsigned size = stride * count;
1481 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1482
1483 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1484 cfg.stride = stride;
1485 cfg.size = size;
1486 cfg.pointer = ptr;
1487 }
1488
1489 return ptr;
1490 }
1491
1492 static unsigned
1493 panfrost_streamout_offset(unsigned stride, unsigned offset,
1494 struct pipe_stream_output_target *target)
1495 {
1496 return (target->buffer_offset + (offset * stride * 4)) & 63;
1497 }
1498
1499 static void
1500 panfrost_emit_streamout(struct panfrost_batch *batch,
1501 struct mali_attribute_buffer_packed *slot,
1502 unsigned stride_words, unsigned offset, unsigned count,
1503 struct pipe_stream_output_target *target)
1504 {
1505 unsigned stride = stride_words * 4;
1506 unsigned max_size = target->buffer_size;
1507 unsigned expected_size = stride * count;
1508
1509 /* Grab the BO and bind it to the batch */
1510 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1511
1512 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1513 * the perspective of the TILER and FRAGMENT.
1514 */
1515 panfrost_batch_add_bo(batch, bo,
1516 PAN_BO_ACCESS_SHARED |
1517 PAN_BO_ACCESS_RW |
1518 PAN_BO_ACCESS_VERTEX_TILER |
1519 PAN_BO_ACCESS_FRAGMENT);
1520
1521 /* We will have an offset applied to get alignment */
1522 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1523
1524 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1525 cfg.pointer = (addr & ~63);
1526 cfg.stride = stride;
1527 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1528 }
1529 }
1530
1531 static bool
1532 has_point_coord(unsigned mask, gl_varying_slot loc)
1533 {
1534 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1535 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1536 else if (loc == VARYING_SLOT_PNTC)
1537 return (mask & (1 << 8));
1538 else
1539 return false;
1540 }
1541
1542 /* Helpers for manipulating stream out information so we can pack varyings
1543 * accordingly. Compute the src_offset for a given captured varying */
1544
1545 static struct pipe_stream_output *
1546 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1547 {
1548 for (unsigned i = 0; i < info->num_outputs; ++i) {
1549 if (info->output[i].register_index == loc)
1550 return &info->output[i];
1551 }
1552
1553 unreachable("Varying not captured");
1554 }
1555
1556 static unsigned
1557 pan_varying_size(enum mali_format fmt)
1558 {
1559 unsigned type = MALI_EXTRACT_TYPE(fmt);
1560 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1561 unsigned bits = MALI_EXTRACT_BITS(fmt);
1562 unsigned bpc = 0;
1563
1564 if (bits == MALI_CHANNEL_FLOAT) {
1565 /* No doubles */
1566 bool fp16 = (type == MALI_FORMAT_SINT);
1567 assert(fp16 || (type == MALI_FORMAT_UNORM));
1568
1569 bpc = fp16 ? 2 : 4;
1570 } else {
1571 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1572
1573 /* See the enums */
1574 bits = 1 << bits;
1575 assert(bits >= 8);
1576 bpc = bits / 8;
1577 }
1578
1579 return bpc * chan;
1580 }
1581
1582 /* Indices for named (non-XFB) varyings that are present. These are packed
1583 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1584 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1585 * of a given special field given a shift S by:
1586 *
1587 * idx = popcount(P & ((1 << S) - 1))
1588 *
1589 * That is, look at all of the varyings that come earlier and count them;
1590 * that count is this varying's index. Likewise, the total number of special
1591 * buffers required is simply popcount(P)
1592 */
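/* Worked example: with GENERAL, POSITION and PSIZ present, P = 0b0111.
 * The buffer index of PSIZ (S = PAN_VARY_PSIZ = 2) is
 * popcount(0b0111 & 0b0011) = 2, i.e. the two buffers that come before
 * it, and popcount(P) = 3 special buffers are needed in total. */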
1593
1594 enum pan_special_varying {
1595 PAN_VARY_GENERAL = 0,
1596 PAN_VARY_POSITION = 1,
1597 PAN_VARY_PSIZ = 2,
1598 PAN_VARY_PNTCOORD = 3,
1599 PAN_VARY_FACE = 4,
1600 PAN_VARY_FRAGCOORD = 5,
1601
1602 /* Keep last */
1603 PAN_VARY_MAX,
1604 };
1605
1606 /* Given a varying, figure out which index it corresponds to */
1607
1608 static inline unsigned
1609 pan_varying_index(unsigned present, enum pan_special_varying v)
1610 {
1611 unsigned mask = (1 << v) - 1;
1612 return util_bitcount(present & mask);
1613 }
1614
1615 /* Get the base offset for XFB buffers, which by convention come after
1616 * everything else. Wrapper function for semantic reasons; by construction this
1617 * is just popcount. */
1618
1619 static inline unsigned
1620 pan_xfb_base(unsigned present)
1621 {
1622 return util_bitcount(present);
1623 }
1624
1625 /* Computes the present mask for varyings so we can start emitting varying records */
1626
1627 static inline unsigned
1628 pan_varying_present(
1629 struct panfrost_shader_state *vs,
1630 struct panfrost_shader_state *fs,
1631 unsigned quirks)
1632 {
1633 /* At the moment we always emit general and position buffers. Not
1634 * strictly necessary but usually harmless */
1635
1636 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1637
1638 /* Enable special buffers by the shader info */
1639
1640 if (vs->writes_point_size)
1641 present |= (1 << PAN_VARY_PSIZ);
1642
1643 if (fs->reads_point_coord)
1644 present |= (1 << PAN_VARY_PNTCOORD);
1645
1646 if (fs->reads_face)
1647 present |= (1 << PAN_VARY_FACE);
1648
1649 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1650 present |= (1 << PAN_VARY_FRAGCOORD);
1651
1652 /* Also, if we have a point sprite, we need a point coord buffer */
1653
1654 for (unsigned i = 0; i < fs->varying_count; i++) {
1655 gl_varying_slot loc = fs->varyings_loc[i];
1656
1657 if (has_point_coord(fs->point_sprite_mask, loc))
1658 present |= (1 << PAN_VARY_PNTCOORD);
1659 }
1660
1661 return present;
1662 }
1663
1664 /* Emitters for varying records */
1665
1666 static void
1667 pan_emit_vary(struct mali_attribute_packed *out,
1668 unsigned present, enum pan_special_varying buf,
1669 unsigned quirks, enum mali_format format,
1670 unsigned offset)
1671 {
1672 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1673 unsigned swizzle = quirks & HAS_SWIZZLES ?
1674 panfrost_get_default_swizzle(nr_channels) :
1675 panfrost_bifrost_swizzle(nr_channels);
1676
1677 pan_pack(out, ATTRIBUTE, cfg) {
1678 cfg.buffer_index = pan_varying_index(present, buf);
1679 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1680 cfg.format = (format << 12) | swizzle;
1681 cfg.offset = offset;
1682 }
1683 }
1684
1685 /* General varying that is unused */
1686
1687 static void
1688 pan_emit_vary_only(struct mali_attribute_packed *out,
1689 unsigned present, unsigned quirks)
1690 {
1691 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1692 }
1693
1694 /* Special records */
1695
1696 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1697 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1698 [PAN_VARY_PSIZ] = MALI_R16F,
1699 [PAN_VARY_PNTCOORD] = MALI_R16F,
1700 [PAN_VARY_FACE] = MALI_R32I,
1701 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1702 };
1703
1704 static void
1705 pan_emit_vary_special(struct mali_attribute_packed *out,
1706 unsigned present, enum pan_special_varying buf,
1707 unsigned quirks)
1708 {
1709 assert(buf < PAN_VARY_MAX);
1710 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1711 }
1712
1713 static enum mali_format
1714 pan_xfb_format(enum mali_format format, unsigned nr)
1715 {
1716 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1717 return MALI_R32F | MALI_NR_CHANNELS(nr);
1718 else
1719 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1720 }
1721
1722 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1723 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1724 * value. */
1725
1726 static void
1727 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1728 unsigned present,
1729 unsigned max_xfb,
1730 unsigned *streamout_offsets,
1731 unsigned quirks,
1732 enum mali_format format,
1733 struct pipe_stream_output o)
1734 {
1735 unsigned swizzle = quirks & HAS_SWIZZLES ?
1736 panfrost_get_default_swizzle(o.num_components) :
1737 panfrost_bifrost_swizzle(o.num_components);
1738
1739 pan_pack(out, ATTRIBUTE, cfg) {
1740 /* XFB buffers come after everything else */
1741 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1742 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1743
1744 /* Override number of channels and precision to highp */
1745 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1746
1747 /* Combine the per-output offset (in dwords) with the streamout buffer offset */
1748 cfg.offset = (o.dst_offset * 4) /* dwords */
1749 + streamout_offsets[o.output_buffer];
1750 }
1751 }
1752
1753 /* Determine if we should capture a varying for XFB. This requires actually
1754 * having a buffer for it. If we don't capture it, we'll fall back to a general
1755 * varying path (linked or unlinked, possibly discarding the write) */
1756
1757 static bool
1758 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1759 unsigned loc, unsigned max_xfb)
1760 {
1761 if (!(xfb->so_mask & (1ll << loc)))
1762 return false;
1763
1764 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1765 return o->output_buffer < max_xfb;
1766 }
1767
1768 static void
1769 pan_emit_general_varying(struct mali_attribute_packed *out,
1770 struct panfrost_shader_state *other,
1771 struct panfrost_shader_state *xfb,
1772 gl_varying_slot loc,
1773 enum mali_format format,
1774 unsigned present,
1775 unsigned quirks,
1776 unsigned *gen_offsets,
1777 enum mali_format *gen_formats,
1778 unsigned *gen_stride,
1779 unsigned idx,
1780 bool should_alloc)
1781 {
1782 /* Check if we're linked */
1783 signed other_idx = -1;
1784
1785 for (unsigned j = 0; j < other->varying_count; ++j) {
1786 if (other->varyings_loc[j] == loc) {
1787 other_idx = j;
1788 break;
1789 }
1790 }
1791
1792 if (other_idx < 0) {
1793 pan_emit_vary_only(out, present, quirks);
1794 return;
1795 }
1796
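/* On the consumer pass (should_alloc = false), this picks up the offset the
 * producer pass recorded under its own index for this location, which is our
 * other_idx here */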
1797 unsigned offset = gen_offsets[other_idx];
1798
1799 if (should_alloc) {
1800 /* We're linked, so allocate space via a watermark allocation */
1801 enum mali_format alt = other->varyings[other_idx];
1802
1803 /* Do interpolation at minimum precision */
1804 unsigned size_main = pan_varying_size(format);
1805 unsigned size_alt = pan_varying_size(alt);
1806 unsigned size = MIN2(size_main, size_alt);
1807
1808 /* If a varying is marked for XFB but not actually captured, we
1809 * should match the format to the format that would otherwise
1810 * be used for XFB, since dEQP checks for invariance here. It's
1811 * unclear if this is required by the spec. */
1812
1813 if (xfb->so_mask & (1ull << loc)) {
1814 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1815 format = pan_xfb_format(format, o->num_components);
1816 size = pan_varying_size(format);
1817 } else if (size == size_alt) {
1818 format = alt;
1819 }
1820
1821 gen_offsets[idx] = *gen_stride;
1822 gen_formats[other_idx] = format;
1823 offset = *gen_stride;
1824 *gen_stride += size;
1825 }
1826
1827 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1828 }
1829
1830 /* Higher-level wrapper around the emitters above, classifying a varying as
1831 * special, captured for XFB, or general */
1832
1833 static void
1834 panfrost_emit_varying(
1835 struct mali_attribute_packed *out,
1836 struct panfrost_shader_state *stage,
1837 struct panfrost_shader_state *other,
1838 struct panfrost_shader_state *xfb,
1839 unsigned present,
1840 unsigned max_xfb,
1841 unsigned *streamout_offsets,
1842 unsigned quirks,
1843 unsigned *gen_offsets,
1844 enum mali_format *gen_formats,
1845 unsigned *gen_stride,
1846 unsigned idx,
1847 bool should_alloc,
1848 bool is_fragment)
1849 {
1850 gl_varying_slot loc = stage->varyings_loc[idx];
1851 enum mali_format format = stage->varyings[idx];
1852
1853 /* Override format to match linkage */
1854 if (!should_alloc && gen_formats[idx])
1855 format = gen_formats[idx];
1856
1857 if (has_point_coord(stage->point_sprite_mask, loc)) {
1858 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1859 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1860 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1861 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1862 } else if (loc == VARYING_SLOT_POS) {
1863 if (is_fragment)
1864 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1865 else
1866 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1867 } else if (loc == VARYING_SLOT_PSIZ) {
1868 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1869 } else if (loc == VARYING_SLOT_PNTC) {
1870 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1871 } else if (loc == VARYING_SLOT_FACE) {
1872 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1873 } else {
1874 pan_emit_general_varying(out, other, xfb, loc, format, present,
1875 quirks, gen_offsets, gen_formats, gen_stride,
1876 idx, should_alloc);
1877 }
1878 }
1879
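/* Special inputs (point coord, front facing, gl_FragCoord on Midgard) are not
 * backed by memory; their attribute buffer records are instead marked with a
 * special code so the hardware supplies the values itself */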
1880 static void
1881 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1882 unsigned present,
1883 enum pan_special_varying v,
1884 unsigned special)
1885 {
1886 if (present & (1 << v)) {
1887 unsigned idx = pan_varying_index(present, v);
1888
1889 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1890 cfg.special = special;
1891 cfg.type = 0;
1892 }
1893 }
1894 }
1895
1896 void
1897 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1898 unsigned vertex_count,
1899 struct mali_vertex_tiler_postfix *vertex_postfix,
1900 struct mali_vertex_tiler_postfix *tiler_postfix,
1901 union midgard_primitive_size *primitive_size)
1902 {
1903 /* Load the shaders */
1904 struct panfrost_context *ctx = batch->ctx;
1905 struct panfrost_device *dev = pan_device(ctx->base.screen);
1906 struct panfrost_shader_state *vs, *fs;
1907 size_t vs_size, fs_size;
1908
1909 /* Allocate the varying descriptor */
1910
1911 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1912 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1913 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1914 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1915
1916 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1917 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1918
1919 struct pipe_stream_output_info *so = &vs->stream_output;
1920 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1921
1922 /* General-purpose, non-captured varyings are linked by us here as the
1923 * records are emitted. Captured varyings instead take their offsets from
1924 * the provided stream output information, since they were already linked
1925 * for us. */
1926
1927 unsigned gen_offsets[32];
1928 enum mali_format gen_formats[32];
1929 memset(gen_offsets, 0, sizeof(gen_offsets));
1930 memset(gen_formats, 0, sizeof(gen_formats));
1931
1932 unsigned gen_stride = 0;
1933 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1934 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1935
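/* Precompute, for each streamout target, the byte offset at which this
 * draw's captured output begins */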
1936 unsigned streamout_offsets[32];
1937
1938 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1939 streamout_offsets[i] = panfrost_streamout_offset(
1940 so->stride[i],
1941 ctx->streamout.offsets[i],
1942 ctx->streamout.targets[i]);
1943 }
1944
1945 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1946 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1947
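/* Two passes: the vertex shader's records allocate space (should_alloc = true),
 * growing gen_stride and recording offsets and formats; the fragment shader's
 * records are then linked against those results */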
1948 for (unsigned i = 0; i < vs->varying_count; i++) {
1949 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1950 ctx->streamout.num_targets, streamout_offsets,
1951 dev->quirks,
1952 gen_offsets, gen_formats, &gen_stride, i, true, false);
1953 }
1954
1955 for (unsigned i = 0; i < fs->varying_count; i++) {
1956 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1957 ctx->streamout.num_targets, streamout_offsets,
1958 dev->quirks,
1959 gen_offsets, gen_formats, &gen_stride, i, false, true);
1960 }
1961
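/* Allocate one attribute buffer record per present varying buffer, plus one
 * per bound streamout target; XFB buffers go last by convention */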
1962 unsigned xfb_base = pan_xfb_base(present);
1963 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1964 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1965 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1966 struct mali_attribute_buffer_packed *varyings =
1967 (struct mali_attribute_buffer_packed *) T.cpu;
1968
1969 /* Emit the stream out buffers */
1970
1971 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1972 ctx->vertex_count);
1973
1974 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1975 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1976 so->stride[i],
1977 ctx->streamout.offsets[i],
1978 out_count,
1979 ctx->streamout.targets[i]);
1980 }
1981
1982 panfrost_emit_varyings(batch,
1983 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1984 gen_stride, vertex_count);
1985
1986 /* fp32 vec4 gl_Position */
1987 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1988 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1989 sizeof(float) * 4, vertex_count);
1990
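/* fp16 gl_PointSize (MALI_R16F above), hence 2 bytes per vertex */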
1991 if (present & (1 << PAN_VARY_PSIZ)) {
1992 primitive_size->pointer = panfrost_emit_varyings(batch,
1993 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1994 2, vertex_count);
1995 }
1996
1997 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1998 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1999 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2000
2001 vertex_postfix->varyings = T.gpu;
2002 tiler_postfix->varyings = T.gpu;
2003
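/* Vertex and fragment records share a single allocation; the tiler's
 * metadata simply starts after the vertex shader's records */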
2004 vertex_postfix->varying_meta = trans.gpu;
2005 tiler_postfix->varying_meta = trans.gpu + vs_size;
2006 }
2007
2008 void
2009 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2010 struct mali_vertex_tiler_prefix *vertex_prefix,
2011 struct mali_vertex_tiler_postfix *vertex_postfix,
2012 struct mali_vertex_tiler_prefix *tiler_prefix,
2013 struct mali_vertex_tiler_postfix *tiler_postfix,
2014 union midgard_primitive_size *primitive_size)
2015 {
2016 struct panfrost_context *ctx = batch->ctx;
2017 struct panfrost_device *device = pan_device(ctx->base.screen);
2018 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2019 struct bifrost_payload_vertex bifrost_vertex = {0,};
2020 struct bifrost_payload_tiler bifrost_tiler = {0,};
2021 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2022 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2023 void *vp, *tp;
2024 size_t vp_size, tp_size;
2025
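/* Bifrost and Midgard payloads differ: Bifrost keeps the primitive size and
 * tiler metadata in a dedicated tiler block, whereas Midgard carries the
 * primitive size in the vertex/tiler payload itself */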
2026 if (device->quirks & IS_BIFROST) {
2027 bifrost_vertex.prefix = *vertex_prefix;
2028 bifrost_vertex.postfix = *vertex_postfix;
2029 vp = &bifrost_vertex;
2030 vp_size = sizeof(bifrost_vertex);
2031
2032 bifrost_tiler.prefix = *tiler_prefix;
2033 bifrost_tiler.tiler.primitive_size = *primitive_size;
2034 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2035 bifrost_tiler.postfix = *tiler_postfix;
2036 tp = &bifrost_tiler;
2037 tp_size = sizeof(bifrost_tiler);
2038 } else {
2039 midgard_vertex.prefix = *vertex_prefix;
2040 midgard_vertex.postfix = *vertex_postfix;
2041 vp = &midgard_vertex;
2042 vp_size = sizeof(midgard_vertex);
2043
2044 midgard_tiler.prefix = *tiler_prefix;
2045 midgard_tiler.postfix = *tiler_postfix;
2046 midgard_tiler.primitive_size = *primitive_size;
2047 tp = &midgard_tiler;
2048 tp_size = sizeof(midgard_tiler);
2049 }
2050
2051 if (wallpapering) {
2052 /* Inject in reverse order, with "predicted" job indices.
2053 * THIS IS A HACK XXX */
2054 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2055 batch->scoreboard.job_index + 2, tp, tp_size, true);
2056 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2057 vp, vp_size, true);
2058 return;
2059 }
2060
2061 /* If rasterizer discard is enabled, only submit the vertex job */
2062
2063 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2064 vp, vp_size, false);
2065
2066 if (ctx->rasterizer->base.rasterizer_discard)
2067 return;
2068
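/* Otherwise submit the tiler job with the vertex job's index as its
 * dependency, so tiling only runs once shading has finished */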
2069 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2070 false);
2071 }
2072
2073 /* TODO: stop hardcoding this */
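/* 96 16-bit values = 48 (x, y) pairs; (128, 128) presumably encodes the pixel
 * centre in 1/256ths of a pixel, and the rest of the table is not yet
 * understood, hence the hardcoding */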
2074 mali_ptr
2075 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2076 {
2077 uint16_t locations[] = {
2078 128, 128,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 0, 256,
2083 0, 256,
2084 0, 256,
2085 0, 256,
2086 0, 256,
2087 0, 256,
2088 0, 256,
2089 0, 256,
2090 0, 256,
2091 0, 256,
2092 0, 256,
2093 0, 256,
2094 0, 256,
2095 0, 256,
2096 0, 256,
2097 0, 256,
2098 0, 256,
2099 0, 256,
2100 0, 256,
2101 0, 256,
2102 0, 256,
2103 0, 256,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 128, 128,
2111 0, 0,
2112 0, 0,
2113 0, 0,
2114 0, 0,
2115 0, 0,
2116 0, 0,
2117 0, 0,
2118 0, 0,
2119 0, 0,
2120 0, 0,
2121 0, 0,
2122 0, 0,
2123 0, 0,
2124 0, 0,
2125 0, 0,
2126 };
2127
2128 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2129 }